o
    Zh{r                     @   s  d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZmZmZmZmZ ddlmZ d	d
lmZ eeZed ZdgZG dd deZG dd deZeddG dd dZeddG dd dZeddG dd dZ eddG dd dZ!eddG dd dZ"d/deee#e	f  de"fdd Z$d!e	de fd"d#Z%d$e&de#fd%d&Z'd'Z(d(e)de#fd)d*Z*d+e	d,e#ddfd-d.Z+dS )0z4Contains utilities to manage the HF cache directory.    N)defaultdict)	dataclass)Path)Dict	FrozenSetListLiteralOptionalSetUnion   )HF_HUB_CACHE   )logging)modeldatasetspacez	.DS_Storec                       sD   e Zd ZU dZeeef ed< dedeeef f fddZ  Z	S )CacheNotFoundz9Exception thrown when the Huggingface cache is not found.	cache_dirmsgc                    s$   t  j|g|R i | || _d S N)super__init__r   )selfr   r   argskwargs	__class__ [/var/www/html/lang_env/lib/python3.10/site-packages/huggingface_hub/utils/_cache_manager.pyr   *   s   
zCacheNotFound.__init__)
__name__
__module____qualname____doc__r   strr   __annotations__r   __classcell__r   r   r   r   r   %   s   
 &r   c                   @   s   e Zd ZdZdS )CorruptedCacheExceptionzGException for any unexpected structure in the Huggingface cache-system.N)r    r!   r"   r#   r   r   r   r   r'   /   s    r'   T)frozenc                   @   sx   e Zd ZU dZeed< eed< eed< eed< eed< eed< e	defd	d
Z
e	defddZe	defddZdS )CachedFileInfoa  Frozen data structure holding information about a single cached file.

    Args:
        file_name (`str`):
            Name of the file. Example: `config.json`.
        file_path (`Path`):
            Path of the file in the `snapshots` directory. The file path is a symlink
            referring to a blob in the `blobs` folder.
        blob_path (`Path`):
            Path of the blob file. This is equivalent to `file_path.resolve()`.
        size_on_disk (`int`):
            Size of the blob file in bytes.
        blob_last_accessed (`float`):
            Timestamp of the last time the blob file has been accessed (from any
            revision).
        blob_last_modified (`float`):
            Timestamp of the last time the blob file has been modified/created.

    <Tip warning={true}>

    `blob_last_accessed` and `blob_last_modified` reliability can depend on the OS you
    are using. See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    for more details.

    </Tip>
    	file_name	file_path	blob_pathsize_on_diskblob_last_accessedblob_last_modifiedreturnc                 C   
   t | jS )z
        (property) Timestamp of the last time the blob file has been accessed (from any
        revision), returned as a human-readable string.

        Example: "2 weeks ago".
        )_format_timesincer.   r   r   r   r   blob_last_accessed_strX      
z%CachedFileInfo.blob_last_accessed_strc                 C   r1   )z
        (property) Timestamp of the last time the blob file has been modified, returned
        as a human-readable string.

        Example: "2 weeks ago".
        )r2   r/   r3   r   r   r   blob_last_modified_strb   r5   z%CachedFileInfo.blob_last_modified_strc                 C   r1   )zi
        (property) Size of the blob file as a human-readable string.

        Example: "42.2K".
        _format_sizer-   r3   r   r   r   size_on_disk_strl      
zCachedFileInfo.size_on_disk_strN)r    r!   r"   r#   r$   r%   r   intfloatpropertyr4   r6   r9   r   r   r   r   r)   3   s   
 		r)   c                   @   s   e Zd ZU dZeed< eed< eed< ee	 ed< ee ed< e
ed< edefd	d
ZedefddZedefddZdS )CachedRevisionInfoaN  Frozen data structure holding information about a revision.

    A revision correspond to a folder in the `snapshots` folder and is populated with
    the exact tree structure as the repo on the Hub but contains only symlinks. A
    revision can be either referenced by 1 or more `refs` or be "detached" (no refs).

    Args:
        commit_hash (`str`):
            Hash of the revision (unique).
            Example: `"9338f7b671827df886678df2bdd7cc7b4f36dffd"`.
        snapshot_path (`Path`):
            Path to the revision directory in the `snapshots` folder. It contains the
            exact tree structure as the repo on the Hub.
        files: (`FrozenSet[CachedFileInfo]`):
            Set of [`~CachedFileInfo`] describing all files contained in the snapshot.
        refs (`FrozenSet[str]`):
            Set of `refs` pointing to this revision. If the revision has no `refs`, it
            is considered detached.
            Example: `{"main", "2.4.0"}` or `{"refs/pr/1"}`.
        size_on_disk (`int`):
            Sum of the blob file sizes that are symlink-ed by the revision.
        last_modified (`float`):
            Timestamp of the last time the revision has been created/modified.

    <Tip warning={true}>

    `last_accessed` cannot be determined correctly on a single revision as blob files
    are shared across revisions.

    </Tip>

    <Tip warning={true}>

    `size_on_disk` is not necessarily the sum of all file sizes because of possible
    duplicated files. Besides, only blobs are taken into account, not the (negligible)
    size of folders and symlinks.

    </Tip>
    commit_hashsnapshot_pathr-   filesrefslast_modifiedr0   c                 C   r1   )z
        (property) Timestamp of the last time the revision has been modified, returned
        as a human-readable string.

        Example: "2 weeks ago".
        r2   rC   r3   r   r   r   last_modified_str   r5   z$CachedRevisionInfo.last_modified_strc                 C   r1   zn
        (property) Sum of the blob file sizes as a human-readable string.

        Example: "42.2K".
        r7   r3   r   r   r   r9      r:   z#CachedRevisionInfo.size_on_disk_strc                 C   r1   )zC
        (property) Total number of files in the revision.
        )lenrA   r3   r   r   r   nb_files   s   
zCachedRevisionInfo.nb_filesN)r    r!   r"   r#   r$   r%   r   r;   r   r)   r<   r=   rE   r9   rH   r   r   r   r   r>   v   s   
 (	r>   c                   @   s   e Zd ZU dZeed< eed< eed< eed< eed< e	e
 ed< eed< eed	< ed
efddZed
efddZed
efddZed
eee
f fddZdS )CachedRepoInfoad  Frozen data structure holding information about a cached repository.

    Args:
        repo_id (`str`):
            Repo id of the repo on the Hub. Example: `"google/fleurs"`.
        repo_type (`Literal["dataset", "model", "space"]`):
            Type of the cached repo.
        repo_path (`Path`):
            Local path to the cached repo.
        size_on_disk (`int`):
            Sum of the blob file sizes in the cached repo.
        nb_files (`int`):
            Total number of blob files in the cached repo.
        revisions (`FrozenSet[CachedRevisionInfo]`):
            Set of [`~CachedRevisionInfo`] describing all revisions cached in the repo.
        last_accessed (`float`):
            Timestamp of the last time a blob file of the repo has been accessed.
        last_modified (`float`):
            Timestamp of the last time a blob file of the repo has been modified/created.

    <Tip warning={true}>

    `size_on_disk` is not necessarily the sum of all revisions sizes because of
    duplicated files. Besides, only blobs are taken into account, not the (negligible)
    size of folders and symlinks.

    </Tip>

    <Tip warning={true}>

    `last_accessed` and `last_modified` reliability can depend on the OS you are using.
    See [python documentation](https://docs.python.org/3/library/os.html#os.stat_result)
    for more details.

    </Tip>
    repo_id	repo_type	repo_pathr-   rH   	revisionslast_accessedrC   r0   c                 C   r1   )z
        (property) Last time a blob file of the repo has been accessed, returned as a
        human-readable string.

        Example: "2 weeks ago".
        )r2   rN   r3   r   r   r   last_accessed_str   r5   z CachedRepoInfo.last_accessed_strc                 C   r1   )z
        (property) Last time a blob file of the repo has been modified, returned as a
        human-readable string.

        Example: "2 weeks ago".
        rD   r3   r   r   r   rE      r5   z CachedRepoInfo.last_modified_strc                 C   r1   rF   r7   r3   r   r   r   r9     r:   zCachedRepoInfo.size_on_disk_strc                 C   s   dd | j D S )zQ
        (property) Mapping between `refs` and revision data structures.
        c                 S   s   i | ]}|j D ]}||qqS r   )rB   ).0revisionrefr   r   r   
<dictcomp>  s    z'CachedRepoInfo.refs.<locals>.<dictcomp>)rM   r3   r   r   r   rB     s   zCachedRepoInfo.refsN)r    r!   r"   r#   r$   r%   REPO_TYPE_Tr   r;   r   r>   r<   r=   rO   rE   r9   r   rB   r   r   r   r   rI      s$   
 %		rI   c                   @   sf   e Zd ZU dZeed< ee ed< ee ed< ee ed< ee ed< ede	fdd	Z
dddZd
S )DeleteCacheStrategya  Frozen data structure holding the strategy to delete cached revisions.

    This object is not meant to be instantiated programmatically but to be returned by
    [`~utils.HFCacheInfo.delete_revisions`]. See documentation for usage example.

    Args:
        expected_freed_size (`float`):
            Expected freed size once strategy is executed.
        blobs (`FrozenSet[Path]`):
            Set of blob file paths to be deleted.
        refs (`FrozenSet[Path]`):
            Set of reference file paths to be deleted.
        repos (`FrozenSet[Path]`):
            Set of entire repo paths to be deleted.
        snapshots (`FrozenSet[Path]`):
            Set of snapshots to be deleted (directory of symlinks).
    expected_freed_sizeblobsrB   repos	snapshotsr0   c                 C   r1   )zt
        (property) Expected size that will be freed as a human-readable string.

        Example: "42.2K".
        )r8   rV   r3   r   r   r   expected_freed_size_str3  r:   z+DeleteCacheStrategy.expected_freed_size_strNc                 C   sx   | j D ]}t|dd q| jD ]}t|dd q| jD ]}t|dd q| jD ]}t|dd q'td| j d dS )	a  Execute the defined strategy.

        <Tip warning={true}>

        If this method is interrupted, the cache might get corrupted. Deletion order is
        implemented so that references and symlinks are deleted before the actual blob
        files.

        </Tip>

        <Tip warning={true}>

        This method is irreversible. If executed, cached files are erased and must be
        downloaded again.

        </Tip>
        repo)	path_typeZsnapshotrR   ZblobzCache deletion done. Saved .N)rX   _try_delete_pathrY   rB   rW   loggerinforZ   )r   pathr   r   r   execute<  s   



zDeleteCacheStrategy.execute)r0   N)r    r!   r"   r#   r;   r%   r   r   r=   r$   rZ   rb   r   r   r   r   rU     s   
 rU   c                   @   sV   e Zd ZU dZeed< ee ed< ee	 ed< e
defddZdedefd	d
ZdS )HFCacheInfoa  Frozen data structure holding information about the entire cache-system.

    This data structure is returned by [`scan_cache_dir`] and is immutable.

    Args:
        size_on_disk (`int`):
            Sum of all valid repo sizes in the cache-system.
        repos (`FrozenSet[CachedRepoInfo]`):
            Set of [`~CachedRepoInfo`] describing all valid cached repos found on the
            cache-system while scanning.
        warnings (`List[CorruptedCacheException]`):
            List of [`~CorruptedCacheException`] that occurred while scanning the cache.
            Those exceptions are captured so that the scan can continue. Corrupted repos
            are skipped from the scan.

    <Tip warning={true}>

    Here `size_on_disk` is equal to the sum of all repo sizes (only blobs). However if
    some cached repos are corrupted, their sizes are not taken into account.

    </Tip>
    r-   rX   warningsr0   c                 C   r1   )z
        (property) Sum of all valid repo sizes in the cache-system as a human-readable
        string.

        Example: "42.2K".
        r7   r3   r   r   r   r9     r5   zHFCacheInfo.size_on_disk_strrM   c                 G   s  t |}tt }| jD ]}|jD ]}|j|v r$|| | ||j qqt|dkr7t	dd
|  t  }t  }t  }t  }	d}
| D ]i\}}|j| }t|dkrd||j |
|j7 }
qI|D ]K}|	|j |jD ]}||jd |  qq|jD ]/}|j|vrd}|D ]}|jD ]}|j|jkrd} nq|s nq|r||j |
|j7 }
qqfqItt|t|t|t|	|
dS )a  Prepare the strategy to delete one or more revisions cached locally.

        Input revisions can be any revision hash. If a revision hash is not found in the
        local cache, a warning is thrown but no error is raised. Revisions can be from
        different cached repos since hashes are unique across repos,

        Examples:
        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> cache_info = scan_cache_dir()
        >>> delete_strategy = cache_info.delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa"
        ... )
        >>> print(f"Will free {delete_strategy.expected_freed_size_str}.")
        Will free 7.9K.
        >>> delete_strategy.execute()
        Cache deletion done. Saved 7.9K.
        ```

        ```py
        >>> from huggingface_hub import scan_cache_dir
        >>> scan_cache_dir().delete_revisions(
        ...     "81fd1d6e7847c99f5862c9fb81387956d99ec7aa",
        ...     "e2983b237dccf3ab4937c97fa717319a9ca1a96d",
        ...     "6c0e6080953db56375760c0471a8c5f2929baf11",
        ... ).execute()
        Cache deletion done. Saved 8.6G.
        ```

        <Tip warning={true}>

        `delete_revisions` returns a [`~utils.DeleteCacheStrategy`] object that needs to
        be executed. The [`~utils.DeleteCacheStrategy`] is not meant to be modified but
        allows having a dry run before actually executing the deletion.

        </Tip>
        r   z,Revision(s) not found - cannot delete them: z, rB   TF)rW   rB   rX   rY   rV   )setr   rX   rM   r?   addremoverG   r_   warningjoinitemsrL   r-   r@   rB   rA   r,   rU   	frozenset)r   rM   Zhashes_to_deleteZrepos_with_revisionsr[   rQ   Zdelete_strategy_blobsZdelete_strategy_refsZdelete_strategy_reposZdelete_strategy_snapshotsZ#delete_strategy_expected_freed_sizeZaffected_repoZrevisions_to_deleteZother_revisionsZrevision_to_deleterR   fileZis_file_aloneZrev_filer   r   r   delete_revisions  sf   &









zHFCacheInfo.delete_revisionsN)r    r!   r"   r#   r;   r%   r   rI   r   r'   r=   r$   r9   rU   rm   r   r   r   r   rc   e  s   
 	rc   r   r0   c                 C   s   | du rt } t|   } |  std|  d| d|  r(td|  dt }g }| 	 D ](}|j
dkr9q1z	|t| W q1 tyY } z|| W Y d}~q1d}~ww tt|tdd |D |d	S )
at  Scan the entire HF cache-system and return a [`~HFCacheInfo`] structure.

    Use `scan_cache_dir` in order to programmatically scan your cache-system. The cache
    will be scanned repo by repo. If a repo is corrupted, a [`~CorruptedCacheException`]
    will be thrown internally but captured and returned in the [`~HFCacheInfo`]
    structure. Only valid repos get a proper report.

    ```py
    >>> from huggingface_hub import scan_cache_dir

    >>> hf_cache_info = scan_cache_dir()
    HFCacheInfo(
        size_on_disk=3398085269,
        repos=frozenset({
            CachedRepoInfo(
                repo_id='t5-small',
                repo_type='model',
                repo_path=PosixPath(...),
                size_on_disk=970726914,
                nb_files=11,
                revisions=frozenset({
                    CachedRevisionInfo(
                        commit_hash='d78aea13fa7ecd06c29e3e46195d6341255065d5',
                        size_on_disk=970726339,
                        snapshot_path=PosixPath(...),
                        files=frozenset({
                            CachedFileInfo(
                                file_name='config.json',
                                size_on_disk=1197
                                file_path=PosixPath(...),
                                blob_path=PosixPath(...),
                            ),
                            CachedFileInfo(...),
                            ...
                        }),
                    ),
                    CachedRevisionInfo(...),
                    ...
                }),
            ),
            CachedRepoInfo(...),
            ...
        }),
        warnings=[
            CorruptedCacheException("Snapshots dir doesn't exist in cached repo: ..."),
            CorruptedCacheException(...),
            ...
        ],
    )
    ```

    You can also print a detailed report directly from the `huggingface-cli` using:
    ```text
    > huggingface-cli scan-cache
    REPO ID                     REPO TYPE SIZE ON DISK NB FILES REFS                LOCAL PATH
    --------------------------- --------- ------------ -------- ------------------- -------------------------------------------------------------------------
    glue                        dataset         116.3K       15 1.17.0, main, 2.4.0 /Users/lucain/.cache/huggingface/hub/datasets--glue
    google/fleurs               dataset          64.9M        6 main, refs/pr/1     /Users/lucain/.cache/huggingface/hub/datasets--google--fleurs
    Jean-Baptiste/camembert-ner model           441.0M        7 main                /Users/lucain/.cache/huggingface/hub/models--Jean-Baptiste--camembert-ner
    bert-base-cased             model             1.9G       13 main                /Users/lucain/.cache/huggingface/hub/models--bert-base-cased
    t5-base                     model            10.1K        3 main                /Users/lucain/.cache/huggingface/hub/models--t5-base
    t5-small                    model           970.7M       11 refs/pr/1, main     /Users/lucain/.cache/huggingface/hub/models--t5-small

    Done in 0.0s. Scanned 6 repo(s) for a total of 3.4G.
    Got 1 warning(s) while scanning. Use -vvv to print details.
    ```

    Args:
        cache_dir (`str` or `Path`, `optional`):
            Cache directory to cache. Defaults to the default HF cache directory.

    <Tip warning={true}>

    Raises:

        `CacheNotFound`
          If the cache directory does not exist.

        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
          If the cache directory is a file, instead of a directory.

    </Tip>

    Returns: a [`~HFCacheInfo`] object.
    NzCache directory not found: zM. Please use `cache_dir` argument or set `HF_HUB_CACHE` environment variable.)r   z1Scan cache expects a directory but found a file: z.locksc                 s       | ]}|j V  qd S r   )r-   )rP   r[   r   r   r   	<genexpr>e      z!scan_cache_dir.<locals>.<genexpr>)rX   r-   rd   )r   r   
expanduserresolveexistsr   is_file
ValueErrorre   iterdirnamerf   _scan_cached_repor'   appendrc   rk   sum)r   rX   rd   rL   er   r   r   scan_cache_dir  s8   V


r|   rL   c                    s  |   std|  d| jvrtd|  | jjddd\}}|dd }|dd}|d	vr<td
| d|  di  | d }| d }| rN|  sUtd| tt}| r| rhtd| |	dD ]-}|  rtqmt
||}| }| }	W d   n1 sw   Y  ||	 | qmt }
| D ]}|jtv rq| rtd| t }|	dD ]9}|  rqt| }| std| | vr|  |< |t|j| | j| | j | jd qt|dkr
t fdd|D }n| j}|
t|jt|t||jt t fddtdd |D D ||d qt|dkrLtdt| d|  dt dkrjtdd   D }tdd   D }n
|  }|j}|j}t t || |t|
tdd   D ||d S )!zScan a single cache repo and return information about it.

    Any unexpected behavior will raise a [`~CorruptedCacheException`].
    zRepo path is not a directory: z--z6Repo path is not a valid HuggingFace cache directory: r   )maxsplitN/>   r   r   r   z8Repo type must be `dataset`, `model` or `space`, found `z` (z).rY   rB   z,Snapshots dir doesn't exist in cached repo: z!Refs directory cannot be a file: z**/*z*Snapshots folder corrupted. Found a file: zBlob missing (broken symlink): )r*   r+   r-   r,   r.   r/   r   c                 3   s    | ]	} |j  jV  qd S r   )r,   st_mtimerP   rl   Z
blob_statsr   r   ro     s    z$_scan_cached_repo.<locals>.<genexpr>c                 3   s    | ]} | j V  qd S r   st_size)rP   r,   r   r   r   ro     s    

c                 s   rn   r   )r,   r   r   r   r   ro     rp   )r?   rA   rB   r-   r@   rC   z-Reference(s) refer to missing commit hashes: z (c                 s   rn   r   )st_atimerP   statr   r   r   ro     rp   c                 s   rn   r   )r   r   r   r   r   ro     rp   c                 s   rn   r   r   r   r   r   r   ro     rp   )rH   rJ   rL   rK   rM   r-   rN   rC   )!is_dirr'   rw   splitreplacers   r   re   rt   globr$   relative_toopenreadrf   rv   FILES_TO_IGNOREr   rr   r   r)   r   r   r   rG   maxr>   rk   poprz   dictvaluesrI   )rL   rK   rJ   Zsnapshots_pathZ	refs_pathZrefs_by_hashZref_pathref_namefr?   Zcached_revisionsZrevision_pathcached_filesr+   r,   Zrevision_last_modifiedZrepo_last_accessedZrepo_last_modifiedZ
repo_statsr   r   r   rx   j  s   




rx   numc                 C   sD   t | }dD ]}t|dk r|d|   S |d }q|ddS )zkFormat size in bytes into a human-readable string.

    Taken from https://stackoverflow.com/a/1094933
    ) KMGTPEZg     @@z3.1fz.1fY)r<   abs)r   Znum_funitr   r   r   r8     s   
r8   ))secondr   <   )minuter   r   )houri     )dayiQ    )weeki:	 r   )monthi '    )yeari3Ntsc                 C   sn   t   |  }|dk rdS tD ]\}}}t|| }|dur#||kr# nq| d| |dkr2d dS d dS )	zFormat timestamp in seconds into a human-readable string, relative to now.

    Vaguely inspired by Django's `timesince` formatter.
       za few seconds agoN r   sr   z ago)time_TIMESINCE_CHUNKSround)r   deltalabeldividerZ	max_valuevaluer   r   r   r2   
  s   &r2   ra   r\   c              	   C   s   t d| d|   z|  rt|  W d	S t|  W d	S  ty7   t jd| d|  ddd Y d	S  t	yN   t jd| d|  ddd Y d	S w )
aE  Try to delete a local file or folder.

    If the path does not exists, error is logged as a warning and then ignored.

    Args:
        path (`Path`)
            Path to delete. Can be a file or a folder.
        path_type (`str`)
            What path are we deleting ? Only for logging purposes. Example: "snapshot".
    zDelete z: zCouldn't delete z: file not found ()T)exc_infoz: permission denied (N)
r_   r`   rt   osrg   shutilrmtreeFileNotFoundErrorrh   PermissionError)ra   r\   r   r   r   r^     s   ""r^   r   ),r#   r   r   r   collectionsr   dataclassesr   pathlibr   typingr   r   r   r   r	   r
   r   	constantsr   r   r   Z
get_loggerr    r_   rT   r   	Exceptionr   r'   r)   r>   rI   rU   rc   r$   r|   rx   r;   r8   r   r<   r2   r^   r   r   r   r   <module>   sB   $

BLUK  v 