o
    TZhf                     @   s$  d Z ddlZddlZddlZddlZddlZddlZddlZddlZddl	Z	ddl
Z
ddlZddlZddlmZmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZmZ ddlZddlZddlZdd	l m!Z! dd
l"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z* ddl+m,Z, ddl(m-Z-m.Z. ddl(m/Z0 ddl1m2Z2 ddl3m4Z4 e.5e6Z7dZ8ede9eZ:drdeeee9f  de9fddZ;de9de<fddZ=de9de<fddZ>de9de<fd d!Z?d"e:de:fd#d$Z@dsd'e9d(e9de9fd)d*ZA	dtd'e9d(e9deejBeCf fd+d,ZDdud"e9d-e9d.ee9 de9fd/d0ZEd1e9d2e9de9fd3d4ZFd5e9de9fd6d7ZGdrd8d9ZH	drde9fd:d;ZIdrd<eee9eJf  de9fd=d>ZK	?dvd@e9dAeee9e<f  dBeee9e<f  deJfdCdDZLG dEdF dFeMZNdrdGee9 fdHdIZO		J		KdwdLe9d@e9dMePdNeQdOeQdPeQdejBfdQdRZRdrdSdTZSdUdV ZTG dWdX dXejUjVZVdxdYdZZWdyd[d\ZXdyd]d^ZY	dzdeejB fd`daZZ	d{dejBfdbdcZ[	?dvd@e9dAeee9e<f  dBeee9e<f  dee9 fdddeZ\		%		f	%		%	&			?	%		d|de9fdgdhZ]didj Z^dkdl Z_dmdn Z`doejafdpdqZbdS )}z
Utilities for working with the local dataset cache.
This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
Copyright by the AllenNLP authors.
    N)closingcontextmanager)partial)Path)OptionalTypeVarUnion)patch)urljoinurlparse)strip_protocol)can_be_local)insecure_hashlib)version   )__version__config)DownloadConfig   )_tqdmlogging)tqdm)FileLock)ExtractManager.incompleteThf_modules_cachereturnc                 C   s   | dur| nt j} t| } | tjvrEtj|  tj| dd tjtj	| dsEt
tj	| dd	 W d   | S 1 s@w   Y  | S )z
    Add hf_modules_cache to the python path.
    By default hf_modules_cache='~/.cache/huggingface/modules'.
    It can also be set with the environment variable HF_MODULES_CACHE.
    This is used to add modules such as `datasets_modules`
    NTexist_okz__init__.pyw)r   ZHF_MODULES_CACHEstrsyspathappendosmakedirsexistsjoinopen)r    r*   P/var/www/html/lang_env/lib/python3.10/site-packages/datasets/utils/file_utils.pyinit_hf_modules1   s   

r,   url_or_filenamec                 C   s&   t | jdkotjt | jd  S N z:/r   schemer%   r#   ismountr-   r*   r*   r+   is_remote_urlD   s   &r4   c                 C   s$   t | jdkptjt | jd S r.   r0   r3   r*   r*   r+   is_local_pathH   s   $r5   c                 C   s   t | jdkotj|  S )Nr/   )r   r1   r%   r#   isabsr3   r*   r*   r+   is_relative_pathO   s   r7   r#   c                 C   s6   t jt jt jt| }t| trt|S |S )z'Convert relative path to absolute path.)r%   r#   abspath
expanduser
expandvarsr!   
isinstancer   )r#   Zabs_path_strr*   r*   r+   relative_to_absolute_pathS   s    r<   FT
identifierfilenamec                 C   s6   |r|rt jnt j}n|rt jnt j}d|| |fS N/)r   Z"CLOUDFRONT_DATASETS_DISTRIB_PREFIXZS3_DATASETS_BUCKET_PREFIXZ!CLOUDFRONT_METRICS_DISTRIB_PREFIXZS3_METRICS_BUCKET_PREFIXr(   )r=   r>   use_cdndatasetZendpointr*   r*   r+   hf_bucket_urlY   s   rC   c                 C   s   t t| |||d|dS )N)r=   r>   rA   rB   )max_retries)	http_headrC   )r=   r>   rA   rB   rD   r*   r*   r+   
head_hf_s3a   s   rF   namerevisionc                 C   sD   t tjrdnt}|p|}|rtjj|| |dS tjj|| |dS )Nmain)rH   r#   rG   )r   parser   is_devreleaser   ZREPO_DATASETS_URLformatZREPO_METRICS_URL)r#   rG   rB   rH   Zdefault_revisionr*   r*   r+   hf_github_urlj   s
   rM   	base_name	pathnamesc                 G   s8   t | rtj| gdd |D R  S t| g|R   S )Nc                 s   s(    | ]}t |tjd d V  qdS r@   N)r!   replacer%   seplstrip).0pathnamer*   r*   r+   	<genexpr>u   s   & z#url_or_path_join.<locals>.<genexpr>)r4   	posixpathr(   r   as_posix)rN   rO   r*   r*   r+   url_or_path_joins   s   rY   url_or_pathc                 C   s&   t | r| d | d S tj| S r?   )r4   rindexr%   r#   dirname)rZ   r*   r*   r+   url_or_path_parentz   s   r]   c                 C   sZ   |  d}t|}| }|r"| d}t|}|d|  7 }| dr+|d7 }|S )a  
    Convert `url` into a hashed filename in a repeatable way.
    If `etag` is specified, append its hash to the url's, delimited
    by a period.
    If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name
    so that TF 2.0 can identify it as a HDF5 file
    (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
    utf-8.z.py)encoder   sha256	hexdigestendswith)urletagZ	url_bytesZurl_hashr>   Z
etag_bytesZ	etag_hashr*   r*   r+   hash_url_to_filename   s   
	



rf   c                 K   s  |du rt d	i |}|jptj}t|trt|}t| tr#t| } t| r+t| } t	| rLt
| ||j|j|j|j|j|j|j|j|j|j|jd}ntj| rU| }nt| ratd|  dtd|  d|du ro|S |jr~t|jdj||jd}t |S )
a  
    Given something that might be a URL (or might be a local path),
    determine which. If it's a URL, download the file and cache it, and
    return the path to the cached file. If it's already a local path,
    make sure the file exists and then return the path.

    Return:
        Local path (string)

    Raises:
        FileNotFoundError: in case of non-recoverable file
            (non-existent or no cache on disk)
        ConnectionError: in case of unreachable url
            and no cache on disk
        ValueError: if it couldn't parse the url or filename correctly
        requests.exceptions.ConnectionError: in case of internet connection issue
    N)	cache_dirforce_downloadproxiesresume_download
user_agentlocal_files_onlyuse_etagrD   tokenignore_url_paramsstorage_optionsdownload_desczLocal file z doesn't existzunable to parse z as a URL or as a local path)rg   )force_extractr*   )!r   rg   r   ZDOWNLOADED_DATASETS_PATHr;   r   r!   r   r   r4   get_from_cacherh   ri   rj   rk   rl   rm   rD   rn   ro   rp   rq   r%   r#   r'   r5   FileNotFoundError
ValueErrorZextract_compressed_filer   extractrr   r<   )r-   Zdownload_configZdownload_kwargsrg   Zoutput_pathr*   r*   r+   cached_path   sJ   

rw   rk   c                 C   s   dt  }|dtj 7 }|dtj  7 }|dtj 7 }tjr(|dtj 7 }tjr3|dtj 7 }tj	r>|dtj
 7 }tjrI|dtj 7 }t| tra|d	d	d
d |  D  7 }|S t| trl|d	|  7 }|S )Nz	datasets/z	; python/z; huggingface_hub/z
; pyarrow/z; torch/z; tensorflow/z; jax/z; apache_beam/z; c                 s   s"    | ]\}}| d | V  qdS rP   r*   )rT   kvr*   r*   r+   rV      s     z*get_datasets_user_agent.<locals>.<genexpr>)r   r   Z
PY_VERSIONhuggingface_hubZPYARROW_VERSIONZTORCH_AVAILABLEZTORCH_VERSIONZTF_AVAILABLEZ
TF_VERSIONZJAX_AVAILABLEZJAX_VERSIONZBEAM_AVAILABLEZBEAM_VERSIONr;   dictr(   itemsr!   )rk   Zuar*   r*   r+   get_datasets_user_agent   s$   

"
r}   
deprecatedrd   rn   use_auth_tokenc                 C   sB   |dkrt d| dt |}| tjrtjj|dt	dS i S )zHandle the HF authenticationr~   'use_auth_token' was deprecated in favor of 'token' in version 2.14.0 and will be removed in 3.0.0.
You can remove this warning by passing 'token=
' instead.Zdatasets)rn   Zlibrary_nameZlibrary_version)
warningswarnFutureWarning
startswithr   HF_ENDPOINTrz   utilsZbuild_hf_headersr   )rd   rn   r   r*   r*   r+   "get_authentication_headers_for_url   s   r   c                   @   s   e Zd ZdS )OfflineModeIsEnabledN)__name__
__module____qualname__r*   r*   r*   r+   r   	  s    r   msgc                 C   s(   t jrt| du rddt|  dS )zaRaise an OfflineModeIsEnabled error (subclass of ConnectionError) if HF_DATASETS_OFFLINE is True.NzOffline mode is enabled.zOffline mode is enabled. )r   ZHF_DATASETS_OFFLINEr   r!   )r   r*   r*   r+   !_raise_if_offline_mode_is_enabled  s   

r         ?      $@methodrD   base_wait_timemax_wait_timetimeoutc                 K   s   t d|  d\}}|sd|d7 }ztjd|  ||d|}	d}W n> tjjtjjfya }
 z,||kr7|
t|  d| d||  d t	||d	|d   }t
| W Y d
}
~
nd
}
~
ww |r|	S )a  Wrapper around requests to retry in case it fails with a ConnectTimeout, with exponential backoff.

    Note that if the environment variable HF_DATASETS_OFFLINE is set to 1, then a OfflineModeIsEnabled error is raised.

    Args:
        method (str): HTTP method, such as 'GET' or 'HEAD'.
        url (str): The URL of the resource to fetch.
        max_retries (int): Maximum number of retries, defaults to 0 (no retries).
        base_wait_time (float): Duration (in seconds) to wait before retrying the first time. Wait time between
            retries then grows exponentially, capped by max_wait_time.
        max_wait_time (float): Maximum amount of time between two retries, in seconds.
        **params (additional keyword arguments): Params to pass to :obj:`requests.request`.
    Tried to reach )r   Fr   )r   rd   r   Tz request to z timed out, retrying... []r   Nr*   )r   requestsrequestupper
exceptionsConnectTimeoutConnectionErrorloggerinfomintimesleep)r   rd   rD   r   r   r   paramstriessuccessresponseerrZ
sleep_timer*   r*   r+   _request_with_retry  s"    r   c                 C   sJ   t d|   tj| |d\}}}t|dkrtd| ||d S )Nr   rp   r   z=HEAD can be called with at most one path but was called with r   )r   fsspecget_fs_token_pathslenru   r   )rd   rp   fs_pathsr*   r*   r+   fsspec_head<  s
   r   c                   C   s   t tjddiS )N8HF_DATASETS_STACK_MULTIPROCESSING_DOWNLOAD_PROGRESS_BARS1)r	   r{   r%   environr*   r*   r*   r+   ,stack_multiprocessing_download_progress_barsD  s   r   c                       s   e Zd Zd fdd	Z  ZS )TqdmCallbackNc                    s$   t  j|g|R i | t| _d S N)super__init__r   )selftqdm_kwargsargskwargs	__class__r*   r+   r   K  s   
zTqdmCallback.__init__r   )r   r   r   r   __classcell__r*   r*   r   r+   r   J  s    r   c                 C   s   t d|   tj| |d\}}}t|dkrtd| t|p"dddtjdd	kr8t	
 jr8t	
 jd
 nd dd}|j|d |j|d d S )Nr   r   r   z<GET can be called with at most one path but was called with DownloadingBTr   r   )descunit
unit_scaleposition)r   r   )callback)r   r   r   r   ru   r   r%   r   getmultiprocessingcurrent_process	_identityZget_filerG   )rd   	temp_filerp   r   r   r   r   r   r*   r*   r+   
fsspec_getP  s    r   c                 C   sl   t d|   z$ttjj| |d}|d W d    W dS 1 s$w   Y  W dS  ty5   Y dS w )Nr   r   r   FT)r   r   urllibr   urlopenread	Exception)rd   r   rr*   r*   r+   ftp_headc  s   r   c              
   C   s   t d|   z1td|  d|j  ttjj| |d}t	|| W d    W d S 1 s1w   Y  W d S  tj
jyK } zt|d d }~ww )Nr   zGetting through FTP z into r   )r   r   r   rG   r   r   r   r   shutilcopyfileobjerrorURLErrorr   )rd   r   r   r   er*   r*   r+   ftp_getm  s   &
r         Y@c	              
   C   s(  |d urt |ni }t|dd|d< |dkr!d|dd|d< td| d	|||||d
}	|d u r3|	S |	jdkr:d S |	jd}
|
d urJ|t|
 nd }tdd	|||pTdtj	ddkrht
 jrht
 jd nd d}|	jddD ]}|t| || qsW d    d S 1 sw   Y  d S )N
user-agentrk   r   zbytes=d-RangeGETT)r   rd   streamri   headerscookiesrD   r   i  zContent-Lengthr   r   r   r   r   )r   r   totalinitialr   r   i   )
chunk_size)r{   r}   r   r   status_coder   inthf_tqdmr%   r   r   r   r   iter_contentupdater   write)rd   r   ri   resume_sizer   r   r   rD   r   r   content_lengthr   progresschunkr*   r*   r+   http_getw  sL   


"r   c              
   C   s>   t |pi }t|dd|d< td| ||||||d}|S )Nr   r   HEAD)r   rd   ri   r   r   allow_redirectsr   rD   )copydeepcopyr}   r   r   )rd   ri   r   r   r   r   rD   r   r*   r*   r+   rE     s   
rE   c                 C   sr   |dkrt d| dt |}t| jdvrd S t| |d}t| |dd}|  |jr5|j	
d}|S d }|S )	Nr~   r   r   httphttpsrn      )r   rD   ETag)r   r   r   r   r1   r   rE   raise_for_statusokr   r   )rd   rn   r   r   r   re   r*   r*   r+   request_etag  s"   r   d   c           $         sj  |dkrt d| dt |}
|du rtj}t|tr t|}tj	|dd |r2t
| t| j}n| }d}d}d}d}d}d}t|dd}tj||}tj|rY|sY|sY|S t| |
d	}|durg||d
< |sLt| j}|dkrxt| }n|dvrt| |d}|r|ddp|ddnd}d}zt| d|||	|d}|jdkr|r|jdnd}|j D ]\}}|drd| v r| d| 7 } |j}qd}d| v rd| vr| d7 } nX|jdkrd| v s|jdkrd| v s|jdkrtd| std|js|jdkrd| v rd}td|   n|jdkr1tj | v r1|
du r1t!d |  d!W n t"t#j$j%fyK } z|}W Y d}~nd}~ww |stj|r[|s[|S |rft&d"| d#|durx|jd$krxt&d%|  t'd&|   |durt!d'|  d(t(| d)|durt!d'|  d*|j d)t!d'|  t||}tj||}tj|r|s|S |d+ }t)| tj|r|s|W  d   S |d,  t*d? fd.d/	}d0}|rt+|d1d2}tj rt, j-}| 8}t|  d3|j.  |dkrt/| | n|dvr*t0| |||d4 nt1| ||||||	|d5 W d   n	1 sAw   Y  td6|  d7|  t23|j.| t4d8} t4|  t5|d8|  @  td9|  | |d:}!|d; }"t6|"d<d=d>}#t78|!|# W d   n1 sw   Y  W d   |S W d   |S 1 sw   Y  |S )@a  
    Given a URL, look for the corresponding file in the local cache.
    If it's not there, download it. Then return the path to the cached file.

    Return:
        Local path (string)

    Raises:
        FileNotFoundError: in case of non-recoverable file
            (non-existent or no cache on disk)
        ConnectionError: in case of unreachable url
            and no cache on disk
    r~   r   r   NTr   F)re   r   r   ftpr   r   r   re   )r   ri   r   rD   r      Zdownload_warningzdrive.google.comz	&confirm=zconfirm=z
&confirm=ti  zfirebasestorage.googleapis.comi  i  z7^https?://github.com/.*?/.*?/releases/download/.*?/.*?$z#^https://.*?s3.*?amazonaws.com/.*?$zndownloader.figstatic.comz"Couldn't get ETag version for url i  zUnauthorized for URL zU. Please use the parameter `token=True` after logging in with `huggingface-cli login`z6Cannot find the requested files in the cached path at zi and outgoing traffic has been disabled. To enable file online look-ups, set 'local_files_only' to False.i  zCouldn't find file at r   zCouldn't reach z ()z (error z.lockr   w+bc                 3   s8    t  | }|V  W d    d S 1 sw   Y  d S r   )r)   )modefZincomplete_pathr*   r+   temp_file_managerd  s   "z)get_from_cache.<locals>.temp_file_managerr   za+b)r  zB not found in cache or force_download set to True, downloading to )rp   r   )r   ri   r   r   r   rD   r   zstoring z in cache at i  zcreating metadata file for )rd   re   z.jsonr    r^   )encoding)r  )9r   r   r   r   ZHF_DATASETS_CACHEr;   r   r!   r%   r&   r
   r   r#   rf   r(   r'   r   r1   r   r   r   rE   r   r   r   r|   r   rematchrd   r   r   r   r   OSErrorr   r   Timeoutrt   r   reprr   r   r   statst_sizerG   r   r   r   r   moveumaskchmodr)   jsondump)$rd   rg   rh   ri   Zetag_timeoutrj   rk   rl   rm   rD   rn   r   ro   rp   rq   Z
cached_url	connectedr   r   re   Z
head_errorr1   r>   
cache_pathr   rx   ry   r   Z	lock_pathr  r   r   r  meta	meta_pathZ	meta_filer*   r  r+   rs     s&  


 

"










 
4
44rs   c                         fdd}|S )Nc                    s(   d  d | jd ur| jnd | _| S Nr/   z

)r(   __doc__fndocstrr*   r+   docstring_decorator     $z1add_start_docstrings.<locals>.docstring_decoratorr*   r  r   r*   r  r+   add_start_docstrings     r#  c                     r  )Nc                    s(   | j d ur| j ndd d  | _ | S r  )r  r(   r  r  r*   r+   r     r!  z/add_end_docstrings.<locals>.docstring_decoratorr*   r"  r*   r  r+   add_end_docstrings  r$  r%  c                 C   s   t dd | D S )Nc                 s   s    | ]}|  jV  qd S r   )r  r  )rT   r#   r*   r*   r+   rV     s    z(estimate_dataset_size.<locals>.<genexpr>)sum)r   r*   r*   r+   estimate_dataset_size  s   r'  r  c                 C   s>   t  }	 | d}|s	 t|S ||7 }|dr	 t|S q)NTr      
)	bytearrayr   rc   bytes)r  resbr*   r*   r+   readline  s   

r-  r   )FT)FTr   )TN)Nr~   )r   r   r   r   )NN)r   )Nr   NNr   r   N)NNNTr   r   )NFNr   FNFTr   Nr~   FNN)cr  r   ior  r   r%   rW   r	  r   r"   r   r   r   
contextlibr   r   	functoolsr   pathlibr   typingr   r   r   Zunittest.mockr	   urllib.parser
   r   r   rz   r   Zfsspec.corer   Zfsspec.utilsr   Zhuggingface_hub.utilsr   	packagingr   r/   r   r   Zdownload.download_configr   r   r   r   r   Z	_filelockr   rv   r   Z
get_loggerr   r   ZINCOMPLETE_SUFFIXr!   r   r,   boolr4   r5   r7   r<   rC   Responser   rF   rM   rY   r]   rf   rw   r{   r}   r   r   r   r   r   floatr   r   r   	callbacksr   r   r   r   r   rE   r   rs   r#  r%  r'  	RawIOBaser-  r*   r*   r*   r+   <module>   s   
 	
 		

 I


'




(


 R