o
    TZhU                     @   s6  d Z ddlZddlZddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZm Z m!Z!m"Z"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+ ddl,m-Z-m.Z. ddl/m0Z0 e'e1Z2g dZ3e45dde45dde45dde45dde45dde45dde45dde45dd iZ6d!d"iZ7e8d#d$ ee6e7D Z9G d%d& d&ej:Z;G d'd( d(eZ<d)e=d*e=fd+d,Z>d*ee= fd-d.Z?d)e=d*ee= fd/d0Z@G d1d2 d2e-ZAG d3d4 d4eAZBG d5d6 d6eAZCG d7d8 d8ZDdS )9zDownload manager interface.    N)datetime)partial)chain)CallableDict	GeneratorListOptionalTupleUnion   )config)tqdm)DeprecatedEnum
deprecated)cached_pathget_from_cachehash_url_to_filenameis_relative_path,stack_multiprocessing_download_progress_barsurl_or_path_join)get_size_checksum_dict)
get_logger)NestedDataStructure
map_nestedsize_str)TrackedIterabletracked_str   )DownloadConfig)txtcsvjsonZjsonlZtsvZconllZconlluorigZparquetZpklpicklerelxmlZ504B0304zipZ504B0506Z504B0708Z425A68bz2Z1F8BgzipZFD377A585A00xzZ04224D18Zlz4Z28B52FFDZzstds   Rar!Zrarc                 c   s    | ]}t |V  qd S N)len).0magic_number r/   Y/var/www/html/lang_env/lib/python3.10/site-packages/datasets/download/download_manager.py	<genexpr>O   s
    
r1   c                   @   s   e Zd ZdZdZdZdZdS )DownloadModea)  `Enum` for how to treat pre-existing downloads and data.

    The default mode is `REUSE_DATASET_IF_EXISTS`, which will reuse both
    raw downloads and the prepared dataset if they exist.

    The generations modes:

    |                                     | Downloads | Dataset |
    |-------------------------------------|-----------|---------|
    | `REUSE_DATASET_IF_EXISTS` (default) | Reuse     | Reuse   |
    | `REUSE_CACHE_IF_EXISTS`             | Reuse     | Fresh   |
    | `FORCE_REDOWNLOAD`                  | Fresh     | Fresh   |

    reuse_dataset_if_existsreuse_cache_if_existsforce_redownloadN)__name__
__module____qualname____doc__REUSE_DATASET_IF_EXISTSREUSE_CACHE_IF_EXISTSFORCE_REDOWNLOADr/   r/   r/   r0   r2   U   s
    r2   c                   @   s$   e Zd ZdZdZdZedd ZdS )GenerateModer3   r4   r5   c                 C   s   dS )NzUse 'DownloadMode' instead.r/   selfr/   r/   r0   help_messageo   s   zGenerateMode.help_messageN)r6   r7   r8   r:   r;   r<   propertyr@   r/   r/   r/   r0   r=   j   s    r=   pathreturnc                 C   s*   |  dd }dD ]	}| |d }q	|S )N.z?-_r   )split)rB   	extensionZsymbr/   r/   r0   _get_path_extensiont   s   rH   c              	   C   s   z|  d W n ttjfy   Y dS w | t}|  d ttD ],}t|dt|  }|dur8|  S t	|dt|  }|durOt
d| dq#dS )zQread the magic number from a file-like object and return the compression protocolr   NzCompression protocol 'z' not implemented.)seekAttributeErrorioUnsupportedOperationreadMAGIC_NUMBER_MAX_LENGTHrange$MAGIC_NUMBER_TO_COMPRESSION_PROTOCOLget0MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOLNotImplementedError)fr.   icompressionr/   r/   r0   *_get_extraction_protocol_with_magic_number~   s    

rW   c                 C   sd   t | } t| }|tv s|dv s| drd S t| d}t|W  d    S 1 s+w   Y  d S )N)tgztar)z.tar.gzz.tar.bz2z.tar.xzrb)strrH   BASE_KNOWN_EXTENSIONSendswithopenrW   )rB   rG   rT   r/   r/   r0   _get_extraction_protocol   s   $r_   c                       s.   e Zd ZdZdef fddZdd Z  ZS )_IterableFromGeneratorzkUtility class to create an iterable from a generator function, in order to reset the generator when needed.	generatorc                    s    t    || _|| _|| _d S r+   )super__init__ra   argskwargs)r?   ra   rd   re   	__class__r/   r0   rc      s   

z_IterableFromGenerator.__init__c                 c   s2    | j | ji | jD ]}|| _|V  qd | _d S r+   )ra   rd   re   Z	last_item)r?   xr/   r/   r0   __iter__   s
   
z_IterableFromGenerator.__iter__)r6   r7   r8   r9   r   rc   ri   __classcell__r/   r/   rf   r0   r`      s    r`   c                   @   s   e Zd ZdZedd Zedd Zedee	ddf fdd	Z
ed
edee	ddf fddZedddZedddZdS )ArchiveIterablezIAn iterable of (path, fileobj) from a TAR archive, used by `iter_archive`c                 c   sh    t j| dd}|D ]&}|j}| sq
|d u rq
tj|dr#q
||}||fV  g |_	q
~d S )Nzr|*)fileobjmoderD   __)
tarfiler^   nameisregosrB   basename
startswithextractfilemembers)rT   streamtarinfo	file_pathfile_objr/   r/   r0   	_iter_tar   s   

zArchiveIterable._iter_tarc                 c   s`    t | }| D ]#}|j}| rq
|d u rq
tj|dr#q
|	|}||fV  q
d S )Nrn   )
zipfileZipFileinfolistfilenameis_dirrs   rB   rt   ru   r^   )rT   Zzipfmemberrz   r{   r/   r/   r0   	_iter_zip   s   

zArchiveIterable._iter_ziprC   Nc                 c   s:    t |}|dkr| |E d H  d S | |E d H  d S )Nr'   )rW   r   r|   )clsrT   rV   r/   r/   r0   _iter_from_fileobj   s
   z"ArchiveIterable._iter_from_fileobjurlpathc                 c   st    t |}t|d&}|dkr| |E d H  n| |E d H  W d    d S W d    d S 1 s3w   Y  d S )NrZ   r'   )r_   r^   r   r|   )r   r   rV   rT   r/   r/   r0   _iter_from_path   s   "zArchiveIterable._iter_from_pathc                 C      | | j |S r+   )r   )r   rl   r/   r/   r0   from_buf      zArchiveIterable.from_bufc                 C   r   r+   )r   )r   Zurlpath_or_bufr/   r/   r0   	from_path   r   zArchiveIterable.from_path)rC   rk   )r6   r7   r8   r9   staticmethodr|   r   classmethodr   r
   r   r[   r   r   r   r/   r/   r/   r0   rk      s    

rk   c                   @   sJ   e Zd ZdZedeeee f deeddf fddZ	ed	ddZ
dS )
FilesIterablez8An iterable of paths from a list of directories or filesurlpathsrC   Nc                 c   s    t |ts	|g}|D ]C}tj|r|V  qt|D ]1\}}}tdd |D |d d < tj|dr8qt|D ]}|drDq<tj	||V  q<qqd S )Nc                 S   s   g | ]	}| d s|qS )rn   )ru   )r-   dirnamer/   r/   r0   
<listcomp>   s    z2FilesIterable._iter_from_paths.<locals>.<listcomp>rn   )

isinstancelistrs   rB   isfilewalksortedrt   ru   join)r   r   r   dirpathdirnames	filenamesr   r/   r/   r0   _iter_from_paths   s"   

zFilesIterable._iter_from_pathsc                 C   r   r+   )r   )r   r   r/   r/   r0   
from_paths  r   zFilesIterable.from_paths)rC   r   )r6   r7   r8   r9   r   r   r[   r   r   r   r   r/   r/   r/   r0   r      s    *r   c                
   @   s  e Zd ZdZ					d.dee dee dee dee fdd	Zed
d Z	edd Z
edd ZdedefddZeddd Zdd ZdededefddZdeeejf fddZd eeee f fd!d"Zd/d$d%Zd&d' Zd(d) Zd*d+ Zd,d- ZdS )0DownloadManagerFNTdataset_namedata_dirdownload_config	base_pathc                 C   sF   || _ || _|ptjd| _i | _|| _|pt | _	i | _
i | _dS )a4  Download manager constructor.

        Args:
            data_dir:
                can be used to specify a manual directory to get the files from.
            dataset_name (`str`):
                name of dataset this instance will be used for. If
                provided, downloads will contain which datasets they were used for.
            download_config (`DownloadConfig`):
                to specify the cache directory and other
                download options
            base_path (`str`):
                base path that is used when relative paths are used to
                download files. This can be a remote url.
            record_checksums (`bool`, defaults to `True`):
                Whether to record the checksums of the downloaded files. If None, the value is inferred from the builder.
        rD   N)Z_dataset_name	_data_dirrs   rB   abspath
_base_path_recorded_sizes_checksumsrecord_checksumsr   r   downloaded_pathsextracted_paths)r?   r   r   r   r   r   r/   r/   r0   rc   
  s   
zDownloadManager.__init__c                 C   s   | j S r+   )r   r>   r/   r/   r0   
manual_dir-  s   zDownloadManager.manual_dirc                 C   s   t dd | j D S )z+Returns the total size of downloaded files.c                 s   s    | ]}|d  V  qdS )	num_bytesNr/   )r-   Zchecksums_dictr/   r/   r0   r1   4  s    z2DownloadManager.downloaded_size.<locals>.<genexpr>)sumr   valuesr>   r/   r/   r0   downloaded_size1  s   zDownloadManager.downloaded_sizec                    sP   ddl m |j d  du rtd fddtfdd	| }|S )
a  Ship the files using Beam FileSystems to the pipeline temp dir.

        Args:
            downloaded_path_or_paths (`str` or `list[str]` or `dict[str, str]`):
                Nested structure containing the
                downloaded path(s).
            pipeline ([`utils.beam_utils.BeamPipeline`]):
                Apache Beam Pipeline.

        Returns:
            `str` or `list[str]` or `dict[str, str]`
        r   )upload_local_to_remoteZtemp_locationNzFYou need to specify 'temp_location' in PipelineOptions to upload filesc              	      sP   t  tjtj| }td|  dt	tj
|  d| d | | |S )Nz
Uploading z (z) to rD   )	posixpathr   r   ZDOWNLOADED_DATASETS_DIRrs   rB   rt   loggerinfor   getsize)local_file_pathZremote_file_path)
remote_dirr   r/   r0   uploadJ  s   "
z8DownloadManager.ship_files_with_pipeline.<locals>.uploadc                    s    | S r+   r/   )r   )r   r/   r0   <lambda>U  s    z:DownloadManager.ship_files_with_pipeline.<locals>.<lambda>)Zutils.beam_utilsr   _optionsZget_all_optionsrQ   
ValueErrorr   )downloaded_path_or_pathsZpipelineZuploaded_path_or_pathsr/   )r   r   r   r0   ship_files_with_pipeline6  s   

z(DownloadManager.ship_files_with_pipelineurl_or_urlsr   c                 C   sJ   d}t tt| | |ddD ]\}}t|| jd| jt|< qdS )z)Record size/checksum of downloaded files.   zComputing checksums)delaydesc)Zrecord_checksumN)hf_tqdmr   r'   flattenr   r   r   r[   )r?   r   r   r   urlrB   r/   r/   r0   _record_sizes_checksumsZ  s   z'DownloadManager._record_sizes_checksumszCUse `.download`/`.download_and_extract` with `fsspec` URLs instead.c           	   	      s   | j jptj | j j} fdd}t||}t|}t|}t| | D ]1\}}zt	| dd|d d}W n t
yC   d}Y nw |rJ| j jrX||| t	| dd|d q'| || |jS )a  
        Download given urls(s) by calling `custom_download`.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.
            custom_download (`Callable[src_url, dst_path]`):
                The source URL and destination path. For example
                `tf.io.gfile.copy`, that lets you download from  Google storage.

        Returns:
            downloaded_path(s): `str`, The downloaded paths matching the given input
                `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download_custom('s3://my-bucket/data.zip', custom_download_for_my_private_bucket)
        ```
        c                    s   t j t| S r+   )rs   rB   r   r   )r   	cache_dirr/   r0   url_to_downloaded_path  s   z?DownloadManager.download_custom.<locals>.url_to_downloaded_pathTF)r   Zlocal_files_onlyZuse_etagmax_retries)r   r   r   ZDOWNLOADED_DATASETS_PATHr   r   r   r'   r   r   FileNotFoundErrorZforce_downloadr   data)	r?   r   Zcustom_downloadr   r   r   r   rB   cachedr/   r   r0   download_customg  s.   



zDownloadManager.download_customc                 C   s  | j  }d|_|jdu rd|_t| j|d}t }t  t	||d|j
dd}W d   n1 s3w   Y  t | }td| d	  d
 t|}t|}| jtt| |  t }| || t | }td| d	  d
 |jS )ay  Download given URL(s).

        By default, only one process is used for download. Pass customized `download_config.num_proc` to change this behavior.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download. Each URL is a `str`.

        Returns:
            `str` or `list` or `dict`:
                The downloaded paths matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        FNzDownloading datar   TzDownloading data files)Z	map_tuplenum_procr   zDownloading took <   z minzChecksum Computation took )r   copyextract_compressed_fileZdownload_descr   	_downloadr   nowr   r   r   r   r   total_secondsr   r   updatedictr'   r   r   r   )r?   r   r   Zdownload_func
start_timer   durationr/   r/   r0   download  s2   

zDownloadManager.downloadurl_or_filenamerC   c                 C   s>   t |}t|rt| j|}t||d}t|}|| |S )Nr   )r[   r   r   r   r   r   Z
set_origin)r?   r   r   outr/   r/   r0   r     s   
zDownloadManager._downloadpath_or_bufc                 C   s   t |dr
t|S t|S )aK  Iterate over files within an archive.

        Args:
            path_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        rM   )hasattrrk   r   r   )r?   r   r/   r/   r0   iter_archive  s   


zDownloadManager.iter_archivepathsc                 C   s
   t |S )a  Iterate over file paths.

        Args:
            paths (`str` or `list` of `str`):
                Root paths.

        Yields:
            `str`: File path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        )r   r   )r?   r   r/   r/   r0   
iter_files  s   
zDownloadManager.iter_filesr   c                 C   sx   |dkr
t dt | j }d|_t| j|d}t|||j	dd}t
|}t
|}| jtt| |  |jS )ak  Extract given path(s).

        Args:
            path_or_paths (path or `list` or `dict`):
                Path of file to extract. Each path is a `str`.
            num_proc (`int`):
                Use multi-processing if `num_proc` > 1 and the length of
                `path_or_paths` is larger than `num_proc`.

                <Deprecated version="2.6.2">

                Pass `DownloadConfig(num_proc=<num_proc>)` to the initializer instead.

                </Deprecated>

        Returns:
            extracted_path(s): `str`, The extracted paths matching the given input
            path_or_paths.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        r   z'num_proc' was deprecated in version 2.6.2 and will be removed in 3.0.0. Pass `DownloadConfig(num_proc=<num_proc>)` to the initializer instead.Tr   zExtracting data files)r   r   )warningswarnFutureWarningr   r   r   r   r   r   r   r   r   r   r   r'   r   r   )r?   Zpath_or_pathsr   r   Zextract_funcr   r/   r/   r0   extract  s$   
zDownloadManager.extractc                 C   s   |  | |S )a  Download and extract given `url_or_urls`.

        Is roughly equivalent to:

        ```
        extracted_paths = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL or `list` or `dict` of URLs to download and extract. Each URL is a `str`.

        Returns:
            extracted_path(s): `str`, extracted paths of given URL(s).
        )r   r   )r?   r   r/   r/   r0   download_and_extract*  s   z$DownloadManager.download_and_extractc                 C   s
   | j  S r+   )r   r   r>   r/   r/   r0   get_recorded_sizes_checksums<  s   
z,DownloadManager.get_recorded_sizes_checksumsc                 C   s^   t | j t | j  }t| j D ]\}}||v r,tj|r,t	| | j|= qd S r+   )
setr   r   r   r   itemsrs   rB   r   remove)r?   Zpaths_to_deletekeyrB   r/   r/   r0   delete_extracted_files?  s   
z&DownloadManager.delete_extracted_filesc                 C   s   | j jr
|   d S d S r+   )r   Zdelete_extractedr   r>   r/   r/   r0   manage_extracted_filesF  s   z&DownloadManager.manage_extracted_files)NNNNT)r   )r6   r7   r8   Zis_streamingr	   r[   r   rc   rA   r   r   r   r   r   r   r   r   r   r   r   rK   BufferedReaderr   r   r   r   r   r   r   r   r/   r/   r/   r0   r     sD    
#


#
.0

.r   )Er9   enumrK   rs   r   rp   r   r}   r   	functoolsr   	itertoolsr   typingr   r   r   r   r	   r
   r    r   utilsr   r   Zutils.deprecation_utilsr   r   Zutils.file_utilsr   r   r   r   r   r   Zutils.info_utilsr   Zutils.loggingr   Zutils.py_utilsr   r   r   Zutils.trackr   r   r   r   r6   r   r\   bytesfromhexrP   rR   maxrN   Enumr2   r=   r[   rH   rW   r_   r`   rk   r   r   r/   r/   r/   r0   <module>   s\   $ 









=