o
    TZh                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZ d dlmZ d dlmZ d dlmZmZ d dlmZmZmZmZmZmZmZmZmZ d dlmZ d dlZd dl m!Z! d d	l"m#Z# d d
l$m%Z% ddl&m'Z' ddl(m)Z) ddl*m+Z+m,Z,m-Z-m.Z.m/Z/m0Z0 ddl1m2Z2 ddl3m4Z4 ddl5m6Z6 e2e7Z8g dZ9i dd e)D ddiZ:dd e)D Z;e<dZ=e>?dde>?dde>?dde>?dde>?dd e>?d!d"e>?d#d$e>?d%d&iZ@d'd(iZAeBd)d* ee@eAD ZCG d+d, d,eDZEd-d. ZFd/d0 ZGdzd1eHd2ee6 fd3d4ZId5d6 ZJd7d8 ZKd9d: ZLdzd2ee6 d;eMfd<d=ZNdzd2ee6 d;eOfd>d?ZPdzd2ee6 d;eMfd@dAZQdzdBdCZRdDdE ZSdFeHd;eHfdGdHZTd;eeH fdIdJZUdzd1eHd2ee6 d;eeH fdKdLZV	dzd1eHd2ee6 d;eeHeeHeeHef f f fdMdNZW	dzd1eHd2ee6 d;eeHeeHeeHef f f fdOdPZXd{ddRdSeHd2ee6 fdTdUZYdzdFeHd2ee6 d;eeH fdVdWZZdXddYd2ee6 fdZd[Z[dzd2ee6 fd\d]Z\G d^d_ d_e]e Z^dFeeHee^f fd`daZ_ddRd2ee6 fdbdcZ`ddRd2ee6 fdddeZadzd2ee6 fdfdgZbdzd2ee6 fdhdiZcdzd2ee6 fdjdkZddzd2ee6 fdldmZed|d2ee6 fdndoZfdzd2ee6 fdpdqZgG drds dseZhG dtdu duehZiG dvdw dwehZjG dxdy dyZkdS )}    N)TimeoutError)BytesIO)chain)PathPurePosixPath)	AnyCallableDict	GeneratorIterableListOptionalTupleUnion)ElementTree)ClientError)EntryNotFoundError)version   )config)COMPRESSION_FILESYSTEMS)"get_authentication_headers_for_urlget_datasets_user_agent	http_headis_local_pathis_relative_pathurl_or_path_join)
get_logger)
map_nested   )DownloadConfig)txtcsvjsonZjsonlZtsvZconllZconlluorigparquetZpklpicklerelxmlc                 C   s   i | ]
}|j d |jqS ).)	extensionlstripprotocol.0Zfs_class r/   c/var/www/html/lang_env/lib/python3.10/site-packages/datasets/download/streaming_download_manager.py
<dictcomp>8   s    r1   zipc                 C   s   h | ]}|j qS r/   )r,   r-   r/   r/   r0   	<setcomp><   s    r3   z(?<!:):/Z504B0304Z504B0506Z504B0708Z425A68bz2Z1F8BgzipZFD377A585A00xzZ04224D18Zlz4Z28B52FFDZzstds   Rar!Zrarc                 c   s    | ]}t |V  qd S N)len)r.   magic_numberr/   r/   r0   	<genexpr>M   s
    
r:   c                   @   s   e Zd ZdS )NonStreamableDatasetErrorN)__name__
__module____qualname__r/   r/   r/   r0   r;   S   s    r;   c                 G   sP   t | d^} }t| rtjj| g|R  S tj| g|R  } d| g| S )u#  
    This function extends os.path.join to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xjoin function allows you to apply the join on the first path of the chain.

    Example::

        >>> xjoin("zip://folder1::https://host.com/archive.zip", "file.txt")
        zip://folder1/file.txt::https://host.com/archive.zip
    ::)strsplitr   ospathjoin	posixpath)apbr/   r/   r0   xjoinW   s
   rI   c                 C   s\   t | d^} }t| rtjt|  } nt| } | 	dr&| d7 } d
| g| S )u#  
    This function extends os.path.dirname to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xdirname function allows you to apply the dirname on the first path of the chain.

    Example::

        >>> xdirname("zip://folder1/file.txt::https://host.com/archive.zip")
        zip://folder1::https://host.com/archive.zip
    r?   ://)r@   rA   r   rB   rC   dirnamer   as_posixrE   endswithrD   rF   rH   r/   r/   r0   xdirnamer   s   

rP   urlpathdownload_configc                 C   s`   t | d^}}t|rtj|S t| |d\} }| d^}}tj| |d^}}||S )a  Extend `os.path.exists` function to support both local and remote files.

    Args:
        urlpath (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    r?   rR   storage_options)	_as_strrA   r   rB   rC   exists!_prepare_path_and_storage_optionsfsspecget_fs_token_paths)rQ   rR   main_hop	rest_hopsrU   fs_r/   r/   r0   xexists   s   
r_   c                 C   s8   t | d^} }t| rtjt|  S t| S )u  
    This function extends os.path.basename to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xbasename function allows you to apply the basename on the first path of the chain.

    Example::

        >>> xbasename("zip://folder1/file.txt::https://host.com/archive.zip")
        file.txt
    r?   )	r@   rA   r   rB   rC   basenamer   rM   rE   rO   r/   r/   r0   	xbasename   s   
ra   c                 C   sb   t | d^} }t| rtjt|  S t| \} }d| 	dr)| d n| g| |fS )u,  
    This function extends os.path.split to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xsplit function allows you to apply the xsplit on the first path of the chain.

    Example::

        >>> xsplit("zip://folder1/file.txt::https://host.com/archive.zip")
        ('zip://folder1::https://host.com/archive.zip', 'file.txt')
    r?   rJ   rK   )
r@   rA   r   rB   rC   r   rM   rE   rD   rN   )rF   rH   tailr/   r/   r0   xsplit   s
   &rc   c                 C   sP   t | d^} }t| rtjt|  S t| \} }d	| g| |fS )u8  
    This function extends os.path.splitext to support the "::" hop separator. It supports both paths and urls.

    A shorthand, particularly useful where you have multiple hops, is to “chain” the URLs with the special separator "::".
    This is used to access files inside a zip file over http for example.

    Let's say you have a zip file at https://host.com/archive.zip, and you want to access the file inside the zip file at /folder1/file.txt.
    Then you can just chain the url this way:

        zip://folder1/file.txt::https://host.com/archive.zip

    The xsplitext function allows you to apply the splitext on the first path of the chain.

    Example::

        >>> xsplitext("zip://folder1/file.txt::https://host.com/archive.zip")
        ('zip://folder1/file::https://host.com/archive.zip', '.txt')
    r?   )
r@   rA   r   rB   rC   splitextr   rM   rE   rD   )rF   rH   extr/   r/   r0   	xsplitext   s
   rf   returnc                 C   s`   t | d^}}t|rtj| S t| |d\} }| d^}}tj| |d^}}||S )zExtend `os.path.isfile` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    r?   rS   rT   )	r@   rA   r   rB   rC   isfilerX   rY   rZ   )rC   rR   r[   r\   rU   r]   r^   r/   r/   r0   xisfile   s   

ri   c           	      C   s   t | d^}}t|rtj| S t| |d\} }| d^}}tj| |d^}}z|	|}W n t
y@   td|  w |du rbt| |d}t| }W d   |S 1 s]w   Y  |S )zExtend `os.path.getsize` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `int`: optional
    r?   rS   rT   zNo such file: N)r@   rA   r   rB   rC   getsizerX   rY   rZ   sizer   FileNotFoundErrorxopenr8   read)	rC   rR   r[   r\   rU   r]   r^   rk   fr/   r/   r0   xgetsize
  s$   

rp   c                 C   s|   t | d^}}t|rtj| S t| |d\} }| d^}}tj| |d^}}|dd }|	ds9dS ||S )zExtend `os.path.isdir` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `bool`
    r?   rS   rT   :///T)
r@   rA   r   rB   rC   isdirrX   rY   rZ   strip)rC   rR   r[   r\   rU   r]   r^   
inner_pathr/   r/   r0   xisdir&  s   


rw   c                 C   sf   t | d^}}t|r|rtjj||dS tj|S |r-tj|t |dd dS tj|S )zExtend `os.path.relpath` function to support remote files.

    Args:
        path (`str`): URL path.
        start (`str`): Start URL directory path.

    Returns:
        `str`
    r?   )startr   )r@   rA   r   rB   rC   relpathrE   )rC   rx   r[   r\   r/   r/   r0   xrelpath=  s   
 ,rz   c                    s$   | j tj  fdd}|| _ d S )Nc                     s   d }t d d D ]9}z| i |}W  |S  ttfyB } z|}tdtj d| d  d ttj W Y d }~q	d }~ww t	d|)Nr   z4Got disconnected from remote data host. Retrying in zsec [rs   ]zServer Disconnected)
ranger   r   loggerwarningr   ZSTREAMING_READ_RETRY_INTERVALtimesleepConnectionError)argskwargsZdisconnect_errretryouterrmax_retriesrn   r/   r0   read_with_retriesR  s   	
z?_add_retries_to_file_obj_read_method.<locals>.read_with_retries)rn   r   ZSTREAMING_READ_MAX_RETRIES)file_objr   r/   r   r0   $_add_retries_to_file_obj_read_methodN  s   
r   rC   c                 C   s*   |  dd }dD ]	}| |d }q	|S )Nr)   rr   z?-_r   )rA   )rC   r*   Zsymbr/   r/   r0   _get_path_extensione  s   r   c              	   C   s   z|  d W n ttjfy   Y dS w | t}|  d ttD ],}t|dt|  }|dur8|  S t	|dt|  }|durOt
d| dq#dS )zQread the magic number from a file-like object and return the compression protocolr   NzCompression protocol 'z' not implemented.)seekAttributeErrorioUnsupportedOperationrn   MAGIC_NUMBER_MAX_LENGTHr|   $MAGIC_NUMBER_TO_COMPRESSION_PROTOCOLget0MAGIC_NUMBER_TO_UNSUPPORTED_COMPRESSION_PROTOCOLNotImplementedError)ro   r9   icompressionr/   r/   r0   *_get_extraction_protocol_with_magic_numbero  s    

r   c                 C   s   t | } | dd }t|}|tv s|dv s|drd S |tv r&t| S t| |d\} }z#tj| fi |p7i }t	|W  d    W S 1 sJw   Y  W d S  t
yf   | tjret
| d d  w )Nr?   r   tgztarz.tar.gzz.tar.bz2z.tar.xzrS   S
If the repo is private or gated, make sure to log in with `huggingface-cli login`.)r@   rA   r   BASE_KNOWN_EXTENSIONSrN   !COMPRESSION_EXTENSION_TO_PROTOCOLrX   rY   openr   rl   
startswithr   HF_ENDPOINT)rQ   rR   rC   r*   rU   ro   r/   r/   r0   _get_extraction_protocol  s0   (r   c                 C   sJ   g }i }|  dD ]}t||d\}}|| || q	d||fS )Nr?   rS   )rA   ,_prepare_single_hop_path_and_storage_optionsappendupdaterD   )rQ   rR   Zprepared_urlpathZprepared_storage_optionsZhoprU   r/   r/   r0   rX     s   
rX   c           	      C   s  |du rdn|j }| tjr%d| v r%d| ttjd d ddd } d| v r0| dd nd}|durA||jv rA|j| }n|durU||jvrUd	d
 |j D }ni }|r]||i}|dv ri t	| |ddt
 iddid||i ||< d| v rt| }d}|j D ]\}}|dr| d| 7 } |j}d|i||i ||< qd| v rd| vr| d7 } | drd|| d d< | |fS |dkr|tjd||i ||< tjtdk rd|| d< | |fS ) a\  
    Prepare the URL and the kwargs that must be passed to the HttpFileSystem or to requests.get/head

    In particular it resolves google drive URLs
    It also adds the authentication headers for the Hugging Face Hub, for both https:// and hf:// paths.

    Storage options are formatted in the form {protocol: storage_options_for_protocol}
    Nz	/resolve/zhf://r   @rq   r   filec                 S   s"   i | ]\}}|t  vr||qS r/   )rY   Zavailable_protocols)r.   Zoption_nameZoption_valuer/   r/   r0   r1     s
    z@_prepare_single_hop_path_and_storage_options.<locals>.<dictcomp>)httphttps)tokenz
user-agent	trust_envT)headersZclient_kwargszdrive.google.comZdownload_warningz	&confirm=cookieszconfirm=z
&confirm=tz"https://raw.githubusercontent.com/identityr   zAccept-EncodingZhf)r   Zendpointz0.21.0default
block_size)r   r   r   r   r8   replacerA   rU   itemsr   r   r   r   r   ZHF_HUB_VERSIONr   parse)	rQ   rR   r   r,   rU   responser   kvr/   r/   r0   r     s\   $





r   rrS   r   c             
   O   s   t | }|d^}}t|r!|dd t||g|R i |S t||d\} }i ||p/i }ztj| g|R d|i| }	W n, ty[ }
 zt|
dkrVt	d|
 d}
~
w t
yo   | tjrnt
| d d w t|	 |	S )	a  Extend `open` function to support remote files using `fsspec`.

    It also has a retry mechanism in case connection fails.
    The `args` and `kwargs` are passed to `fsspec.open`, except `token` which is used for queries to private repos on huggingface.co

    Args:
        file (`str`): Path name of the file to be opened.
        mode (`str`, *optional*, default "r"): Mode in which the file is opened.
        *args: Arguments to be passed to `fsspec.open`.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs: Keyword arguments to be passed to `fsspec.open`.

    Returns:
        file object
    r?   r   NrS   modezCannot seek streaming HTTP filezStreaming is not possible for this dataset because data host server doesn't support HTTP range requests. You can still load this dataset in non-streaming mode by passing `streaming=False` (default)r   )rV   rA   r   popr   rX   rY   
ValueErrorr@   r;   rl   r   r   r   r   )r   r   rR   r   r   Zfile_strr[   r\   rU   r   er/   r/   r0   rm     s:   $rm   c           	      C   s   t | d^}}t|rt| S t| |d\} }| d^}}tj| |d^}}|dd }|drB|	|sBt
d|  |j|dd	}d
d |D S )zExtend `os.listdir` function to support remote files.

    Args:
        path (`str`): URL path.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `list` of `str`
    r?   rS   rT   rq   rr   rs   zDirectory doesn't exist: F)detailc                 S   s   g | ]}t j|d qS )rs   )rB   rC   r`   rstrip)r.   rC   r/   r/   r0   
<listcomp>*  s    zxlistdir.<locals>.<listcomp>)rV   rA   r   rB   listdirrX   rY   rZ   ru   rt   rl   )	rC   rR   r[   r\   rU   r]   r^   rv   pathsr/   r/   r0   xlistdir  s   

r   F)	recursiverR   c          	         s   t | d^}t|rtj||dS t| |d\} }| d^}tj| |d^}}|dd }||}t|jt	rA|jn|jd   fdd	|D S )
a  Extend `glob.glob` function to support remote files.

    Args:
        urlpath (`str`): URL path with shell-style wildcard patterns.
        recursive (`bool`, default `False`): Whether to match the "**" pattern recursively to zero or more
            directories or subdirectories.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `list` of `str`
    r?   )r   rS   rT   rq   r   rr   c                    s&   g | ]}d    d| g qS )r?   rq   )rD   )r.   globbed_pathr,   r\   r/   r0   r   H  s   & zxglob.<locals>.<listcomp>)
rV   rA   r   globrX   rY   rZ   
isinstancer,   r@   )	rQ   r   rR   r[   rU   r]   r^   rv   globbed_pathsr/   r   r0   xglob-  s   
r   c                 k   s    t | d^}}t|rtj|fi |E dH  dS t| |d\} }| d^}}tj| |d^}}|dd }|drG|	|sGg S t
|jtrP|jn|jd }	|j|fi |D ]\}
}}d|	 d|
 g| ||fV  q^dS )au  Extend `os.walk` function to support remote files.

    Args:
        urlpath (`str`): URL root path.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs: Additional keyword arguments forwarded to the underlying filesystem.


    Yields:
        `tuple`: 3-tuple (dirpath, dirnames, filenames).
    r?   NrS   rT   rq   rr   rs   )rV   rA   r   rB   walkrX   rY   rZ   ru   rt   r   r,   r@   rD   )rQ   rR   r   r[   r\   rU   r]   r^   rv   r,   dirpathdirnames	filenamesr/   r/   r0   xwalkK  s   $r   c                       s   e Zd ZdZ fddZddee fddZddee fdd	Zd
d Z	e
d ddZe
defddZe
defddZe
defddZdd Zdeedf dd fddZdedd fddZ fddZ  ZS )!xPathzHExtension of `pathlib.Path` to support both local paths and remote URLs.c                    s\   t   }|d^}}t|r|S |dd}td|}||dr)d7 }|S d7 }|S )Nr?   \rs   rq   rJ   rK    )super__str__rA   r   r   #SINGLE_SLASH_AFTER_PROTOCOL_PATTERNsubrN   )selfZpath_strr[   r\   Zpath_as_posix	__class__r/   r0   r   j  s   
zxPath.__str__NrR   c                 C   s   t t| |dS )zExtend `pathlib.Path.exists` method to support both local and remote files.

        Args:
            download_config : mainly use token or storage_options to support different platforms and auth types.

        Returns:
            `bool`
        rS   )r_   r@   )r   rR   r/   r/   r0   rW   t  s   	zxPath.existsc                 c   s    |   }|d^}}t|rt||E dH  dS |rA|d }t||d\}}|dd |i}d||g|dd }nd}tjt	|||d^}}	|t	||}
|
D ]}t
| d|j d| g| V  qYdS )a]  Glob function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Args:
            pattern (`str`): Pattern that resulting paths must match.
            download_config : mainly use token or storage_options to support different platforms and auth types.

        Yields:
            [`xPath`]
        r?   Nr   rS   rq   r   rT   )rM   rA   r   r   r   rX   rD   rY   rZ   rI   typer,   )r   patternrR   Z
posix_pathr[   r\   rQ   rU   r]   r^   r   r   r/   r/   r0   r     s    
(z
xPath.globc                 K   s   | j d| fi |S )zRglob function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Args:
            pattern (`str`): Pattern that resulting paths must match.

        Yields:
            [`xPath`]
        z**/)r   )r   r   r   r/   r/   r0   rglob  s   	zxPath.rglobrg   c                 C   s   t | t|  S )zName function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            [`xPath`]
        )r   rP   rM   r   r/   r/   r0   parent  s   zxPath.parentc                 C      t |  dd jS )zName function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        r?   r   )r   rM   rA   namer   r/   r/   r0   r        z
xPath.namec                 C   r   )zStem function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        r?   r   )r   rM   rA   stemr   r/   r/   r0   r     r   z
xPath.stemc                 C   r   )zSuffix function for argument of type :obj:`~pathlib.Path` that supports both local paths end remote URLs.

        Returns:
            `str`
        r?   r   )r   rM   rA   suffixr   r/   r/   r0   r     r   zxPath.suffixc                 O   s   t t| g|R i |S )a  Extend :func:`xopen` to support argument of type :obj:`~pathlib.Path`.

        Args:
            **args: Arguments passed to :func:`fsspec.open`.
            **kwargs: Keyword arguments passed to :func:`fsspec.open`.

        Returns:
            `io.FileIO`: File-like object.
        )rm   r@   )r   r   r   r/   r/   r0   r     s   
z
xPath.openrG   .c                 G   s   t | t|  g|R  S )zExtend :func:`xjoin` to support argument of type :obj:`~pathlib.Path`.

        Args:
            *p (`tuple` of `str`): Other path components.

        Returns:
            [`xPath`]
        )r   rI   rM   r   rG   r/   r/   r0   joinpath  s   	zxPath.joinpathc                 C   s
   |  |S r7   )r   r   r/   r/   r0   __truediv__  s   
zxPath.__truediv__c                    s`   t | d^}}t|rt| t t |S t| dt| t|| g| S )Nr?   )	r@   rA   r   r   r   with_suffixrD   r   rM   )r   r   r[   r\   r   r/   r0   r     s   .zxPath.with_suffixr7   )rg   r   )r<   r=   r>   __doc__r   r   r    rW   r   r   propertyr   r@   r   r   r   r   r   r   r   r   __classcell__r/   r/   r   r0   r   g  s$    
 r   c                 C   s"   t | tr	t| S ttt| S r7   )r   r   r@   )rC   r/   r/   r0   rV     s   "rV   c                O   T   dd l }t| dr|j| g|R i |S t| } |jt| d|dg|R i |S Nr   rn   rbrS   )r5   hasattrr   r@   rm   )filepath_or_bufferrR   r   r   r5   r/   r/   r0   
xgzip_open  
   
"r   c                O   r   r   )numpyr   loadr@   rm   )r   rR   r   r   npr/   r/   r0   xnumpy_load  r   r   c                 K   sh   dd l }t| dr|j| fi |S t| } |dddkr&t| |d|d< |jt| d|dfi |S )Nr   rn   r   ZinferrS   r   )pandasr   Zread_csvr@   r   r   rm   r   rR   r   pdr/   r/   r0   xpandas_read_csv  s   
r   c              
   K   s   dd l }t| dr*z
|j| fi |W S  ty)   |jt|  fi | Y S w t| } z|jt| d|dfi |W S  tyX   |jtt| d|d fi | Y S w r   )r   r   Z
read_excelr   r   rn   r@   rm   r   r/   r/   r0   xpandas_read_excel  s"   

r   c                 K   sL   dd l m} t| dr|j| fi |S t| } |jt| d|dfi |S )Nr   rn   r   )r   rR   )Zpyarrow.parquetr%   r   Z
read_tabler@   rm   )r   rR   r   Zpqr/   r/   r0   xpyarrow_parquet_read_table%  s
   
r   c                 K   sD   dd l m} t| dr|j| fi |S |jt| d|dfi |S r   )Zscipy.ior   r   Zloadmatrm   )r   rR   r   sior/   r/   r0   xsio_loadmat/  s   
r   c                 C   sX   t | drtj| |dS t| d|d}tj||dW  d   S 1 s%w   Y  dS )a  Extend `xml.etree.ElementTree.parse` function to support remote files.

    Args:
        source: File path or file object.
        parser (`XMLParser`, *optional*, default `XMLParser`): Parser instance.
        download_config : mainly use token or storage_options to support different platforms and auth types.

    Returns:
        `xml.etree.ElementTree.Element`: Root element of the given source document.
    rn   )parserr   rS   N)r   ETr   rm   )sourcer   rR   ro   r/   r/   r0   	xet_parse8  s
   
$r   c                 K   sh   t | drtjjj| fi |S t| d|d}tjjj|fi |W  d   S 1 s-w   Y  dS )a  Extend `xml.dom.minidom.parse` function to support remote files.

    Args:
        filename_or_file (`str` or file): File path or file object.
        download_config : mainly use token or storage_options to support different platforms and auth types.
        **kwargs (optional): Additional keyword arguments passed to `xml.dom.minidom.parse`.

    Returns:
        :obj:`xml.dom.minidom.Document`: Parsed document.
    rn   r   rS   N)r   r(   domZminidomr   rm   )Zfilename_or_filerR   r   ro   r/   r/   r0   xxml_dom_minidom_parseJ  s
   
$r  c                   @   s&   e Zd ZdZdefddZdd ZdS )_IterableFromGeneratorzkUtility class to create an iterable from a generator function, in order to reset the generator when needed.	generatorc                 O   s   || _ || _|| _d S r7   r  r   r   )r   r  r   r   r/   r/   r0   __init___  s   
z_IterableFromGenerator.__init__c                 c   s     | j | ji | jE d H  d S r7   r  r   r/   r/   r0   __iter__d  s   z_IterableFromGenerator.__iter__N)r<   r=   r>   r   r   r  r  r/   r/   r/   r0   r  \  s    r  c                   @   s   e Zd ZdZedd Zedd Zedee	ddf fdd	Z
e	dd
edee dee	ddf fddZedddZeddee dd fddZdS )ArchiveIterablezIAn iterable of (path, fileobj) from a TAR archive, used by `iter_archive`c                 c   sh    t j| dd}|D ]&}|j}| sq
|d u rq
tj|dr#q
||}||fV  g |_	q
~d S )Nzr|*)fileobjr   r)   __)
tarfiler   r   isregrB   rC   r`   r   extractfilemembers)ro   streamtarinfo	file_pathr   r/   r/   r0   	_iter_tark  s   

zArchiveIterable._iter_tarc                 c   s`    t | }| D ]#}|j}| rq
|d u rq
tj|dr#q
|	|}||fV  q
d S )Nr
  )
zipfileZipFileinfolistfilenameis_dirrB   rC   r`   r   r   )ro   Zzipfmemberr  r   r/   r/   r0   	_iter_zip|  s   

zArchiveIterable._iter_ziprg   Nc                 c   s:    t |}|dkr| |E d H  d S | |E d H  d S )Nr2   )r   r  r  )clsro   r   r/   r/   r0   _iter_from_fileobj  s
   z"ArchiveIterable._iter_from_fileobjrQ   rR   c                 c   s~    t ||d}t|d|dd&}|dkr| |E d H  n| |E d H  W d    d S W d    d S 1 s8w   Y  d S )NrS   r   r   )rR   r   r2   )r   rm   r  r  )r  rQ   rR   r   ro   r/   r/   r0   _iter_from_urlpath  s   "z"ArchiveIterable._iter_from_urlpathc                 C   s   | | j |S r7   )r  )r  r	  r/   r/   r0   from_buf  s   zArchiveIterable.from_bufc                 C      | | j ||S r7   )r  )r  urlpath_or_bufrR   r/   r/   r0   from_urlpath     zArchiveIterable.from_urlpathr7   )rg   r  )r<   r=   r>   r   staticmethodr  r  classmethodr
   r   r  r@   r   r    r  r  r!  r/   r/   r/   r0   r  h  s*    

r  c                   @   sd   e Zd ZdZe	d
deeee f dee	 de
eddf fddZed
dee	 dd fdd	ZdS )FilesIterablez8An iterable of paths from a list of directories or filesNurlpathsrR   rg   c                 c   s    t |ts	|g}|D ]J}t||dr|V  qt||drRt||dD ]-\}}}tdd |D |d d < t|dr=q#t|D ]}|drIqAt||V  qAq#qt	|d S )NrS   c                 S   s   g | ]	}| d s|qS )r
  )r   )r.   rL   r/   r/   r0   r     s    z5FilesIterable._iter_from_urlpaths.<locals>.<listcomp>r
  )
r   listri   rw   r   sortedra   r   rI   rl   )r  r&  rR   rQ   r   r   r   r  r/   r/   r0   _iter_from_urlpaths  s&   

z!FilesIterable._iter_from_urlpathsc                 C   r  r7   )r)  )r  r&  rR   r/   r/   r0   from_urlpaths  r"  zFilesIterable.from_urlpathsr7   )r<   r=   r>   r   r$  r   r@   r   r   r    r
   r)  r*  r/   r/   r/   r0   r%    s    r%  c                
   @   s   e Zd ZdZdZ				ddee dee dee dee fdd	Ze	d
d Z
dd ZdedefddZdd ZdedefddZdd Zdeeejf dee fddZdeeee f dee fddZdS )StreamingDownloadManagera  
    Download manager that uses the "::" separator to navigate through (possibly remote) compressed archives.
    Contrary to the regular `DownloadManager`, the `download` and `extract` methods don't actually download nor extract
    data, but they rather return the path or url that could be opened using the `xopen` function which extends the
    built-in `open` function to stream data from remote files.
    TNdataset_namedata_dirrR   	base_pathc                 C   s.   || _ || _|ptjd| _|pt | _d S )Nr)   )Z_dataset_name	_data_dirrB   rC   abspath
_base_pathr    rR   )r   r,  r-  rR   r.  r/   r/   r0   r    s   z!StreamingDownloadManager.__init__c                 C   s   | j S r7   )r/  r   r/   r/   r0   
manual_dir  s   z#StreamingDownloadManager.manual_dirc                 C   s   t | j|dd}|S )aU  Normalize URL(s) of files to stream data from.
        This is the lazy version of `DownloadManager.download` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input url_or_urls.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        ```
        TZ	map_tuple)r   	_downloadr   url_or_urlsr/   r/   r0   download  s   z!StreamingDownloadManager.downloadrQ   rg   c                 C   s    t |}t|rt| j|}|S r7   )r@   r   r   r1  )r   rQ   r/   r/   r0   r4    s   z"StreamingDownloadManager._downloadc                 C   s   t | j|dd}|S )a  Add extraction protocol for given url(s) for streaming.

        This is the lazy version of `DownloadManager.extract` for streaming.

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) of files to stream data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.

        Example:

        ```py
        >>> downloaded_files = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> extracted_files = dl_manager.extract(downloaded_files)
        ```
        Tr3  )r   _extract)r   r6  r&  r/   r/   r0   extract  s   z StreamingDownloadManager.extractc                 C   s   t |}t|| jd}|dd }t|}|dv s|dr'td| d|d u r-|S |tv rUtj	
|dd }d|v rI|d |d n|}| d	| d| S | d
| S )NrS   r?   r   r   r   z+Extraction protocol for TAR archives like 'z' is not implemented in streaming mode. Please use `dl_manager.iter_archive` instead.

Example usage:

	url = dl_manager.download(url)
	tar_archive_iterator = dl_manager.iter_archive(url)

	for filename, file in tar_archive_iterator:
		...r)   rq   z://::)r@   r   rR   rA   r   rN   r   !SINGLE_FILE_COMPRESSION_PROTOCOLSrB   rC   r`   rindex)r   rQ   r,   rC   r*   Z
inner_filer/   r/   r0   r8    s   
	z!StreamingDownloadManager._extractc                 C   s   |  | |S )a0  Prepare given `url_or_urls` for streaming (add extraction protocol).

        This is the lazy version of `DownloadManager.download_and_extract` for streaming.

        Is equivalent to:

        ```
        urls = dl_manager.extract(dl_manager.download(url_or_urls))
        ```

        Args:
            url_or_urls (`str` or `list` or `dict`):
                URL(s) to stream from data from. Each url is a `str`.

        Returns:
            url(s): (`str` or `list` or `dict`), URL(s) to stream data from matching the given input `url_or_urls`.
        )r9  r7  r5  r/   r/   r0   download_and_extract/  s   z-StreamingDownloadManager.download_and_extractr   c                 C   s$   t |dr
t|S tj|| jdS )aN  Iterate over files within an archive.

        Args:
            urlpath_or_buf (`str` or `io.BufferedReader`):
                Archive path or archive binary file object.

        Yields:
            `tuple[str, io.BufferedReader]`:
                2-tuple (path_within_archive, file_object).
                File object is opened in binary mode.

        Example:

        ```py
        >>> archive = dl_manager.download('https://storage.googleapis.com/seldon-datasets/sentence_polarity_v1/rt-polaritydata.tar.gz')
        >>> files = dl_manager.iter_archive(archive)
        ```
        rn   rS   )r   r  r  r!  rR   )r   r   r/   r/   r0   iter_archiveC  s   

z%StreamingDownloadManager.iter_archiver&  c                 C   s   t j|| jdS )a  Iterate over files.

        Args:
            urlpaths (`str` or `list` of `str`):
                Root paths.

        Yields:
            str: File URL path.

        Example:

        ```py
        >>> files = dl_manager.download_and_extract('https://huggingface.co/datasets/beans/resolve/main/data/train.zip')
        >>> files = dl_manager.iter_files(files)
        ```
        rS   )r%  r*  rR   )r   r&  r/   r/   r0   
iter_files\  s   z#StreamingDownloadManager.iter_files)NNNN)r<   r=   r>   r   Zis_streamingr   r@   r    r  r   r2  r7  r4  r9  r8  r<  r   r   BufferedReaderr   r   r=  r   r>  r/   r/   r/   r0   r+    s2    

 &r+  r7   )r   )NN)lr   r   rB   rE   rer  r   Zxml.dom.minidomr(   r  asyncior   r   	itertoolsr   pathlibr   r   typingr   r   r	   r
   r   r   r   r   r   Z	xml.etreer   r   rY   Zaiohttp.client_exceptionsr   Zhuggingface_hub.utilsr   	packagingr   r   r   Zfilesystemsr   Zutils.file_utilsr   r   r   r   r   r   Zutils.loggingr   Zutils.py_utilsr   rR   r    r<   r}   r   r   r:  compiler   bytesfromhexr   r   maxr   	Exceptionr;   rI   rP   r@   r_   ra   rc   rf   boolri   intrp   rw   rz   r   r   r   r   rX   r   rm   r   r   r   r   r   rV   r   r   r   r   r   r   r   r  r  r  r%  r+  r/   r/   r/   r0   <module>   s    , 










 

> / 	


	A 