import os
import re
from functools import partial
from glob import has_magic
from pathlib import Path, PurePath
from typing import Callable, Dict, List, Optional, Set, Tuple, Union

import huggingface_hub
from fsspec.core import url_to_fs
from fsspec.implementations.http import HTTPFileSystem
from huggingface_hub import HfFileSystem
from packaging import version
from tqdm.contrib.concurrent import thread_map

from . import config
from .download import DownloadConfig
from .naming import _split_re
from .splits import Split
from .utils import logging
from .utils import tqdm as hf_tqdm
from .utils.file_utils import (
    _prepare_path_and_storage_options,
    is_local_path,
    is_relative_path,
    xbasename,
    xjoin,
)
from .utils.py_utils import glob_pattern_to_regex, string_to_dict


SANITIZED_DEFAULT_SPLIT = str(Split.TRAIN)


logger = logging.get_logger(__name__)


class Url(str):
    pass


class EmptyDatasetError(FileNotFoundError):
    pass


SPLIT_PATTERN_SHARDED = "data/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*"

SPLIT_KEYWORDS = {
    Split.TRAIN: ["train", "training"],
    Split.VALIDATION: ["validation", "valid", "dev", "val"],
    Split.TEST: ["test", "testing", "eval", "evaluation"],
}
NON_WORDS_CHARS = "-._ 0-9"
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "{keyword}/**",
        "{keyword}[{sep}]*/**",
        "**[{sep}/]{keyword}/**",
        "**[{sep}/]{keyword}[{sep}]*/**",
    ]
elif config.FSSPEC_VERSION < version.parse("2023.12.0"):
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/*[{sep}/]{keyword}[{sep}]*", "{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "{keyword}/**/*",
        "{keyword}[{sep}]*/**/*",
        "**/*[{sep}/]{keyword}/**/*",
        "**/*[{sep}/]{keyword}[{sep}]*/**/*",
    ]
else:
    KEYWORDS_IN_FILENAME_BASE_PATTERNS = ["**/{keyword}[{sep}]*", "**/*[{sep}]{keyword}[{sep}]*"]
    KEYWORDS_IN_DIR_NAME_BASE_PATTERNS = [
        "**/{keyword}/**",
        "**/{keyword}[{sep}]*/**",
        "**/*[{sep}]{keyword}/**",
        "**/*[{sep}]{keyword}[{sep}]*/**",
    ]
DEFAULT_SPLITS = [Split.TRAIN, Split.VALIDATION, Split.TEST]
DEFAULT_PATTERNS_SPLIT_IN_FILENAME = {
    split: [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in SPLIT_KEYWORDS[split]
        for pattern in KEYWORDS_IN_FILENAME_BASE_PATTERNS
    ]
    for split in DEFAULT_SPLITS
}
DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME = {
    split: [
        pattern.format(keyword=keyword, sep=NON_WORDS_CHARS)
        for keyword in SPLIT_KEYWORDS[split]
        for pattern in KEYWORDS_IN_DIR_NAME_BASE_PATTERNS
    ]
    for split in DEFAULT_SPLITS
}
DEFAULT_PATTERNS_ALL = {
    Split.TRAIN: ["**"],
}

ALL_SPLIT_PATTERNS = [SPLIT_PATTERN_SHARDED]
ALL_DEFAULT_PATTERNS = [
    DEFAULT_PATTERNS_SPLIT_IN_DIR_NAME,
    DEFAULT_PATTERNS_SPLIT_IN_FILENAME,
    DEFAULT_PATTERNS_ALL,
]
if config.FSSPEC_VERSION < version.parse("2023.9.0"):
    # metadata files for ImageFolder and AudioFolder
    METADATA_PATTERNS = [
        "metadata.csv",
        "**/metadata.csv",
        "metadata.jsonl",
        "**/metadata.jsonl",
    ]
else:
    # metadata files for ImageFolder and AudioFolder
    METADATA_PATTERNS = [
        "**/metadata.csv",
        "**/metadata.jsonl",
    ]
WILDCARD_CHARACTERS = "*[]"
FILES_TO_IGNORE = [
    "README.md",
    "config.json",
    "dataset_info.json",
    "dataset_infos.json",
    "dummy_data.zip",
    "dataset_dict.json",
]


def contains_wildcards(pattern: str) -> bool:
    return any(wildcard_character in pattern for wildcard_character in WILDCARD_CHARACTERS)


def sanitize_patterns(patterns: Union[Dict, List, str]) -> Dict[str, Union[List[str], "DataFilesList"]]:
    """
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    """
    if isinstance(patterns, dict):
        return {str(key): value if isinstance(value, list) else [value] for key, value in patterns.items()}
    elif isinstance(patterns, str):
        return {SANITIZED_DEFAULT_SPLIT: [patterns]}
    elif isinstance(patterns, list):
        if any(isinstance(pattern, dict) for pattern in patterns):
            for pattern in patterns:
                if not (
                    isinstance(pattern, dict)
                    and len(pattern) == 2
                    and "split" in pattern
                    and isinstance(pattern.get("path"), (str, list))
                ):
                    raise ValueError(
                        f"Expected each split to have a 'path' key which can be a string or a list of strings, but got {pattern}"
                    )
            splits = [pattern["split"] for pattern in patterns]
            if len(set(splits)) != len(splits):
                raise ValueError(f"Some splits are duplicated in data_files: {splits}")
            return {
                str(pattern["split"]): pattern["path"] if isinstance(pattern["path"], list) else [pattern["path"]]
                for pattern in patterns
            }
        else:
            return {SANITIZED_DEFAULT_SPLIT: patterns}
    else:
        return sanitize_patterns(list(patterns))


def _is_inside_unrequested_special_dir(matched_rel_path: str, pattern: str) -> bool:
    """
    When a path matches a pattern, we additionally check if it's inside a special directory
    we ignore by default (if it starts with a double underscore).

    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
    mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── __pycache__
            └── b.txt

    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
    False
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
    False
    """
    # Check that every special directory in the path is mentioned explicitly in the pattern.
    # Since the path already matches the pattern, it is enough to compare how many special
    # directories appear in the parent path and in the parent pattern.
    data_dirs_to_ignore_in_path = [part for part in PurePath(matched_rel_path).parent.parts if part.startswith("__")]
    data_dirs_to_ignore_in_pattern = [part for part in PurePath(pattern).parent.parts if part.startswith("__")]
    return len(data_dirs_to_ignore_in_path) != len(data_dirs_to_ignore_in_pattern)


def _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(matched_rel_path: str, pattern: str) -> bool:
    """
    When a path matches a pattern, we additionally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    """
    # Check that every hidden part of the path is mentioned explicitly in the pattern.
    # Since the path already matches the pattern, it is enough to compare how many hidden
    # parts appear in the path and in the pattern.
    hidden_directories_in_path = [
        part for part in PurePath(matched_rel_path).parts if part.startswith(".") and not set(part) == {"."}
    ]
    hidden_directories_in_pattern = [
        part for part in PurePath(pattern).parts if part.startswith(".") and not set(part) == {"."}
    ]
    return len(hidden_directories_in_path) != len(hidden_directories_in_pattern)


def _get_data_files_patterns(pattern_resolver: Callable[[str], List[str]]) -> Dict[str, List[str]]:
    """
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files are returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    """
    # first check the split patterns like data/{split}-00000-of-00001.parquet
    for split_pattern in ALL_SPLIT_PATTERNS:
        pattern = split_pattern.replace("{split}", "*")
        try:
            data_files = pattern_resolver(pattern)
        except FileNotFoundError:
            continue
        if len(data_files) > 0:
            splits: Set[str] = {
                string_to_dict(xbasename(p), glob_pattern_to_regex(xbasename(split_pattern)))["split"]
                for p in data_files
            }
            if any(not re.match(_split_re, split) for split in splits):
                raise ValueError(f"Split name should match '{_split_re}' but got '{splits}'.")
            sorted_splits = [str(split) for split in DEFAULT_SPLITS if split in splits] + sorted(
                splits - set(DEFAULT_SPLITS)
            )
            return {split: [split_pattern.format(split=split)] for split in sorted_splits}
    # then check the default patterns based on train/validation/test splits
    for patterns_dict in ALL_DEFAULT_PATTERNS:
        non_empty_splits = []
        for split, patterns in patterns_dict.items():
            for pattern in patterns:
                try:
                    data_files = pattern_resolver(pattern)
                except FileNotFoundError:
                    continue
                if len(data_files) > 0:
                    non_empty_splits.append(split)
                    break
        if non_empty_splits:
            return {split: patterns_dict[split] for split in non_empty_splits}
    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")


def _get_metadata_files_patterns(pattern_resolver: Callable[[str], List[str]]) -> List[str]:
    """
    Get the supported metadata patterns from a directory or repository.
    """
    non_empty_patterns = []
    for pattern in METADATA_PATTERNS:
        try:
            metadata_files = pattern_resolver(pattern)
            if len(metadata_files) > 0:
                non_empty_patterns.append(pattern)
        except FileNotFoundError:
            pass
    if non_empty_patterns:
        return non_empty_patterns
    raise FileNotFoundError(f"Couldn't resolve pattern {pattern} with resolver {pattern_resolver}")


def resolve_pattern(
    pattern: str,
    base_path: str,
    allowed_extensions: Optional[List[str]] = None,
    download_config: Optional[DownloadConfig] = None,
) -> List[str]:
    """
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicitly mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        ['/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    """
    if is_relative_path(pattern):
        pattern = xjoin(base_path, pattern)
    elif is_local_path(pattern):
        base_path = os.path.splitdrive(pattern)[0] + os.sep
    else:
        base_path = ""
    pattern, storage_options = _prepare_path_and_storage_options(pattern, download_config=download_config)
    fs, fs_pattern = url_to_fs(pattern, **storage_options)
    files_to_ignore = set(FILES_TO_IGNORE) - {xbasename(pattern)}
    protocol = fs.protocol if isinstance(fs.protocol, str) else fs.protocol[0]
    protocol_prefix = protocol + "://" if protocol != "file" else ""
    glob_kwargs = {}
    if protocol == "hf" and config.HF_HUB_VERSION >= version.parse("0.20.0"):
        # 10 times faster glob with detail=True (ignores costly info like lastCommit)
        glob_kwargs["expand_info"] = False
    # keep only files, and skip special files and hidden files/dirs unless explicitly requested
    matched_paths = [
        filepath if filepath.startswith(protocol_prefix) else protocol_prefix + filepath
        for filepath, info in fs.glob(pattern, detail=True, **glob_kwargs).items()
        if info["type"] == "file"
        and (xbasename(filepath) not in files_to_ignore)
        and not _is_inside_unrequested_special_dir(filepath, fs_pattern)
        and not _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(filepath, fs_pattern)
    ]
    if allowed_extensions is not None:
        out = [
            filepath
            for filepath in matched_paths
            if any("." + suffix in allowed_extensions for suffix in xbasename(filepath).split(".")[1:])
        ]
        if len(out) < len(matched_paths):
            invalid_matched_files = list(set(matched_paths) - set(out))
            logger.info(
                f"Some files matched the pattern '{pattern}' but don't have valid data file extensions: {invalid_matched_files}"
            )
    else:
        out = matched_paths
    if not out:
        error_msg = f"Unable to find '{pattern}'"
        if allowed_extensions is not None:
            error_msg += f" with any supported extension {list(allowed_extensions)}"
        raise FileNotFoundError(error_msg)
    return out


def get_data_patterns(base_path: str, download_config: Optional[DownloadConfig] = None) -> Dict[str, List[str]]:
    """
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files are returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {'train': ['**']}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['**/train[-._ 0-9]*', '**/*[-._ 0-9]train[-._ 0-9]*', '**/training[-._ 0-9]*', '**/*[-._ 0-9]training[-._ 0-9]*'],
         'test': ['**/test[-._ 0-9]*', '**/*[-._ 0-9]test[-._ 0-9]*', '**/testing[-._ 0-9]*', '**/*[-._ 0-9]testing[-._ 0-9]*', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['**/train/**', '**/train[-._ 0-9]*/**', '**/*[-._ 0-9]train/**', '**/*[-._ 0-9]train[-._ 0-9]*/**', ...],
         'test': ['**/test/**', '**/test[-._ 0-9]*/**', '**/*[-._ 0-9]test/**', '**/*[-._ 0-9]test[-._ 0-9]*/**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    """
    resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
    try:
        return _get_data_files_patterns(resolver)
    except FileNotFoundError:
        raise EmptyDatasetError(f"The directory at {base_path} doesn't contain any data files") from None


def get_metadata_patterns(
    base_path: str,
    download_config: Optional[DownloadConfig] = None,
) -> List[str]:
    """
    Get the supported metadata patterns from a local directory.
    """
    resolver = partial(resolve_pattern, base_path=base_path, download_config=download_config)
    try:
        return _get_metadata_files_patterns(resolver)
    except FileNotFoundError:
        raise FileNotFoundError(f"The directory at {base_path} doesn't contain any metadata file") from None


def _get_single_origin_metadata(
    data_file: str,
    download_config: Optional[DownloadConfig] = None,
) -> Tuple[str]:
    data_file, storage_options = _prepare_path_and_storage_options(data_file, download_config=download_config)
    fs, *_ = url_to_fs(data_file, **storage_options)
    if isinstance(fs, HfFileSystem):
        resolved_path = fs.resolve_path(data_file)
        return (resolved_path.repo_id, resolved_path.revision)
    elif isinstance(fs, HTTPFileSystem) and data_file.startswith(config.HF_ENDPOINT):
        hffs = HfFileSystem(endpoint=config.HF_ENDPOINT, token=download_config.token)
        data_file = "hf://" + data_file[len(config.HF_ENDPOINT) + 1 :].replace("/resolve/", "@", 1)
        resolved_path = hffs.resolve_path(data_file)
        return (resolved_path.repo_id, resolved_path.revision)
    info = fs.info(data_file)
    # s3fs uses "ETag", gcsfs uses "etag", and for local files we simply use the mtime
    for key in ["ETag", "etag", "mtime"]:
        if key in info:
            return (str(info[key]),)
    return ()


def _get_origin_metadata(
    data_files: List[str],
    download_config: Optional[DownloadConfig] = None,
    max_workers: Optional[int] = None,
) -> List[Tuple[str]]:
    max_workers = max_workers if max_workers is not None else config.HF_DATASETS_MULTITHREADING_MAX_WORKERS
    return thread_map(
        partial(_get_single_origin_metadata, download_config=download_config),
        data_files,
        max_workers=max_workers,
        tqdm_class=hf_tqdm,
        desc="Resolving data files",
        disable=len(data_files) <= 16,
    )


class DataFilesList(List[str]):
    """
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    """

    def __init__(self, data_files: List[str], origin_metadata: List[Tuple[str]]) -> None:
        super().__init__(data_files)
        self.origin_metadata = origin_metadata

    def __add__(self, other: "DataFilesList") -> "DataFilesList":
        return DataFilesList([*self, *other], self.origin_metadata + other.origin_metadata)

    @classmethod
    def from_hf_repo(
        cls,
        patterns: List[str],
        dataset_info: huggingface_hub.hf_api.DatasetInfo,
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        base_path = f"hf://datasets/{dataset_info.id}@{dataset_info.sha}/{base_path or ''}".rstrip("/")
        return cls.from_patterns(
            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
        )

    @classmethod
    def from_local_or_remote(
        cls,
        patterns: List[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        return cls.from_patterns(
            patterns, base_path=base_path, allowed_extensions=allowed_extensions, download_config=download_config
        )

    @classmethod
    def from_patterns(
        cls,
        patterns: List[str],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        data_files = []
        for pattern in patterns:
            try:
                data_files.extend(
                    resolve_pattern(
                        pattern,
                        base_path=base_path,
                        allowed_extensions=allowed_extensions,
                        download_config=download_config,
                    )
                )
            except FileNotFoundError:
                if not has_magic(pattern):
                    raise
        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
        return cls(data_files, origin_metadata)

    def filter_extensions(self, extensions: List[str]) -> "DataFilesList":
        pattern = "|".join("\\" + ext for ext in extensions)
        pattern = re.compile(f".*({pattern})(\\..+)?$")
        return DataFilesList(
            [data_file for data_file in self if pattern.match(data_file)],
            origin_metadata=self.origin_metadata,
        )


class DataFilesDict(Dict[str, DataFilesList]):
    """
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns:
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see ``DataFilesList``.

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    """

    @classmethod
    def from_local_or_remote(
        cls,
        patterns: Dict[str, Union[List[str], DataFilesList]],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        out = cls()
        for key, patterns_for_key in patterns.items():
            out[key] = (
                patterns_for_key
                if isinstance(patterns_for_key, DataFilesList)
                else DataFilesList.from_local_or_remote(
                    patterns_for_key,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
            )
        return out

    @classmethod
    def from_hf_repo(
        cls,
        patterns: Dict[str, Union[List[str], DataFilesList]],
        dataset_info: huggingface_hub.hf_api.DatasetInfo,
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        out = cls()
        for key, patterns_for_key in patterns.items():
            out[key] = (
                patterns_for_key
                if isinstance(patterns_for_key, DataFilesList)
                else DataFilesList.from_hf_repo(
                    patterns_for_key,
                    dataset_info=dataset_info,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
            )
        return out

    @classmethod
    def from_patterns(
        cls,
        patterns: Dict[str, Union[List[str], DataFilesList]],
        base_path: Optional[str] = None,
        allowed_extensions: Optional[List[str]] = None,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        out = cls()
        for key, patterns_for_key in patterns.items():
            out[key] = (
                patterns_for_key
                if isinstance(patterns_for_key, DataFilesList)
                else DataFilesList.from_patterns(
                    patterns_for_key,
                    base_path=base_path,
                    allowed_extensions=allowed_extensions,
                    download_config=download_config,
                )
            )
        return out

    def filter_extensions(self, extensions: List[str]) -> "DataFilesDict":
        out = type(self)()
        for key, data_files_list in self.items():
            out[key] = data_files_list.filter_extensions(extensions)
        return out


class DataFilesPatternsList(List[str]):
    """
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or None to keep all the files for the pattern.
    """

    def __init__(self, patterns: List[str], allowed_extensions: List[Optional[List[str]]]) -> None:
        super().__init__(patterns)
        self.allowed_extensions = allowed_extensions

    def __add__(self, other: "DataFilesPatternsList") -> "DataFilesPatternsList":
        return DataFilesPatternsList([*self, *other], self.allowed_extensions + other.allowed_extensions)

    @classmethod
    def from_patterns(
        cls, patterns: List[str], allowed_extensions: Optional[List[str]] = None
    ) -> "DataFilesPatternsList":
        return cls(patterns, [allowed_extensions] * len(patterns))

    def resolve(
        self,
        base_path: str,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesList":
        base_path = base_path if base_path is not None else Path().resolve().as_posix()
        data_files = []
        for pattern, allowed_extensions in zip(self, self.allowed_extensions):
            try:
                data_files.extend(
                    resolve_pattern(
                        pattern,
                        base_path=base_path,
                        allowed_extensions=allowed_extensions,
                        download_config=download_config,
                    )
                )
            except FileNotFoundError:
                if not has_magic(pattern):
                    raise
        origin_metadata = _get_origin_metadata(data_files, download_config=download_config)
        return DataFilesList(data_files, origin_metadata)

    def filter_extensions(self, extensions: List[str]) -> "DataFilesPatternsList":
        return DataFilesPatternsList(
            self,
            [(allowed_extensions or []) + extensions for allowed_extensions in self.allowed_extensions],
        )


class DataFilesPatternsDict(Dict[str, DataFilesPatternsList]):
    """
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    """

    @classmethod
    def from_patterns(
        cls, patterns: Dict[str, List[str]], allowed_extensions: Optional[List[str]] = None
    ) -> "DataFilesPatternsDict":
        out = cls()
        for key, patterns_for_key in patterns.items():
            out[key] = (
                patterns_for_key
                if isinstance(patterns_for_key, DataFilesPatternsList)
                else DataFilesPatternsList.from_patterns(patterns_for_key, allowed_extensions=allowed_extensions)
            )
        return out

    def resolve(
        self,
        base_path: str,
        download_config: Optional[DownloadConfig] = None,
    ) -> "DataFilesDict":
        out = DataFilesDict()
        for key, data_files_patterns_list in self.items():
            out[key] = data_files_patterns_list.resolve(base_path, download_config)
        return out

    def filter_extensions(self, extensions: List[str]) -> "DataFilesPatternsDict":
        out = type(self)()
        for key, data_files_patterns_list in self.items():
            out[key] = data_files_patterns_list.filter_extensions(extensions)
        return out