o
    TZhi|                     @   s   d dl Z d dlZd dlmZ d dlmZ d dlmZmZ d dl	m
Z
mZmZmZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl'm)Z* ddl+m,Z,m-Z- ddl.m/Z/m0Z0 e1e&j2Z3e(4e5Z6G dd de1Z7G dd de8Z9dZ:e&j2ddge&j;g de&j<g diZ=dZ>ej?e@dk rd d!gZAnej?e@d"k rd d#gZAng d$ZAe&j2e&j;e&j<gZBd%d& eBD ZCe&j2d'giZDe:gZEeCeDgZFej?e@dk rg d(ZGnd)d*gZGd+ZHg d,ZId-e1d.eJfd/d0ZKd1eeee1f d.ee1eee1 d2f f fd3d4ZLd5e1d-e1d.eJfd6d7ZMd5e1d-e1d.eJfd8d9ZNd:e
e1gee1 f d.ee1ee1 f fd;d<ZOd:e
e1gee1 f d.ee1 fd=d>ZP		dVd-e1d?e1d@eee1  dAee d.ee1 f
dBdCZQdWd?e1dAee d.ee1ee1 f fdDdEZR	dWd?e1dAee d.ee1 fdFdGZS	dWdHe1dAee d.ee1 fdIdJZT	K	dXdLee1 dAee d.ee1 fdMdNZUG dOd2 d2ee1 ZVG dPdQ dQee1eVf ZWG dRdS dSee1 ZXG dTdU dUee1eXf ZYdS )Y    N)partial)	has_magic)PathPurePath)CallableDictListOptionalSetTupleUnion)get_fs_token_paths)HTTPFileSystem)HfFileSystem)version)
thread_map   )config)DownloadConfig)!_prepare_path_and_storage_options	xbasenamexjoin)	_split_re)Split)logging)tqdm)is_local_pathis_relative_path)glob_pattern_to_regexstring_to_dictc                   @      e Zd ZdS )UrlN__name__
__module____qualname__ r&   r&   J/var/www/html/lang_env/lib/python3.10/site-packages/datasets/data_files.pyr!           r!   c                   @   r    )EmptyDatasetErrorNr"   r&   r&   r&   r'   r)   $   r(   r)   zFdata/{split}-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*trainZtraining)Z
validationZvaliddevval)testtestingevalZ
evaluationz-._ 0-9z2023.9.0z{keyword}[{sep}/]**z**[{sep}/]{keyword}[{sep}/]**z	2023.12.0z**/*[{sep}/]{keyword}[{sep}/]**)z**/{keyword}[{sep}]*z**/{keyword}/**z**/*[{sep}]{keyword}[{sep}]*z**/*[{sep}]{keyword}[{sep}]*/**z**/{keyword}[{sep}]*/**z**/*[{sep}]{keyword}/**c                 C   s    i | ]}|d d t | D qS )c                 S   s$   g | ]}t D ]	}|j|td qqS ))keywordsep)#KEYWORDS_IN_PATH_NAME_BASE_PATTERNSformatNON_WORDS_CHARS).0r0   patternr&   r&   r'   
<listcomp>@   s    z<dictcomp>.<listcomp>)SPLIT_KEYWORDSr5   splitr&   r&   r'   
<dictcomp>?   s    r;   z**)zmetadata.csv**/metadata.csvzmetadata.jsonl**/metadata.jsonlr<   r=   z*[])z	README.mdzconfig.jsonzdataset_info.jsonzdataset_infos.jsonzdummy_data.zipzdataset_dict.jsonr6   returnc                    s   t  fddtD S )Nc                 3   s    | ]}| v V  qd S Nr&   )r5   Zwilcard_characterr6   r&   r'   	<genexpr>i       z%contains_wildcards.<locals>.<genexpr>)anyWILDCARD_CHARACTERSr@   r&   r@   r'   contains_wildcardsh   s   rE   patternsDataFilesListc                 C   s   t | trdd |  D S t | trt| giS t | trntdd | D rj| D ]"}t |trCt|dkrCd|v rCt |dttfsJt	d| q(d	d
 | D }tt
|t|krct	d| dd | D S t| iS tt| S )a/  
    Take the data_files patterns from the user, and format them into a dictionary.
    Each key is the name of the split, and each value is a list of data files patterns (paths or urls).
    The default split is "train".

    Returns:
        patterns: dictionary of split_name -> list of patterns
    c                 S   s*   i | ]\}}t |t|tr|n|gqS r&   str
isinstancelist)r5   keyvaluer&   r&   r'   r;   v   s   * z%sanitize_patterns.<locals>.<dictcomp>c                 s   s    | ]}t |tV  qd S r?   )rJ   dictr5   r6   r&   r&   r'   rA   z   s    z$sanitize_patterns.<locals>.<genexpr>   r:   pathz]Expected each split to have a 'path' key which can be a string or a list of strings, but got c                 S   s   g | ]}|d  qS r:   r&   rO   r&   r&   r'   r7          z%sanitize_patterns.<locals>.<listcomp>z*Some splits are duplicated in data_files: c                 S   s6   i | ]}t |d  t|d tr|d n|d gqS )r:   rQ   rH   rO   r&   r&   r'   r;      s    ()rJ   rN   itemsrI   SANITIZED_DEFAULT_SPLITrK   rC   lenget
ValueErrorsetsanitize_patterns)rF   r6   splitsr&   r&   r'   rZ   l   s2   
	


rZ   matched_rel_pathc                 C   s<   dd t | jjD }dd t |jjD }t|t|kS )u  
    When a path matches a pattern, we additionnally check if it's inside a special directory
    we ignore by default (if it starts with a double underscore).

    Users can still explicitly request a filepath inside such a directory if "__pycache__" is
    mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── __pycache__
            └── b.txt

    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "**")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "*/b.txt")
    True
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__pycache__/*")
    False
    >>> _is_inside_unrequested_special_dir("__pycache__/b.txt", "__*/*")
    False
    c                 S      g | ]	}| d r|qS __
startswithr5   partr&   r&   r'   r7          z6_is_inside_unrequested_special_dir.<locals>.<listcomp>c                 S   r]   r^   r`   rb   r&   r&   r'   r7      rd   )r   parentpartsrV   )r\   r6   Zdata_dirs_to_ignore_in_pathZdata_dirs_to_ignore_in_patternr&   r&   r'   "_is_inside_unrequested_special_dir   s   rg   c                 C   s8   dd t | jD }dd t |jD }t|t|kS )u:  
    When a path matches a pattern, we additionnally check if it's a hidden file or if it's inside
    a hidden directory we ignore by default, i.e. if the file name or a parent directory name starts with a dot.

    Users can still explicitly request a filepath that is hidden or is inside a hidden directory
    if the hidden part is mentioned explicitly in the requested pattern.

    Some examples:

    base directory:

        ./
        └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_file.txt", ".*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── a.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".*/*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/a.txt", ".hidden_dir/*")
    False

    base directory:

        ./
        └── .hidden_dir
            └── .hidden_file.txt

    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", "**")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".*/.*")
    False
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/*")
    True
    >>> _is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir(".hidden_dir/.hidden_file.txt", ".hidden_dir/.*")
    False
    c                 S   (   g | ]}| d rt|d hks|qS .ra   rY   rb   r&   r&   r'   r7          
zS_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir.<locals>.<listcomp>c                 S   rh   ri   rk   rb   r&   r&   r'   r7      rl   )r   rf   rV   )r\   r6   Zhidden_directories_in_pathZhidden_directories_in_patternr&   r&   r'   ?_is_unrequested_hidden_file_or_is_inside_unrequested_hidden_dir   s   5rm   pattern_resolverc                    sF  t D ]Wdd}z| |}W n	 ty   Y qw t|dkrYfdd|D tdd D r=tdt d	 d
fddtD tt	t  }fdd|D   S qt
D ]< g }  D ]&\}}|D ]}z| |}W n	 ty{   Y qjw t|dkr||  nqjqd|r fdd|D   S q\td| d|  )a+  
    Get the default pattern from a directory or repository by testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    z{split}*r   c                    s&   h | ]}t t|tt d  qS rR   )r   r   r   )r5   psplit_patternr&   r'   	<setcomp>   s    z+_get_data_files_patterns.<locals>.<setcomp>c                 s   s    | ]
}t t| V  qd S r?   )rematchr   r9   r&   r&   r'   rA     s    z+_get_data_files_patterns.<locals>.<genexpr>zSplit name should match 'z'' but got 'z'.c                    s   g | ]
}| v rt |qS r&   )rI   r9   )r[   r&   r'   r7         z,_get_data_files_patterns.<locals>.<listcomp>c                    s   i | ]
}| j |d gqS )rR   )r3   r9   rq   r&   r'   r;   	  rv   z,_get_data_files_patterns.<locals>.<dictcomp>c                    s   i | ]}| | qS r&   r&   r9   )patterns_dictr&   r'   r;     s    Couldn't resolve pattern  with resolver )ALL_SPLIT_PATTERNSreplaceFileNotFoundErrorrV   rC   rX   r   DEFAULT_SPLITSsortedrY   ALL_DEFAULT_PATTERNSrT   append)rn   r6   
data_filesZsorted_splitsZnon_empty_splitsr:   rF   r&   )rw   rr   r[   r'   _get_data_files_patterns   sH   


r   c              	   C   s^   g }t D ]}z| |}t|dkr|| W q ty    Y qw |r%|S td| d|  )zM
    Get the supported metadata patterns from a directory or repository.
    r   rx   ry   )METADATA_PATTERNSrV   r   r|   )rn   Znon_empty_patternsr6   Zmetadata_filesr&   r&   r'   _get_metadata_files_patterns  s   
r   	base_pathallowed_extensionsdownload_configc                    s  t | r
t|| } nt| rtj| d tj }nd}t| |d\} }t| |d\}}}|	dd 	dd p;|j
| 	dd 	dd ttt| h t|jtrZ|jn|jd }|dkrg|d ndi }|d	kr{tjtd
kr{d|d< fdd|j| fddi| D }	 dur fdd|	D }
t|
t|	k rtt|	t|
 }td|  d|  n|	}
|
sd|  d} dur|dt  7 }t||
S )a{  
    Resolve the paths and URLs of the data files from the pattern passed by the user.

    You can use patterns to resolve multiple local files. Here are a few examples:
    - *.csv to match all the CSV files at the first level
    - **.csv to match all the CSV files at any level
    - data/* to match all the files inside "data"
    - data/** to match all the files inside "data" and its subdirectories

    The patterns are resolved using the fsspec glob. In fsspec>=2023.12.0 this is equivalent to
    Python's glob.glob, Path.glob, Path.match and fnmatch where ** is unsupported with a prefix/suffix
    other than a forward slash /.

    More generally:
    - '*' matches any character except a forward-slash (to match just the file or directory name)
    - '**' matches any character including a forward-slash /

    Hidden files and directories (i.e. whose names start with a dot) are ignored, unless they are explicitly requested.
    The same applies to special directories that start with a double underscore like "__pycache__".
    You can still include one if the pattern explicilty mentions it:
    - to include a hidden file: "*/.hidden.txt" or "*/.*"
    - to include a hidden directory: ".hidden/*" or ".*/*"
    - to include a special directory: "__special__/*" or "__*/*"

    Example::

        >>> from datasets.data_files import resolve_pattern
        >>> base_path = "."
        >>> resolve_pattern("docs/**/*.py", base_path)
        [/Users/mariosasko/Desktop/projects/datasets/docs/source/_config.py']

    Args:
        pattern (str): Unix pattern or paths or URLs of the data files to resolve.
            The paths can be absolute or relative to base_path.
            Remote filesystems using fsspec are supported, e.g. with the hf:// protocol.
        base_path (str): Base path to use when resolving relative paths.
        allowed_extensions (Optional[list], optional): White-list of file extensions to use. Defaults to None (all extensions).
            For example: allowed_extensions=[".csv", ".json", ".txt", ".parquet"]
    Returns:
        List[str]: List of paths or URLs to the local or remote files that match the patterns.
    r    r   storage_optionsz::z://fileZhfz0.20.0FZexpand_infoc                    sz   g | ]9\}}|d  dkr;t | vrttj|tjsttj|tjs|r7|n| qS )typer   )r   rg   osrQ   relpathrm   ra   )r5   filepathinfo)files_to_ignorefs_base_path
fs_patternprotocol_prefixr&   r'   r7   l  s    z#resolve_pattern.<locals>.<listcomp>detailTNc                    s8   g | ]}t  fd dt|ddd D r|qS )c                 3   s    | ]	}d |  v V  qdS )rj   Nr&   )r5   suffixr   r&   r'   rA   |  s    z-resolve_pattern.<locals>.<listcomp>.<genexpr>rj   r   N)rC   r   r:   )r5   r   r   r&   r'   r7   y  s    &z Some files matched the pattern 'z-' but don't have valid data file extensions: zUnable to find ''z with any supported extension )r   r   r   r   rQ   
splitdriver1   r   r   r:   Zroot_markerrY   FILES_TO_IGNOREr   rJ   protocolrI   r   ZHF_HUB_VERSIONr   parseglobrT   rV   rK   loggerr   r|   )r6   r   r   r   r   fs_r   Zglob_kwargsZmatched_pathsoutZinvalid_matched_files	error_msgr&   )r   r   r   r   r   r'   resolve_pattern,  sF   /
r   c                 C   s:   t t| |d}zt|W S  ty   td|  ddw )uh
  
    Get the default pattern from a directory testing all the supported patterns.
    The first patterns to return a non-empty list of data files is returned.

    Some examples of supported patterns:

    Input:

        my_dataset_repository/
        ├── README.md
        └── dataset.csv

    Output:

        {"train": ["**"]}

    Input:

        my_dataset_repository/
        ├── README.md
        ├── train.csv
        └── test.csv

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train.csv
            └── test.csv

        my_dataset_repository/
        ├── README.md
        ├── train_0.csv
        ├── train_1.csv
        ├── train_2.csv
        ├── train_3.csv
        ├── test_0.csv
        └── test_1.csv

    Output:

        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train/
            │   ├── shard_0.csv
            │   ├── shard_1.csv
            │   ├── shard_2.csv
            │   └── shard_3.csv
            └── test/
                ├── shard_0.csv
                └── shard_1.csv

    Output:

        {'train': ['train[-._ 0-9/]**', '**/*[-._ 0-9/]train[-._ 0-9/]**', 'training[-._ 0-9/]**', '**/*[-._ 0-9/]training[-._ 0-9/]**'],
         'test': ['test[-._ 0-9/]**', '**/*[-._ 0-9/]test[-._ 0-9/]**', 'testing[-._ 0-9/]**', '**/*[-._ 0-9/]testing[-._ 0-9/]**', ...]}

    Input:

        my_dataset_repository/
        ├── README.md
        └── data/
            ├── train-00000-of-00003.csv
            ├── train-00001-of-00003.csv
            ├── train-00002-of-00003.csv
            ├── test-00000-of-00001.csv
            ├── random-00000-of-00003.csv
            ├── random-00001-of-00003.csv
            └── random-00002-of-00003.csv

    Output:

        {'train': ['data/train-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'test': ['data/test-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*'],
         'random': ['data/random-[0-9][0-9][0-9][0-9][0-9]-of-[0-9][0-9][0-9][0-9][0-9]*.*']}

    In order, it first tests if SPLIT_PATTERN_SHARDED works, otherwise it tests the patterns in ALL_DEFAULT_PATTERNS.
    r   r   The directory at z doesn't contain any data filesN)r   r   r   r|   r)   r   r   resolverr&   r&   r'   get_data_patterns  s   T
r   c                 C   s:   t t| |d}zt|W S  ty   td|  ddw )zE
    Get the supported metadata patterns from a local directory.
    r   r   z" doesn't contain any metadata fileN)r   r   r   r|   r   r&   r&   r'   get_metadata_patterns  s   
r   	data_filec           	      C   s   t | |d\} }t| |d\}}}t|tr!|| }|j|jfS t|trQ| t	j
rQtt	j
|jd}d| tt	j
d d  ddd } || }|j|jfS || }dD ]}||v rgt|| f  S qXd	S )
Nr   r   )Zendpointtokenzhf://r   z	/resolve/@)ETagetagmtimer&   )r   r   rJ   r   Zresolve_pathZrepo_idrevisionr   ra   r   ZHF_ENDPOINTr   rV   r{   r   rI   )	r   r   r   r   r   resolved_pathZhffsr   rL   r&   r&   r'   _get_single_origin_metadata  s    

$

r   @   r   c                 C   s(   t tt|d| |tdt| dkpd dS )Nr   zResolving data files   )max_workersZ
tqdm_classZdescdisable)r   r   r   hf_tqdmrV   )r   r   r   r&   r&   r'   _get_origin_metadata  s   
r   c                       s  e Zd ZdZdee deee  f fddZdd Ze				dd	ee d
e
jjdee deee  dee dd fddZe				dd	ee dee deee  dee dd f
ddZe				dd	ee dee deee  dee dd f
ddZdee dd fddZ  ZS )rG   a  
    List of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover DataFilesList has an additional attribute ``origin_metadata``.
    It can store:
    - the last modified time of local files
    - ETag of remote files
    - commit sha of a dataset repository

    Thanks to this additional attribute, it is possible to hash the list
    and get a different hash if and only if at least one file changed.
    This is useful for caching Dataset objects that are obtained from a list of data files.
    r   origin_metadatac                       t  | || _d S r?   )super__init__r   )selfr   r   	__class__r&   r'   r   .  s   
zDataFilesList.__init__c                 C      t g | || j|j S r?   )rG   r   r   otherr&   r&   r'   __add__2     zDataFilesList.__add__NrF   dataset_infor   r   r   r>   c                 C   s6   d|j  d|j d|pd d}| j||||dS )Nzhf://datasets/r   /r   r   r   r   )idsharstripfrom_patterns)clsrF   r   r   r   r   r&   r&   r'   from_hf_repo5  s   $	zDataFilesList.from_hf_repoc                 C   s,   |d ur|nt    }| j||||dS Nr   )r   resolveas_posixr   )r   rF   r   r   r   r&   r&   r'   from_local_or_remoteC  s   z"DataFilesList.from_local_or_remotec              	   C   st   |d ur|nt    }g }|D ]}z|t||||d W q ty.   t|s, Y qw t||d}| ||S Nr   r   )r   r   r   extendr   r|   r   r   )r   rF   r   r   r   r   r6   r   r&   r&   r'   r   P  s&   
zDataFilesList.from_patterns
extensionsc                    sB   d dd |D  td  d t fdd| D | jdS )	N|c                 s   s    | ]}d | V  qdS )\Nr&   )r5   extr&   r&   r'   rA   k  rB   z2DataFilesList.filter_extensions.<locals>.<genexpr>z.*(z	)(\..+)?$c                    s   g | ]	}  |r|qS r&   )ru   )r5   r   r@   r&   r'   r7   n  rd   z3DataFilesList.filter_extensions.<locals>.<listcomp>)r   )joinrt   compilerG   r   r   r   r&   r@   r'   filter_extensionsj  s   zDataFilesList.filter_extensionsNNN)r#   r$   r%   __doc__r   rI   r   r   r   classmethodhuggingface_hubhf_apiDatasetInfor	   r   r   r   r   r   __classcell__r&   r&   r   r'   rG     sh    "


c                   @   s  e Zd ZdZe			ddeeeee e	f f de
e de
ee  de
e dd f
dd	Ze			ddeeeee e	f f d
ejjde
e de
ee  de
e dd fddZe			ddeeeee e	f f de
e de
ee  de
e dd f
ddZdee dd fddZdS )DataFilesDicta  
    Dict of split_name -> list of data files (absolute local paths or URLs).
    It has two construction methods given the user's data files patterns :
    - ``from_hf_repo``: resolve patterns inside a dataset repository
    - ``from_local_or_remote``: resolve patterns from a local path

    Moreover each list is a DataFilesList. It is possible to hash the dictionary
    and get a different hash if and only if at least one file changed.
    For more info, see ``DataFilesList``.

    This is useful for caching Dataset objects that are obtained from a list of data files.

    Changing the order of the keys of this dictionary also doesn't change its hash.
    NrF   r   r   r   r>   c                 C   @   |  }|  D ]\}}t|tstj||||dn|||< q|S r   )rT   rJ   rG   r   r   rF   r   r   r   r   rL   patterns_for_keyr&   r&   r'   r        
z"DataFilesDict.from_local_or_remoter   c           	      C   sB   |  }|  D ]\}}t|tstj|||||dn|||< q|S )N)r   r   r   r   )rT   rJ   rG   r   )	r   rF   r   r   r   r   r   rL   r   r&   r&   r'   r     s   		zDataFilesDict.from_hf_repoc                 C   r   r   )rT   rJ   rG   r   r   r&   r&   r'   r     r   zDataFilesDict.from_patternsr   c                 C   .   t |  }|  D ]\}}||||< q	|S r?   r   rT   r   )r   r   r   rL   Zdata_files_listr&   r&   r'   r        
zDataFilesDict.filter_extensionsr   )r#   r$   r%   r   r   r   rI   r   r   rG   r	   r   r   r   r   r   r   r   r   r&   r&   r&   r'   r   s  sd    


r   c                       s   e Zd ZdZdee deeee   f fddZdd Ze		ddee deee  d	d
fddZ
	ddedee d	dfddZdee d	dfddZ  ZS )DataFilesPatternsListz
    List of data files patterns (absolute local paths or URLs).
    For each pattern there should also be a list of allowed extensions
    to keep, or a None ot keep all the files for the pattern.
    rF   r   c                    r   r?   )r   r   r   )r   rF   r   r   r&   r'   r     s   
zDataFilesPatternsList.__init__c                 C   r   r?   )rG   r   r   r&   r&   r'   r     r   zDataFilesPatternsList.__add__Nr>   DataFilesPatternsDictc                 C   s   | ||gt | S r?   )rV   )r   rF   r   r&   r&   r'   r     s   z#DataFilesPatternsList.from_patternsr   r   rG   c              	   C   s   |d ur|nt    }g }t| | jD ]\}}z|t||||d W q ty4   t|s2 Y qw t	||d}t
||S r   )r   r   r   zipr   r   r   r|   r   r   rG   )r   r   r   r   r6   r   r   r&   r&   r'   r     s&   
zDataFilesPatternsList.resolver   c                    s   t |  fdd| jD S )Nc                    s   g | ]}|  qS r&   r&   )r5   r   r   r&   r'   r7     rS   z;DataFilesPatternsList.filter_extensions.<locals>.<listcomp>)r   r   r   r&   r   r'   r     s   z'DataFilesPatternsList.filter_extensionsr?   )r#   r$   r%   r   r   rI   r	   r   r   r   r   r   r   r   r   r&   r&   r   r'   r     s4    

r   c                   @   sv   e Zd ZdZe	ddeeee f deee  dd fddZ		dded	ee
 dd
fddZdee dd fddZdS )r   z[
    Dict of split_name -> list of data files patterns (absolute local paths or URLs).
    NrF   r   r>   c                 C   s<   |  }|  D ]\}}t|tstj||dn|||< q|S )Nr   )rT   rJ   r   r   )r   rF   r   r   rL   r   r&   r&   r'   r     s   z#DataFilesPatternsDict.from_patternsr   r   r   c                 C   s,   t  }|  D ]\}}|||||< q|S r?   )r   rT   r   )r   r   r   r   rL   data_files_patterns_listr&   r&   r'   r     s   zDataFilesPatternsDict.resolver   c                 C   r   r?   r   )r   r   r   rL   r   r&   r&   r'   r   "  r   z'DataFilesPatternsDict.filter_extensionsr?   )r#   r$   r%   r   r   r   rI   r   r	   r   r   r   r   r&   r&   r&   r'   r     s(    


r   )NNr?   )r   N)Zr   rt   	functoolsr   r   r   pathlibr   r   typingr   r   r   r	   r
   r   r   r   Zfsspecr   Zfsspec.implementations.httpr   r   	packagingr   Ztqdm.contrib.concurrentr   r   r   downloadr   Z#download.streaming_download_managerr   r   r   Znamingr   r[   r   utilsr   r   r   Zutils.file_utilsr   r   Zutils.py_utilsr   r   rI   ZTRAINrU   Z
get_loggerr#   r   r!   r|   r)   ZSPLIT_PATTERN_SHARDEDZ
VALIDATIONTESTr8   r4   ZFSSPEC_VERSIONr   r2   r}   Z#DEFAULT_PATTERNS_SPLIT_IN_PATH_NAMEZDEFAULT_PATTERNS_ALLrz   r   r   rD   r   boolrE   rZ   rg   rm   r   r   r   r   r   r   r   rG   r   r   r   r&   r&   r&   r'   <module>   s    $






	


0&!,>$*

(a]


W[5