o
    TZh0                     @   sh  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZmZmZmZ d dlZd dlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ee Z!G dd dej"Z#de$deee$ e$f fddZ%edG dd de&Z'G dd dee$ee$ef f Z(i dg dg dg dg dg dg d g d!g d"g d#g d$g d%g d&g d'g d(g d)g d*g i d+g d,g d-g d.g d/g d0g d1g d2g d3g d4g d5g d6g d7g d8g d9g d:g d;g g g g d<Z)e d=kr2d d>l*m+Z+ e+d?d@Z,e,-dA e,. Z/e	e/j0Z0e'1e0Z2e3e2 e24e0 dS dS )B    N)Counter)groupby)
itemgetter)Path)AnyClassVarDictListOptionalTupleUnion)DatasetCardData   )METADATA_CONFIGS_FIELD)DatasetInfoDatasetInfosDict)	_split_re)
get_logger   )
deprecatedc                       s&   e Zd Zdd Zd fdd	Z  ZS )_NoDuplicateSafeLoaderc                    sR   fdd|j D }dd |D }t|  fdd D }|r'td| d S )Nc                    s   g | ]	\}} j | qS  )Zconstructed_objects).0Zkey_node_selfr   N/var/www/html/lang_env/lib/python3.10/site-packages/datasets/utils/metadata.py
<listcomp>   s    zS_NoDuplicateSafeLoader._check_no_duplicates_on_constructed_node.<locals>.<listcomp>c                 S   s"   g | ]}t |trt|n|qS r   )
isinstancelisttupler   keyr   r   r   r      s   " c                    s   g | ]
} | d kr|qS )r   r   r!   )counterr   r   r      s    zGot duplicate yaml keys: )valuer   	TypeError)r   nodekeysZduplicate_keysr   )r#   r   r   (_check_no_duplicates_on_constructed_node   s   z?_NoDuplicateSafeLoader._check_no_duplicates_on_constructed_nodeFc                    s   t  j||d}| | |S )N)deep)superconstruct_mappingr(   )r   r&   r)   mapping	__class__r   r   r+      s   
z(_NoDuplicateSafeLoader.construct_mapping)F)__name__
__module____qualname__r(   r+   __classcell__r   r   r-   r   r      s    r   readme_contentreturnc                 C   s|   t |  }|r7|d dkr7d|dd  v r7|dd  dd }d|d| }|d||d d  fS d d|fS )Nr   z---r   
)r   
splitlinesindexjoin)r3   full_contentZsep_idxZ	yamlblockr   r   r   _split_yaml_from_readme%   s    r:   z.Use `huggingface_hub.DatasetCardData` instead.c                   @   sz   e Zd ZdhZedeeef dd fddZdefddZ	dd	e
e defd
dZededd fddZdefddZdS )DatasetMetadataZtrain_eval_indexpathr4   c                 C   sT   t |dd}t| \}}W d   n1 sw   Y  |dur'| |S |  S )aS  Loads and validates the dataset metadata from its dataset card (README.md)

        Args:
            path (:obj:`Path`): Path to the dataset card (its README.md file)

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset's metadata is invalid
        utf-8encodingN)openr:   readfrom_yaml_string)clsr<   readme_fileZyaml_stringr   r   r   r   from_readme4   s   
zDatasetMetadata.from_readmec                 C   s   |  rt|dd}| }W d    n1 sw   Y  nd }| |}t|ddd}|| W d    d S 1 s>w   Y  d S )Nr=   r>   w)existsr@   rA   
_to_readmewrite)r   r<   rD   r3   Zupdated_readme_contentr   r   r   	to_readmeH   s   

"zDatasetMetadata.to_readmeNr3   c                 C   s@   |d urt |\}}d|   d | }|S d|   d }|S )Nz---
)r:   to_yaml_string)r   r3   r   contentr9   r   r   r   rH   R   s   zDatasetMetadata._to_readmestringc                    s6   t j|tdpi } fdd| D } di |S )a'  Loads and validates the dataset metadata from a YAML string

        Args:
            string (:obj:`str`): The YAML string

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset's metadata is invalid
        )Loaderc                    s4   i | ]\}}| d d jv r| d dn||qS )-r   )replace_FIELDS_WITH_DASHESr   r"   r$   rC   r   r   
<dictcomp>j   s    "z4DatasetMetadata.from_yaml_string.<locals>.<dictcomp>Nr   )yamlloadr   items)rC   rM   metadata_dictr   rS   r   rB   Z   s
   
z DatasetMetadata.from_yaml_stringc                    s*   t j fdd  D dddddS )Nc                    s,   i | ]\}}| j v r|d dn||qS )r   rO   )rQ   rP   rR   r   r   r   rT   r   s    z2DatasetMetadata.to_yaml_string.<locals>.<dictcomp>FTr=   )	sort_keysZallow_unicoder?   )rU   Z	safe_dumprW   decoder   r   r   r   rK   p   s   
zDatasetMetadata.to_yaml_string)N)r/   r0   r1   rQ   classmethodr   r   strrE   rJ   r
   rH   rB   rK   r   r   r   r   r;   /   s    
r;   c                	   @   s   e Zd ZU dZeZee ed< e	de
fddZededeeeef  ded	d fd
dZeded	d fddZded	dfddZd	ee fddZdS )MetadataConfigsz5Should be in format {config_name: {**config_params}}.
FIELD_NAMEmetadata_configc                 C   s   |  d}|d urStd| d}t|ttfst|t|trU|D ]2}t|ttfrNt|trRt|dkrNd|v rNt	
t|d rNt| dttfsRt|q$d S d S d S )N
data_filesz
                Expected data_files in YAML to be either a string or a list of strings
                or a list of dicts with two keys: 'split' and 'path', but got a  
                Examples of data_files in YAML:

                   data_files: data.csv

                   data_files: data/*.png

                   data_files:
                    - part0/*
                    - part1/*

                   data_files:
                    - split: train
                      path: train/*
                    - split: test
                      path: test/*

                   data_files:
                    - split: train
                      path:
                      - train/part1/*
                      - train/part2/*
                    - split: test
                      path: test/*

                PS: some symbols like dashes '-' are not allowed in split names
                r   splitr<   )gettextwrapdedentr   r   r\   
ValueErrordictlenrematchr   )r_   Zyaml_data_filesZyaml_error_messageZyaml_data_files_itemr   r   r   $_raise_if_data_files_field_not_valid   s4   

"z4MetadataConfigs._raise_if_data_files_field_not_validrevisionexported_parquet_filesdataset_infosr4   c                    s@    fddt |tdD  rfdd  D | S )Nc              	      sH   i | ] \}}|fd dt |tdD t |t jpddqS )c                    s(   g | ]\}}| fd d|D dqS )c                    s   g | ]
}|d   d qS )urlzrefs%2Fconvert%2Fparquet)rP   )r   Zparquet_filerk   r   r   r      s    zhMetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>.<listcomp>.<listcomp>)ra   r<   r   )r   
split_nameZparquet_files_for_splitro   r   r   r      s    
]MetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>.<listcomp>ra   z0.0.0r`   version)r   r   r\   rb   r   rs   )r   config_nameZparquet_files_for_config)rm   rk   r   r   rT      s    

zRMetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>configc                    s6   i | ]\ }  fd d|j D   d dqS )c                    s.   g | ]}  d  D ]
}|d |kr
|q
qS )r`   ra   r   )r   rp   Z	data_file)rt   metadata_configsr   r   r      s    
rq   rs   rr   )Zsplits)r   Zdataset_info)rv   rt   r   rT      s    

)r   r   rW   )rC   rk   rl   rm   r   )rm   rv   rk   r   ._from_exported_parquet_files_and_dataset_infos   s   

z>MetadataConfigs._from_exported_parquet_files_and_dataset_infosdataset_card_datac                 C   s|   | | jr;|| j }t|tstd| j d| d|D ]}d|vr,td| d| | q| dd |D S |  S )	Nz	Expected z to be a list, but got ''rt   zUEach config must include `config_name` field with a string name of a config, but got z. c                 S   s$   i | ]}|d  dd |  D qS )rt   c                 S   s   i | ]\}}|d kr||qS rw   r   )r   paramr$   r   r   r   rT      s    zEMetadataConfigs.from_dataset_card_data.<locals>.<dictcomp>.<dictcomp>)rW   )r   ru   r   r   r   rT      s    z:MetadataConfigs.from_dataset_card_data.<locals>.<dictcomp>)rb   r^   r   r   re   rj   )rC   ry   rv   r_   r   r   r   from_dataset_card_data   s$   

z&MetadataConfigs.from_dataset_card_dataNc                 C   s|   | r<|   D ]}| | q| |}tti ||  }| D ]
\}}|dd  q#dd | D || j< d S d S )Nrt   c                 S   s   g | ]
\}}d |i|qS rw   r   )r   rt   config_metadatar   r   r   r      s    
z8MetadataConfigs.to_dataset_card_data.<locals>.<listcomp>)valuesrj   r|   rf   sortedrW   popr^   )r   ry   r_   Zcurrent_metadata_configsZtotal_metadata_configsrt   r}   r   r   r   to_dataset_card_data   s   
z$MetadataConfigs.to_dataset_card_datac                 C   s\   d }|   D ]%\}}t| dks|dks|dr+|d u r |}qtd| d| dq|S )Nr   defaultz&Dataset has several default configs: 'z' and 'z'.)rW   rg   rb   re   )r   Zdefault_config_namert   r_   r   r   r   get_default_config_name   s   z'MetadataConfigs.get_default_config_name)r/   r0   r1   __doc__r   r^   r   r\   __annotations__staticmethodrf   rj   r[   r	   r   r   r   rx   r   r|   r   r
   r   r   r   r   r   r]   |   s&   
 2&r]   zimage-classificationtranslationzimage-segmentationz	fill-maskzautomatic-speech-recognitionztoken-classificationzsentence-similarityzaudio-classificationzquestion-answeringZsummarizationzzero-shot-classificationztable-to-textzfeature-extractionotherzmultiple-choiceztext-classificationztext-to-imageztext2text-generationzzero-shot-image-classificationztabular-classificationztabular-regressionzimage-to-imageztabular-to-textzunconditional-image-generationztext-retrievalztext-to-speechzobject-detectionzaudio-to-audioztext-generationZconversationalztable-question-answeringzvisual-question-answeringzimage-to-textzreinforcement-learning)zvoice-activity-detectionztime-series-forecastingzdocument-question-answering__main__)ArgumentParserz5Validate the yaml metadata block of a README.md file.)usagereadme_filepath)5rh   rc   collectionsr   	itertoolsr   operatorr   pathlibr   typingr   r   r   r	   r
   r   r   rU   Zhuggingface_hubr   ru   r   infor   r   Znamingr   Zutils.loggingr   Zdeprecation_utilsr   r/   loggerZ
SafeLoaderr   r\   r:   rf   r;   r]   Zknown_task_idsargparser   Zapadd_argument
parse_argsargsr   rE   Zdataset_metadataprintrJ   r   r   r   r   <module>   s    $
 L 	
 !"#
)



