o
    +if0                     @   sh  d dl Z d dlZd dlmZ d dlmZ d dlmZ d dlm	Z	 d dl
mZmZmZmZmZmZmZ d dlZd dlmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZ ddlmZ ee Z!G dd dej"Z#de$deee$ e$f fddZ%edG dd de&Z'G dd dee$ee$ef f Z(i dg dg dg dg dg dg d g d!g d"g d#g d$g d%g d&g d'g d(g d)g d*g i d+g d,g d-g d.g d/g d0g d1g d2g d3g d4g d5g d6g d7g d8g d9g d:g d;g g g g d<Z)e d=kr2d d>l*m+Z+ e+d?d@Z,e,-dA e,. Z/e	e/j0Z0e'1e0Z2e3e2 e24e0 dS dS )B    N)Counter)groupby)
itemgetter)Path)AnyClassVarDictListOptionalTupleUnion)DatasetCardData   )METADATA_CONFIGS_FIELD)DatasetInfoDatasetInfosDict)	_split_re)
get_logger   )
deprecatedc                       s&   e Zd Zdd Zd fdd	Z  ZS )_NoDuplicateSafeLoaderc                    sR   fdd|j D }dd |D }t|  fdd D }|r'td| d S )Nc                    s   g | ]	\}} j | qS  )constructed_objects).0key_node_selfr   P/var/www/html/corbot_env/lib/python3.10/site-packages/datasets/utils/metadata.py
<listcomp>   s    zS_NoDuplicateSafeLoader._check_no_duplicates_on_constructed_node.<locals>.<listcomp>c                 S   s"   g | ]}t |trt|n|qS r   )
isinstancelisttupler   keyr   r   r   r      s   " c                    s   g | ]
} | d kr|qS )r   r   r#   )counterr   r   r      s    zGot duplicate yaml keys: )valuer   	TypeError)r   nodekeysduplicate_keysr   )r%   r   r   (_check_no_duplicates_on_constructed_node   s   z?_NoDuplicateSafeLoader._check_no_duplicates_on_constructed_nodeFc                    s   t  j||d}| | |S )N)deep)superconstruct_mappingr+   )r   r(   r,   mapping	__class__r   r   r.      s   
z(_NoDuplicateSafeLoader.construct_mapping)F)__name__
__module____qualname__r+   r.   __classcell__r   r   r0   r   r      s    r   readme_contentreturnc                 C   s|   t |  }|r7|d dkr7d|dd  v r7|dd  dd }d|d| }|d||d d  fS d d|fS )Nr   z---r   
)r!   
splitlinesindexjoin)r6   full_contentsep_idx	yamlblockr   r   r   _split_yaml_from_readme%   s    r?   z.Use `huggingface_hub.DatasetCardData` instead.c                   @   sz   e Zd ZdhZedeeef dd fddZdefddZ	dd	e
e defd
dZededd fddZdefddZdS )DatasetMetadatatrain_eval_indexpathr7   c                 C   sT   t |dd}t| \}}W d   n1 sw   Y  |dur'| |S |  S )aS  Loads and validates the dataset metadata from its dataset card (README.md)

        Args:
            path (:obj:`Path`): Path to the dataset card (its README.md file)

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset's metadata is invalid
        utf-8encodingN)openr?   readfrom_yaml_string)clsrB   readme_fileyaml_stringr   r   r   r   from_readme4   s   
zDatasetMetadata.from_readmec                 C   s   |  rt|dd}| }W d    n1 sw   Y  nd }| |}t|ddd}|| W d    d S 1 s>w   Y  d S )NrC   rD   w)existsrF   rG   
_to_readmewrite)r   rB   rJ   r6   updated_readme_contentr   r   r   	to_readmeH   s   

"zDatasetMetadata.to_readmeNr6   c                 C   s@   |d urt |\}}d|   d | }|S d|   d }|S )Nz---
)r?   to_yaml_string)r   r6   r   contentr<   r   r   r   rO   R   s   zDatasetMetadata._to_readmestringc                    s6   t j|tdpi } fdd| D } di |S )a'  Loads and validates the dataset metadata from a YAML string

        Args:
            string (:obj:`str`): The YAML string

        Returns:
            :class:`DatasetMetadata`: The dataset's metadata

        Raises:
            :obj:`TypeError`: If the dataset's metadata is invalid
        )Loaderc                    s4   i | ]\}}| d d jv r| d dn||qS )-r   )replace_FIELDS_WITH_DASHESr   r$   r&   rI   r   r   
<dictcomp>j   s    "z4DatasetMetadata.from_yaml_string.<locals>.<dictcomp>Nr   )yamlloadr   items)rI   rU   metadata_dictr   r[   r   rH   Z   s
   
z DatasetMetadata.from_yaml_stringc                    s*   t j fdd  D dddddS )Nc                    s,   i | ]\}}| j v r|d dn||qS )r   rW   )rY   rX   rZ   r   r   r   r\   r   s    z2DatasetMetadata.to_yaml_string.<locals>.<dictcomp>FTrC   )	sort_keysallow_unicoderE   )r]   	safe_dumpr_   decoder   r   r   r   rS   p   s   
zDatasetMetadata.to_yaml_string)N)r2   r3   r4   rY   classmethodr   r   strrL   rR   r
   rO   rH   rS   r   r   r   r   r@   /   s    
r@   c                	   @   s   e Zd ZU dZeZee ed< e	de
fddZededeeeef  ded	d fd
dZeded	d fddZded	dfddZd	ee fddZdS )MetadataConfigsz5Should be in format {config_name: {**config_params}}.
FIELD_NAMEmetadata_configc                 C   s   |  d}|d urStd| d}t|ttfst|t|trU|D ]2}t|ttfrNt|trRt|dkrNd|v rNt	
t|d rNt| dttfsRt|q$d S d S d S )N
data_filesz
                Expected data_files in YAML to be either a string or a list of strings
                or a list of dicts with two keys: 'split' and 'path', but got a  
                Examples of data_files in YAML:

                   data_files: data.csv

                   data_files: data/*.png

                   data_files:
                    - part0/*
                    - part1/*

                   data_files:
                    - split: train
                      path: train/*
                    - split: test
                      path: test/*

                   data_files:
                    - split: train
                      path:
                      - train/part1/*
                      - train/part2/*
                    - split: test
                      path: test/*

                PS: some symbols like dashes '-' are not allowed in split names
                r   splitrB   )gettextwrapdedentr    r!   rf   
ValueErrordictlenrematchr   )ri   yaml_data_filesyaml_error_messageyaml_data_files_itemr   r   r   $_raise_if_data_files_field_not_valid   s4   

"z4MetadataConfigs._raise_if_data_files_field_not_validrevisionexported_parquet_filesdataset_infosr7   c                    s@    fddt |tdD  rfdd  D | S )Nc              	      sH   i | ] \}}|fd dt |tdD t |t jpddqS )c                    s(   g | ]\}}| fd d|D dqS )c                    s   g | ]
}|d   d qS )urlzrefs%2Fconvert%2Fparquet)rX   )r   parquet_filerx   r   r   r      s    zhMetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>.<listcomp>.<listcomp>)rk   rB   r   )r   
split_nameparquet_files_for_splitr}   r   r   r      s    
]MetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>.<listcomp>rk   z0.0.0rj   version)r   r   rf   rl   r   r   )r   config_nameparquet_files_for_config)rz   rx   r   r   r\      s    

zRMetadataConfigs._from_exported_parquet_files_and_dataset_infos.<locals>.<dictcomp>configc                    s6   i | ]\ }  fd d|j D   d dqS )c                    s.   g | ]}  d  D ]
}|d |kr
|q
qS )rj   rk   r   )r   r~   	data_file)r   metadata_configsr   r   r      s    
r   r   r   )splits)r   dataset_info)r   r   r   r\      s    

)r   r   r_   )rI   rx   ry   rz   r   )rz   r   rx   r   ._from_exported_parquet_files_and_dataset_infos   s   

z>MetadataConfigs._from_exported_parquet_files_and_dataset_infosdataset_card_datac                 C   s|   | | jr;|| j }t|tstd| j d| d|D ]}d|vr,td| d| | q| dd |D S |  S )	Nz	Expected z to be a list, but got ''r   zUEach config must include `config_name` field with a string name of a config, but got z. c                 S   s$   i | ]}|d  dd |  D qS )r   c                 S   s   i | ]\}}|d kr||qS r   r   )r   paramr&   r   r   r   r\      s    zEMetadataConfigs.from_dataset_card_data.<locals>.<dictcomp>.<dictcomp>)r_   )r   r   r   r   r   r\      s    z:MetadataConfigs.from_dataset_card_data.<locals>.<dictcomp>)rl   rh   r    r!   ro   rw   )rI   r   r   ri   r   r   r   from_dataset_card_data   s$   

z&MetadataConfigs.from_dataset_card_dataNc                 C   s|   | r<|   D ]}| | q| |}tti ||  }| D ]
\}}|dd  q#dd | D || j< d S d S )Nr   c                 S   s   g | ]
\}}d |i|qS r   r   )r   r   config_metadatar   r   r   r      s    
z8MetadataConfigs.to_dataset_card_data.<locals>.<listcomp>)valuesrw   r   rp   sortedr_   poprh   )r   r   ri   current_metadata_configstotal_metadata_configsr   r   r   r   r   to_dataset_card_data   s   
z$MetadataConfigs.to_dataset_card_datac                 C   s\   d }|   D ]%\}}t| dks|dks|dr+|d u r |}qtd| d| dq|S )Nr   defaultz&Dataset has several default configs: 'z' and 'z'.)r_   rq   rl   ro   )r   default_config_namer   ri   r   r   r   get_default_config_name   s   z'MetadataConfigs.get_default_config_name)r2   r3   r4   __doc__r   rh   r   rf   __annotations__staticmethodrp   rw   re   r	   r   r   r   r   r   r   r   r
   r   r   r   r   r   rg   |   s&   
 2&rg   zimage-classificationtranslationzimage-segmentationz	fill-maskzautomatic-speech-recognitionztoken-classificationzsentence-similarityzaudio-classificationzquestion-answeringsummarizationzzero-shot-classificationztable-to-textzfeature-extractionotherzmultiple-choiceztext-classificationztext-to-imageztext2text-generationzzero-shot-image-classificationztabular-classificationztabular-regressionzimage-to-imageztabular-to-textzunconditional-image-generationztext-retrievalztext-to-speechzobject-detectionzaudio-to-audioztext-generationconversationalztable-question-answeringzvisual-question-answeringzimage-to-textzreinforcement-learning)zvoice-activity-detectionztime-series-forecastingzdocument-question-answering__main__)ArgumentParserz5Validate the yaml metadata block of a README.md file.)usagereadme_filepath)5rr   rm   collectionsr   	itertoolsr   operatorr   pathlibr   typingr   r   r   r	   r
   r   r   r]   huggingface_hubr   r   r   infor   r   namingr   utils.loggingr   deprecation_utilsr   r2   logger
SafeLoaderr   rf   r?   rp   r@   rg   known_task_idsargparser   apadd_argument
parse_argsargsr   rL   dataset_metadataprintrR   r   r   r   r   <module>   s    $
 L 	
 !"#
)



