o
    TZhh                     @   s~  d Z ddlZddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZmZmZ ddlZddlmZmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ e!e%Z&eG dd dZ'eG dd dZ(G dd de)Z*G dd de)Z+eG dd dZ,eG dd dZ-G dd dee.e-f Z/eG dd dZ0dS )a  DatasetInfo and MetricInfo record information we know about a dataset and a metric.

This includes things that we know about the dataset statically, i.e.:
 - description
 - canonical location
 - does it have validation and tests splits
 - size
 - etc.

This also includes the things that can and should be computed once we've
processed the dataset as well:
 - number of examples (in each split)
 - etc.
    N)	dataclass)Path)ClassVarDictListOptionalUnion)DatasetCardDatasetCardData   )config)FeaturesValue)	SplitDict)TaskTemplatetask_template_from_dict)Version)
get_logger)asdictunique_valuesc                   @   &   e Zd ZU dZeed< dZeed< dS )SupervisedKeysData inputoutputN)__name__
__module____qualname__r   str__annotations__r    r    r    D/var/www/html/lang_env/lib/python3.10/site-packages/datasets/info.pyr   8      
 r   c                   @   r   )DownloadChecksumsEntryDatar   keyvalueN)r   r   r   r$   r   r   r%   r    r    r    r!   r#   >   r"   r#   c                   @      e Zd ZdZdS )MissingCachedSizesConfigErrorz;The expected cached sizes of the download file are missing.Nr   r   r   __doc__r    r    r    r!   r'   D       r'   c                   @   r&   )NonMatchingCachedSizesErrorz/The prepared split doesn't have expected sizes.Nr(   r    r    r    r!   r+   H   r*   r+   c                   @   sL   e Zd ZU dZee ed< dZee ed< dd Z	e
dedd fdd	ZdS )
PostProcessedInfoNfeaturesresources_checksumsc                 C   s0   | j d urt| j tst| j | _ d S d S d S N)r-   
isinstancer   	from_dictselfr    r    r!   __post_init__Q   s   zPostProcessedInfo.__post_init__post_processed_info_dictreturnc                    4   dd t | D  | di  fdd| D S )Nc                 S      h | ]}|j qS r    name.0fr    r    r!   	<setcomp>X       z.PostProcessedInfo.from_dict.<locals>.<setcomp>c                       i | ]\}}| v r||qS r    r    r<   kvfield_namesr    r!   
<dictcomp>Y       z/PostProcessedInfo.from_dict.<locals>.<dictcomp>r    dataclassesfieldsitems)clsr5   r    rD   r!   r1   V       zPostProcessedInfo.from_dict)r   r   r   r-   r   r   r   r.   dictr4   classmethodr1   r    r    r    r!   r,   L   s   
 r,   c                   @   s  e Zd ZU dZejedZeed< ejedZ	eed< ejedZ
eed< ejedZeed< dZee ed< dZee ed	< dZee ed
< dZeee  ed< dZee ed< dZee ed< dZee ed< dZeeeef  ed< dZee ed< dZee ed< dZee  ed< dZ!ee  ed< dZ"ee  ed< dZ#ee  ed< g dZ$e%ee  ed< dd Z&	d8dee fddZ'd9dd Z(d!d" Z)e*d#ed  fd$d%Z+e*	d:d&edee d'd fd(d)Z,e*d*ed'd fd+d,Z-d;d<d/d0Z.d=d1d2Z/d'efd3d4Z0e*d5ed'd fd6d7Z1dS )>DatasetInfoa
  Information about a dataset.

    `DatasetInfo` documents datasets, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Not all fields are known on construction and may be updated later.

    Attributes:
        description (`str`):
            A description of the dataset.
        citation (`str`):
            A BibTeX citation of the dataset.
        homepage (`str`):
            A URL to the official homepage for the dataset.
        license (`str`):
            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
        features ([`Features`], *optional*):
            The features used to specify the dataset's column types.
        post_processed (`PostProcessedInfo`, *optional*):
            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
        supervised_keys (`SupervisedKeysData`, *optional*):
            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
        builder_name (`str`, *optional*):
            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name. It is also the snake_case version of the dataset builder class name.
        config_name (`str`, *optional*):
            The name of the configuration derived from [`BuilderConfig`].
        version (`str` or [`Version`], *optional*):
            The version of the dataset.
        splits (`dict`, *optional*):
            The mapping between split name and metadata.
        download_checksums (`dict`, *optional*):
            The mapping between the URL to download the dataset's checksums and corresponding metadata.
        download_size (`int`, *optional*):
            The size of the files to download to generate the dataset, in bytes.
        post_processing_size (`int`, *optional*):
            Size of the dataset in bytes after post-processing, if any.
        dataset_size (`int`, *optional*):
            The combined size in bytes of the Arrow tables for all splits.
        size_in_bytes (`int`, *optional*):
            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
        task_templates (`List[TaskTemplate]`, *optional*):
            The task templates to prepare the dataset for during training and evaluation. Each template casts the dataset's [`Features`] to standardized column names and types as detailed in `datasets.tasks`.
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
    default_factorydescriptioncitationhomepagelicenseNr-   post_processedsupervised_keystask_templatesbuilder_namedataset_nameconfig_nameversionsplitsdownload_checksumsdownload_sizepost_processing_sizedataset_sizesize_in_bytes)r\   r`   rb   r-   r^   _INCLUDED_INFO_IN_YAMLc                    s   j d urt j tst j  _  jd ur$t jts$t j _ jd urCt jtsCt jtr<t j _nt j _ j	d urUt j	t
sUt
 j	 _	 jd urxt jtsxt jttfrot j  _n	tdi  j _ jd urt jttfrdd  jD }dd |D  _nt jtr jg _nt j}|d ur|gng  _ jd urt j _ j d urЇ fdd jD  _d S d S d S )Nc                 S   s"   g | ]}t |tr|nt|qS r    )r0   r   r   r<   templater    r    r!   
<listcomp>   s    z-DatasetInfo.__post_init__.<locals>.<listcomp>c                 S   s   g | ]}|d ur|qS r/   r    re   r    r    r!   rg      s    c                    s   g | ]}|  jqS r    )Zalign_with_featuresr-   re   r2   r    r!   rg      s    r    )r-   r0   r   r1   rW   r,   r]   r   r   r^   r   Zfrom_split_dictrX   r   tuplelistrY   r   r   )r3   Z	templatesrf   r    r2   r!   r4      s@   




zDatasetInfo.__post_init__F
deprecatedstorage_optionsc                 C   s   |dkrt dt |j}tj||d\}}}|t|t	j
d}| j||d W d   n1 s4w   Y  | jr_|t|t	jd}| | W d   dS 1 sXw   Y  dS dS )ah  Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

        Args:
            dataset_info_dir (`str`):
                Destination directory.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the JSON will be pretty-printed with the indent level of 4.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem used to download the files from.

                <Deprecated version="2.9.0">

                `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

                </Deprecated>

            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.info.write_to_directory("/path/to/directory/")
        ```
        rj   'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.
You can remove this warning by passing 'storage_options=fs.storage_options' instead.rk   wb)pretty_printN)warningswarnFutureWarningrk   fsspecget_fs_token_pathsopen	posixpathjoinr   DATASET_INFO_FILENAME
_dump_inforV   LICENSE_FILENAME_dump_license)r3   dataset_info_dirro   fsrk   _r=   r    r    r!   write_to_directory   s   !"zDatasetInfo.write_to_directoryc                 C   s*   | tjt| |rdnddd dS )zQDump info in `file` file-like object open in bytes mode (to support remote files)   Nindentutf-8)writejsondumpsr   encode)r3   filero   r    r    r!   ry     s   *zDatasetInfo._dump_infoc                 C   s   | | jd dS )zTDump license in `file` file-like object open in bytes mode (to support remote files)r   N)r   rV   r   )r3   r   r    r    r!   r{   	  s   zDatasetInfo._dump_licensedataset_infosc           
   	      s(  dd  D  t  dkrt fdd D r d S dtdd  D  }dtdd  D  }dtd	d  D  }dtd
d  D  }d }d }d }dd  D }	t |	dkrwtt|	d j|	dd   }nt |	rtt|	d }|r|nd }| |||||||dS )Nc                 S   s   g | ]
}|d ur|  qS r/   )copyr<   	dset_infor    r    r!   rg         z*DatasetInfo.from_merge.<locals>.<listcomp>r   c                 3   s    | ]	} d  |kV  qdS )r   Nr    r   r   r    r!   	<genexpr>  s    z)DatasetInfo.from_merge.<locals>.<genexpr>z

c                 s       | ]}|j V  qd S r/   )rS   r<   infor    r    r!   r         c                 s   r   r/   )rT   r   r    r    r!   r     r   c                 s   r   r/   )rU   r   r    r    r!   r     r   c                 s   r   r/   )rV   r   r    r    r!   r     r   c                 S   s   g | ]
}|j d ur|j qS r/   )rY   r   r    r    r!   rg     r   r   )rS   rT   rU   rV   r-   rX   rY   )lenallrw   r   stripri   setintersection)
rL   r   rS   rT   rU   rV   r-   rX   rY   Zall_task_templatesr    r   r!   
from_merge  s2   " zDatasetInfo.from_merger|   r6   c                 C   s   |dkrt dt |j}tj||d\}}}td|  |s%td|j	t
|tjddd}t|}W d	   n1 sBw   Y  | |S )
a  Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields (num_examples,
        hash, time of creation,...) of the [`DatasetInfo`].

        This will overwrite all previous metadata.

        Args:
            dataset_info_dir (`str`):
                The directory containing the metadata file. This
                should be the root directory of a specific dataset version.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem used to download the files from.

                <Deprecated version="2.9.0">

                `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

                </Deprecated>

            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import DatasetInfo
        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
        ```
        rj   rl   rm   zLoading Dataset info from zECalling DatasetInfo.from_directory() with undefined dataset_info_dir.rr   encodingN)rp   rq   rr   rk   rs   rt   loggerr   
ValueErrorru   rv   rw   r   rx   r   loadr1   )rL   r|   r}   rk   r~   r=   dataset_info_dictr    r    r!   from_directory0  s   %
zDatasetInfo.from_directoryr   c                    r7   )Nc                 S   r8   r    r9   r;   r    r    r!   r>   h  r?   z(DatasetInfo.from_dict.<locals>.<setcomp>c                    r@   r    r    rA   rD   r    r!   rF   i  rG   z)DatasetInfo.from_dict.<locals>.<dictcomp>r    rH   )rL   r   r    rD   r!   r1   f  rM   zDatasetInfo.from_dictTother_dataset_infoc                    s.   | j }|jdi  fdd|j  D  d S )Nc                    s(   i | ]\}}|d us s|t |qS r/   r   deepcopyrA   ignore_noner    r!   rF   n  s
    
z&DatasetInfo.update.<locals>.<dictcomp>r    )__dict__updaterK   )r3   r   r   Z	self_dictr    r   r!   r   k  s   

zDatasetInfo.updatec                 C   s    | j di dd | j D S )Nc                 S      i | ]
\}}|t |qS r    r   rA   r    r    r!   rF   v  r   z$DatasetInfo.copy.<locals>.<dictcomp>r    )	__class__r   rK   r2   r    r    r!   r   u  s    zDatasetInfo.copyc                 C   sf   i }t | }|D ](}|| jv r0t| |}t|dr | ||< qt|dr,| ||< q|||< q|S )N_to_yaml_list_to_yaml_string)r   rd   getattrhasattrr   r   )r3   Z	yaml_dictr   r$   r%   r    r    r!   _to_yaml_dictx  s   



zDatasetInfo._to_yaml_dict	yaml_datac                    s~   t |}|dd urt|d |d< |dd ur%t|d |d< dd t| D  | di  fdd| D S )Nr-   r^   c                 S   r8   r    r9   r;   r    r    r!   r>     r?   z.DatasetInfo._from_yaml_dict.<locals>.<setcomp>c                    r@   r    r    rA   rD   r    r!   rF     rG   z/DatasetInfo._from_yaml_dict.<locals>.<dictcomp>r    )	r   r   getr   Z_from_yaml_listr   rI   rJ   rK   )rL   r   r    rD   r!   _from_yaml_dict  s   
 zDatasetInfo._from_yaml_dict)Frj   NF)rj   N)T)r   rP   )r6   rP   )2r   r   r   r)   rI   fieldr   rS   r   rT   rU   rV   r-   r   r   rW   r,   rX   r   rY   r   r   rZ   r[   r\   r]   r   r   r^   rN   r_   r`   intra   rb   rc   rd   r   r4   r   ry   r{   rO   r   r   r1   r   r   r   r   r    r    r    r!   rP   \   s\   
 /*

1"5

rP   c                   @   sN   e Zd ZddddZedddZededd fd	d
ZdeddfddZdS )DatasetInfosDictFr6   Nc                 C   s  i }t j|tj}t j|tj}|s| |}||  t j|rPt	|ddd}dd |
 D }tj|||r=dnd d W d    n1 sKw   Y  t j|r_t|}	|	j}
nd }	t }
|r||
 |	d u rytdt|
 d	 n|	}	|	t| d S d S )
Nwr   r   c                 S   s   i | ]	\}}|t |qS r    )r   r<   r\   r   r    r    r!   rF     s    z7DatasetInfosDict.write_to_directory.<locals>.<dictcomp>r   r   z---
z
---
)ospathrw   r   DATASETDICT_INFOS_FILENAMEREPOCARD_FILENAMEr   r   existsru   rK   r   dumpr	   r   datar
   to_dataset_card_datar   saver   )r3   dataset_infos_dir	overwritero   total_dataset_infosZdataset_infos_pathZdataset_readme_pathr=   Zdataset_infos_dictZdataset_carddataset_card_datar    r    r!   r     s0   



z#DatasetInfosDict.write_to_directoryc                 C   s   t d|  tjtj|tjr(t	t
|tj j}d|v r(| |S tjtj|tjr`ttj|tjdd}| dd t	| D W  d    S 1 sYw   Y  d S |  S )NzLoading Dataset Infos from dataset_infor   r   c                 S   r   r    )rP   r1   )r<   r\   r   r    r    r!   rF     s    
z3DatasetInfosDict.from_directory.<locals>.<dictcomp>)r   r   r   r   r   rw   r   r   r	   r   r   r   from_dataset_card_datar   ru   r   rK   )rL   r   r   r=   r    r    r!   r     s   
$zDatasetInfosDict.from_directoryr   c                 C   sl   t |dttfr3t |d tr| dd |d D S t|d }|d dd|_| |j|iS |  S )Nr   c                 S   s    i | ]}| d dt|qS )r\   default)r   rP   r   )r<   dataset_info_yaml_dictr    r    r!   rF     s    z;DatasetInfosDict.from_dataset_card_data.<locals>.<dictcomp>r\   r   )r0   r   ri   rN   rP   r   r\   )rL   r   r   r    r    r!   r     s   	z'DatasetInfosDict.from_dataset_card_datac                 C   s6  | rd|v rt |d tr|d dd|d i}nd|v r/t |d tr/dd |d D }ni }i |dd |  D }| D ]\}}||d< qBt|dkrutt| |d< |d 	dd }|dkrsd|i|d |d< d S d S g |d< t
| D ]\}}|	dd  d|i|}|d | qd S d S )Nr   r\   r   c                 S   s   i | ]}|d  |qS )r\   r    )r<   Zconfig_metadatar    r    r!   rF     s    z9DatasetInfosDict.to_dataset_card_data.<locals>.<dictcomp>c                 S   s   i | ]	\}}||  qS r    )r   r   r    r    r!   rF     s    r   )r0   rN   r   ri   rK   r   nextitervaluespopsortedappend)r3   r   Zdataset_metadata_infosr   r\   Zdset_info_yaml_dictr   r    r    r!   r     s@   
!z%DatasetInfosDict.to_dataset_card_data)FF)r6   N)r6   r   )	r   r   r   r   rO   r   r
   r   r   r    r    r    r!   r     s    r   c                   @   s  e Zd ZU dZeed< eed< eed< ejedZ	eed< ejedZ
eed< ejedZeed< ejedZee ed	< ejedZee ed
< dZeed< dZee ed< dZee ed< dZee ed< dZee ed< dd ZdddZedddZededd fddZdS )
MetricInfoa  Information about a metric.

    `MetricInfo` documents a metric, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Note: Not all fields are known on construction and may be updated later.
    rS   rT   r-   rQ   inputs_descriptionrU   rV   codebase_urlsreference_urlsF
streamableNformatmetric_namer\   experiment_idc                 C   sH   | j d ur | j D ]\}}t|tstd| d|jj q
d S d S )NzSWhen using 'numpy' format, all features should be a `datasets.Value` feature. Here z is an instance of )r   r-   rK   r0   r   r   r   r   )r3   r$   r%   r    r    r!   r4     s   

zMetricInfo.__post_init__c                 C   s   t tj|tjddd}tjt| ||rdndd W d   n1 s&w   Y  | j	rTt tj|tj
ddd}|| j	 W d   dS 1 sMw   Y  dS dS )a  Write `MetricInfo` as JSON to `metric_info_dir`.
        Also save the license separately in LICENCE.
        If `pretty_print` is True, the JSON will be pretty-printed with the indent level of 4.

        Example:

        ```py
        >>> from datasets import load_metric
        >>> metric = load_metric("accuracy")
        >>> metric.info.write_to_directory("/path/to/directory/")
        ```
        r   r   r   r   Nr   )ru   r   r   rw   r   METRIC_INFO_FILENAMEr   r   r   rV   rz   r   )r3   metric_info_dirro   r=   r    r    r!   r   "  s   "zMetricInfo.write_to_directoryr6   c                 C   sh   t d|  |stdttj|tjdd}t	
|}W d   n1 s*w   Y  | |S )a  Create MetricInfo from the JSON file in `metric_info_dir`.

        Args:
            metric_info_dir: `str` The directory containing the metadata file. This
                should be the root directory of a specific dataset version.

        Example:

        ```py
        >>> from datasets import MetricInfo
        >>> metric_info = MetricInfo.from_directory("/path/to/directory/")
        ```
        zLoading Metric info from zCCalling MetricInfo.from_directory() with undefined metric_info_dir.r   r   N)r   r   r   ru   r   r   rw   r   r   r   r   r1   )rL   r   r=   metric_info_dictr    r    r!   r   6  s   
zMetricInfo.from_directoryr   c                    r7   )Nc                 S   r8   r    r9   r;   r    r    r!   r>   O  r?   z'MetricInfo.from_dict.<locals>.<setcomp>c                    r@   r    r    rA   rD   r    r!   rF   P  rG   z(MetricInfo.from_dict.<locals>.<dictcomp>r    rH   )rL   r   r    rD   r!   r1   M  rM   zMetricInfo.from_dictr   )r6   r   )r   r   r   r)   r   r   r   rI   r   r   rU   rV   ri   r   r   r   r   boolr   r   r   r\   r   r4   r   rO   r   rN   r1   r    r    r    r!   r     s*   
 	
	r   )1r)   r   rI   r   r   rv   rp   r   pathlibr   typingr   r   r   r   r   rs   Zhuggingface_hubr	   r
   r   r   r-   r   r   r^   r   tasksr   r   utilsr   Zutils.loggingr   Zutils.py_utilsr   r   r   r   r   r#   	Exceptionr'   r+   r,   rP   r   r   r   r    r    r    r!   <module>   sF     6m