o
    +ifh                     @   s  d Z ddlZddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZmZmZmZmZ ddlZddlmZ ddlmZmZ dd	lmZ dd
lmZmZ ddlmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z& e#e'Z(eG dd dZ)eG dd dZ*G dd de+Z,G dd de+Z-eG dd dZ.eG dd dZ/G dd dee0e/f Z1eG dd dZ2dS ) a  DatasetInfo and MetricInfo record information we know about a dataset and a metric.

This includes things that we know about the dataset statically, i.e.:
 - description
 - canonical location
 - does it have validation and tests splits
 - size
 - etc.

This also includes the things that can and should be computed once we've
processed the dataset as well:
 - number of examples (in each split)
 - etc.
    N)	dataclass)Path)ClassVarDictListOptionalUnion)	url_to_fs)DatasetCardDatasetCardData   )config)FeaturesValue)	SplitDict)TaskTemplatetask_template_from_dict)Version)
get_logger)asdictunique_valuesc                   @   &   e Zd ZU dZeed< dZeed< dS )SupervisedKeysData inputoutputN)__name__
__module____qualname__r   str__annotations__r    r!   r!   F/var/www/html/corbot_env/lib/python3.10/site-packages/datasets/info.pyr   9      
 r   c                   @   r   )DownloadChecksumsEntryDatar   keyvalueN)r   r   r   r%   r   r    r&   r!   r!   r!   r"   r$   ?   r#   r$   c                   @      e Zd ZdZdS )MissingCachedSizesConfigErrorz;The expected cached sizes of the download file are missing.Nr   r   r   __doc__r!   r!   r!   r"   r(   E       r(   c                   @   r'   )NonMatchingCachedSizesErrorz/The prepared split doesn't have expected sizes.Nr)   r!   r!   r!   r"   r,   I   r+   r,   c                   @   sL   e Zd ZU dZee ed< dZee ed< dd Z	e
dedd fdd	ZdS )
PostProcessedInfoNfeaturesresources_checksumsc                 C   s0   | j d urt| j tst| j | _ d S d S d S N)r.   
isinstancer   	from_dictselfr!   r!   r"   __post_init__R   s   zPostProcessedInfo.__post_init__post_processed_info_dictreturnc                    4   dd t | D  | di  fdd| D S )Nc                 S      h | ]}|j qS r!   name.0fr!   r!   r"   	<setcomp>Y       z.PostProcessedInfo.from_dict.<locals>.<setcomp>c                       i | ]\}}| v r||qS r!   r!   r=   kvfield_namesr!   r"   
<dictcomp>Z       z/PostProcessedInfo.from_dict.<locals>.<dictcomp>r!   dataclassesfieldsitems)clsr6   r!   rE   r"   r2   W       zPostProcessedInfo.from_dict)r   r   r   r.   r   r   r    r/   dictr5   classmethodr2   r!   r!   r!   r"   r-   M   s   
 r-   c                   @   s  e Zd ZU dZejedZeed< ejedZ	eed< ejedZ
eed< ejedZeed< dZee ed< dZee ed	< dZee ed
< dZeee  ed< dZee ed< dZee ed< dZee ed< dZeeeef  ed< dZee ed< dZee ed< dZee  ed< dZ!ee  ed< dZ"ee  ed< dZ#ee  ed< g dZ$e%ee  ed< dd Z&	d8dee fddZ'd9dd Z(d!d" Z)e*d#ed  fd$d%Z+e*	d:d&edee d'd fd(d)Z,e*d*ed'd fd+d,Z-d;d<d/d0Z.d=d1d2Z/d'efd3d4Z0e*d5ed'd fd6d7Z1dS )>DatasetInfoa
  Information about a dataset.

    `DatasetInfo` documents datasets, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Not all fields are known on construction and may be updated later.

    Attributes:
        description (`str`):
            A description of the dataset.
        citation (`str`):
            A BibTeX citation of the dataset.
        homepage (`str`):
            A URL to the official homepage for the dataset.
        license (`str`):
            The dataset's license. It can be the name of the license or a paragraph containing the terms of the license.
        features ([`Features`], *optional*):
            The features used to specify the dataset's column types.
        post_processed (`PostProcessedInfo`, *optional*):
            Information regarding the resources of a possible post-processing of a dataset. For example, it can contain the information of an index.
        supervised_keys (`SupervisedKeysData`, *optional*):
            Specifies the input feature and the label for supervised learning if applicable for the dataset (legacy from TFDS).
        builder_name (`str`, *optional*):
            The name of the `GeneratorBasedBuilder` subclass used to create the dataset. Usually matched to the corresponding script name. It is also the snake_case version of the dataset builder class name.
        config_name (`str`, *optional*):
            The name of the configuration derived from [`BuilderConfig`].
        version (`str` or [`Version`], *optional*):
            The version of the dataset.
        splits (`dict`, *optional*):
            The mapping between split name and metadata.
        download_checksums (`dict`, *optional*):
            The mapping between the URL to download the dataset's checksums and corresponding metadata.
        download_size (`int`, *optional*):
            The size of the files to download to generate the dataset, in bytes.
        post_processing_size (`int`, *optional*):
            Size of the dataset in bytes after post-processing, if any.
        dataset_size (`int`, *optional*):
            The combined size in bytes of the Arrow tables for all splits.
        size_in_bytes (`int`, *optional*):
            The combined size in bytes of all files associated with the dataset (downloaded files + Arrow files).
        task_templates (`List[TaskTemplate]`, *optional*):
            The task templates to prepare the dataset for during training and evaluation. Each template casts the dataset's [`Features`] to standardized column names and types as detailed in `datasets.tasks`.
        **config_kwargs (additional keyword arguments):
            Keyword arguments to be passed to the [`BuilderConfig`] and used in the [`DatasetBuilder`].
    default_factorydescriptioncitationhomepagelicenseNr.   post_processedsupervised_keystask_templatesbuilder_namedataset_nameconfig_nameversionsplitsdownload_checksumsdownload_sizepost_processing_sizedataset_sizesize_in_bytes)r]   ra   rc   r.   r_   _INCLUDED_INFO_IN_YAMLc                    s   j d urt j tst j  _  jd ur$t jts$t j _ jd urCt jtsCt jtr<t j _nt j _ j	d urUt j	t
sUt
 j	 _	 jd urxt jtsxt jttfrot j  _n	tdi  j _ jd urt jttfrdd  jD }dd |D  _nt jtr jg _nt j}|d ur|gng  _ jd urt j _ j d urЇ fdd jD  _d S d S d S )Nc                 S   s"   g | ]}t |tr|nt|qS r!   )r1   r   r   r=   templater!   r!   r"   
<listcomp>   s    z-DatasetInfo.__post_init__.<locals>.<listcomp>c                 S   s   g | ]}|d ur|qS r0   r!   rf   r!   r!   r"   rh      s    c                    s   g | ]}|  jqS r!   )align_with_featuresr.   rf   r3   r!   r"   rh      s    r!   )r.   r1   r   r2   rX   r-   r^   r   r   r_   r   from_split_dictrY   r   tuplelistrZ   r   r   )r4   	templatesrg   r!   r3   r"   r5      s@   




zDatasetInfo.__post_init__F
deprecatedstorage_optionsc                 C   s   |dkrt dt |j}t|fi |pi ^}}|t|tj	d}| j
||d W d   n1 s6w   Y  | jra|t|tjd}| | W d   dS 1 sZw   Y  dS dS )ah  Write `DatasetInfo` and license (if present) as JSON files to `dataset_info_dir`.

        Args:
            dataset_info_dir (`str`):
                Destination directory.
            pretty_print (`bool`, defaults to `False`):
                If `True`, the JSON will be pretty-printed with the indent level of 4.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem used to download the files from.

                <Deprecated version="2.9.0">

                `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

                </Deprecated>

            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import load_dataset
        >>> ds = load_dataset("rotten_tomatoes", split="validation")
        >>> ds.info.write_to_directory("/path/to/directory/")
        ```
        rn   'fs' was deprecated in favor of 'storage_options' in version 2.9.0 and will be removed in 3.0.0.
You can remove this warning by passing 'storage_options=fs.storage_options' instead.wb)pretty_printN)warningswarnFutureWarningro   r	   open	posixpathjoinr   DATASET_INFO_FILENAME
_dump_inforW   LICENSE_FILENAME_dump_license)r4   dataset_info_dirrr   fsro   _r>   r!   r!   r"   write_to_directory   s   !"zDatasetInfo.write_to_directoryc                 C   s*   | tjt| |rdnddd dS )zQDump info in `file` file-like object open in bytes mode (to support remote files)   Nindentutf-8)writejsondumpsr   encode)r4   filerr   r!   r!   r"   rz     s   *zDatasetInfo._dump_infoc                 C   s   | | jd dS )zTDump license in `file` file-like object open in bytes mode (to support remote files)r   N)r   rW   r   )r4   r   r!   r!   r"   r|   
  s   zDatasetInfo._dump_licensedataset_infosc           
   	      s(  dd  D  t  dkrt fdd D r d S dtdd  D  }dtdd  D  }dtd	d  D  }dtd
d  D  }d }d }d }dd  D }	t |	dkrwtt|	d j|	dd   }nt |	rtt|	d }|r|nd }| |||||||dS )Nc                 S   s   g | ]
}|d ur|  qS r0   )copyr=   	dset_infor!   r!   r"   rh         z*DatasetInfo.from_merge.<locals>.<listcomp>r   c                 3   s    | ]	} d  |kV  qdS )r   Nr!   r   r   r!   r"   	<genexpr>  s    z)DatasetInfo.from_merge.<locals>.<genexpr>z

c                 s       | ]}|j V  qd S r0   )rT   r=   infor!   r!   r"   r         c                 s   r   r0   )rU   r   r!   r!   r"   r     r   c                 s   r   r0   )rV   r   r!   r!   r"   r     r   c                 s   r   r0   )rW   r   r!   r!   r"   r     r   c                 S   s   g | ]
}|j d ur|j qS r0   )rZ   r   r!   r!   r"   rh     r   r   )rT   rU   rV   rW   r.   rY   rZ   )lenallrx   r   striprl   setintersection)
rM   r   rT   rU   rV   rW   r.   rY   rZ   all_task_templatesr!   r   r"   
from_merge  s2   " zDatasetInfo.from_merger}   r7   c                 C   s   |dkrt dt |j}t|fi |pi ^}}td|  |s'td|jt	
|tjddd}t|}W d   n1 sDw   Y  | |S )	a  Create [`DatasetInfo`] from the JSON file in `dataset_info_dir`.

        This function updates all the dynamically generated fields (num_examples,
        hash, time of creation,...) of the [`DatasetInfo`].

        This will overwrite all previous metadata.

        Args:
            dataset_info_dir (`str`):
                The directory containing the metadata file. This
                should be the root directory of a specific dataset version.
            fs (`fsspec.spec.AbstractFileSystem`, *optional*):
                Instance of the remote filesystem used to download the files from.

                <Deprecated version="2.9.0">

                `fs` was deprecated in version 2.9.0 and will be removed in 3.0.0.
                Please use `storage_options` instead, e.g. `storage_options=fs.storage_options`.

                </Deprecated>

            storage_options (`dict`, *optional*):
                Key/value pairs to be passed on to the file-system backend, if any.

                <Added version="2.9.0"/>

        Example:

        ```py
        >>> from datasets import DatasetInfo
        >>> ds_info = DatasetInfo.from_directory("/path/to/directory/")
        ```
        rn   rp   zLoading Dataset info from zECalling DatasetInfo.from_directory() with undefined dataset_info_dir.rr   encodingN)rs   rt   ru   ro   r	   loggerr   
ValueErrorrv   rw   rx   r   ry   r   loadr2   )rM   r}   r~   ro   r   r>   dataset_info_dictr!   r!   r"   from_directory1  s   %
zDatasetInfo.from_directoryr   c                    r8   )Nc                 S   r9   r!   r:   r<   r!   r!   r"   r?   i  r@   z(DatasetInfo.from_dict.<locals>.<setcomp>c                    rA   r!   r!   rB   rE   r!   r"   rG   j  rH   z)DatasetInfo.from_dict.<locals>.<dictcomp>r!   rI   )rM   r   r!   rE   r"   r2   g  rN   zDatasetInfo.from_dictTother_dataset_infoc                    s.   | j }|jdi  fdd|j  D  d S )Nc                    s(   i | ]\}}|d us s|t |qS r0   r   deepcopyrB   ignore_noner!   r"   rG   o  s
    
z&DatasetInfo.update.<locals>.<dictcomp>r!   )__dict__updaterL   )r4   r   r   	self_dictr!   r   r"   r   l  s   

zDatasetInfo.updatec                 C   s    | j di dd | j D S )Nc                 S      i | ]
\}}|t |qS r!   r   rB   r!   r!   r"   rG   w  r   z$DatasetInfo.copy.<locals>.<dictcomp>r!   )	__class__r   rL   r3   r!   r!   r"   r   v  s    zDatasetInfo.copyc                 C   sf   i }t | }|D ](}|| jv r0t| |}t|dr | ||< qt|dr,| ||< q|||< q|S )N_to_yaml_list_to_yaml_string)r   re   getattrhasattrr   r   )r4   	yaml_dictr   r%   r&   r!   r!   r"   _to_yaml_dicty  s   



zDatasetInfo._to_yaml_dict	yaml_datac                    s~   t |}|dd urt|d |d< |dd ur%t|d |d< dd t| D  | di  fdd| D S )Nr.   r_   c                 S   r9   r!   r:   r<   r!   r!   r"   r?     r@   z.DatasetInfo._from_yaml_dict.<locals>.<setcomp>c                    rA   r!   r!   rB   rE   r!   r"   rG     rH   z/DatasetInfo._from_yaml_dict.<locals>.<dictcomp>r!   )	r   r   getr   _from_yaml_listr   rJ   rK   rL   )rM   r   r!   rE   r"   _from_yaml_dict  s   
 zDatasetInfo._from_yaml_dict)Frn   NF)rn   N)T)r   rQ   )r7   rQ   )2r   r   r   r*   rJ   fieldr   rT   r    rU   rV   rW   r.   r   r   rX   r-   rY   r   rZ   r   r   r[   r\   r]   r^   r   r   r_   rO   r`   ra   intrb   rc   rd   re   r   r5   r   rz   r|   rP   r   r   r2   r   r   r   r   r!   r!   r!   r"   rQ   ]   s\   
 /*

1"5

rQ   c                   @   sN   e Zd ZddddZedddZededd fd	d
ZdeddfddZdS )DatasetInfosDictFr7   Nc                 C   s  i }t j|tj}t j|tj}|s| |}||  t j|rPt	|ddd}dd |
 D }tj|||r=dnd d W d    n1 sKw   Y  t j|r_t|}	|	j}
nd }	t }
|r||
 |	d u rytdt|
 d	 n|	}	|	t| d S d S )
Nwr   r   c                 S   s   i | ]	\}}|t |qS r!   )r   r=   r]   r   r!   r!   r"   rG     s    z7DatasetInfosDict.write_to_directory.<locals>.<dictcomp>r   r   z---
z
---
)ospathrx   r   DATASETDICT_INFOS_FILENAMEREPOCARD_FILENAMEr   r   existsrv   rL   r   dumpr
   r   datar   to_dataset_card_datar   saver   )r4   dataset_infos_dir	overwriterr   total_dataset_infosdataset_infos_pathdataset_readme_pathr>   dataset_infos_dictdataset_carddataset_card_datar!   r!   r"   r     s0   



z#DatasetInfosDict.write_to_directoryc                 C   s   t d|  tjtj|tjr(t	t
|tj j}d|v r(| |S tjtj|tjr`ttj|tjdd}| dd t	| D W  d    S 1 sYw   Y  d S |  S )NzLoading Dataset Infos from dataset_infor   r   c                 S   r   r!   )rQ   r2   )r=   r]   r   r!   r!   r"   rG     s    
z3DatasetInfosDict.from_directory.<locals>.<dictcomp>)r   r   r   r   r   rx   r   r   r
   r   r   r   from_dataset_card_datar   rv   r   rL   )rM   r   r   r>   r!   r!   r"   r     s   
$zDatasetInfosDict.from_directoryr   c                 C   sl   t |dttfr3t |d tr| dd |d D S t|d }|d dd|_| |j|iS |  S )Nr   c                 S   s    i | ]}| d dt|qS )r]   default)r   rQ   r   )r=   dataset_info_yaml_dictr!   r!   r"   rG     s    z;DatasetInfosDict.from_dataset_card_data.<locals>.<dictcomp>r]   r   )r1   r   rl   rO   rQ   r   r]   )rM   r   r   r!   r!   r"   r     s   	z'DatasetInfosDict.from_dataset_card_datac                 C   s6  | rd|v rt |d tr|d dd|d i}nd|v r/t |d tr/dd |d D }ni }i |dd |  D }| D ]\}}||d< qBt|dkrutt| |d< |d 	dd }|dkrsd|i|d |d< d S d S g |d< t
| D ]\}}|	dd  d|i|}|d | qd S d S )Nr   r]   r   c                 S   s   i | ]}|d  |qS )r]   r!   )r=   config_metadatar!   r!   r"   rG     s    z9DatasetInfosDict.to_dataset_card_data.<locals>.<dictcomp>c                 S   s   i | ]	\}}||  qS r!   )r   r   r!   r!   r"   rG     s    r   )r1   rO   r   rl   rL   r   nextitervaluespopsortedappend)r4   r   dataset_metadata_infosr   r]   dset_info_yaml_dictr   r!   r!   r"   r     s@   
!z%DatasetInfosDict.to_dataset_card_data)FF)r7   N)r7   r   )	r   r   r   r   rP   r   r   r   r   r!   r!   r!   r"   r     s    r   c                   @   s  e Zd ZU dZeed< eed< eed< ejedZ	eed< ejedZ
eed< ejedZeed< ejedZee ed	< ejedZee ed
< dZeed< dZee ed< dZee ed< dZee ed< dZee ed< dd ZdddZedddZededd fddZdS )
MetricInfoa  Information about a metric.

    `MetricInfo` documents a metric, including its name, version, and features.
    See the constructor arguments and properties for a full list.

    Note: Not all fields are known on construction and may be updated later.
    rT   rU   r.   rR   inputs_descriptionrV   rW   codebase_urlsreference_urlsF
streamableNformatmetric_namer]   experiment_idc                 C   sH   | j d ur | j D ]\}}t|tstd| d|jj q
d S d S )NzSWhen using 'numpy' format, all features should be a `datasets.Value` feature. Here z is an instance of )r   r.   rL   r1   r   r   r   r   )r4   r%   r&   r!   r!   r"   r5     s   

zMetricInfo.__post_init__c                 C   s   t tj|tjddd}tjt| ||rdndd W d   n1 s&w   Y  | j	rTt tj|tj
ddd}|| j	 W d   dS 1 sMw   Y  dS dS )a  Write `MetricInfo` as JSON to `metric_info_dir`.
        Also save the license separately in LICENCE.
        If `pretty_print` is True, the JSON will be pretty-printed with the indent level of 4.

        Example:

        ```py
        >>> from datasets import load_metric
        >>> metric = load_metric("accuracy")
        >>> metric.info.write_to_directory("/path/to/directory/")
        ```
        r   r   r   r   Nr   )rv   r   r   rx   r   METRIC_INFO_FILENAMEr   r   r   rW   r{   r   )r4   metric_info_dirrr   r>   r!   r!   r"   r   #  s   "zMetricInfo.write_to_directoryr7   c                 C   sh   t d|  |stdttj|tjdd}t	
|}W d   n1 s*w   Y  | |S )a  Create MetricInfo from the JSON file in `metric_info_dir`.

        Args:
            metric_info_dir: `str` The directory containing the metadata file. This
                should be the root directory of a specific dataset version.

        Example:

        ```py
        >>> from datasets import MetricInfo
        >>> metric_info = MetricInfo.from_directory("/path/to/directory/")
        ```
        zLoading Metric info from zCCalling MetricInfo.from_directory() with undefined metric_info_dir.r   r   N)r   r   r   rv   r   r   rx   r   r   r   r   r2   )rM   r   r>   metric_info_dictr!   r!   r"   r   7  s   
zMetricInfo.from_directoryr   c                    r8   )Nc                 S   r9   r!   r:   r<   r!   r!   r"   r?   P  r@   z'MetricInfo.from_dict.<locals>.<setcomp>c                    rA   r!   r!   rB   rE   r!   r"   rG   Q  rH   z(MetricInfo.from_dict.<locals>.<dictcomp>r!   rI   )rM   r   r!   rE   r"   r2   N  rN   zMetricInfo.from_dictr   )r7   r   )r   r   r   r*   r   r    r   rJ   r   r   rV   rW   rl   r   r   r   r   boolr   r   r   r]   r   r5   r   rP   r   rO   r2   r!   r!   r!   r"   r     s*   
 	
	r   )3r*   r   rJ   r   r   rw   rs   r   pathlibr   typingr   r   r   r   r   fsspecfsspec.corer	   huggingface_hubr
   r   r   r   r.   r   r   r_   r   tasksr   r   utilsr   utils.loggingr   utils.py_utilsr   r   r   r   r   r$   	Exceptionr(   r,   r-   rQ   r   r   r   r!   r!   r!   r"   <module>   sH     6m