o
    TZh"j                     @   s:  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZ ddlZddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZm Z m!Z! ddl"m#Z# ddl"m$Z% ddl&m'Z' erddl(m)Z) ddl*m+Z+m,Z, e#-e.Z/dZ0e1dedd  dej2Z3e1dZ4G dd de5Z6G dd de5Z7eddG dd dZ8		d;de9ded  d!ee9d"f d#ee9 d$ee9 d%e8fd&d'Z:G d(d) d)Z;G d*d+ d+e;Z<G d,d- d-e;Z=eddG d.d/ d/Z>eddG d0d1 d1Z?d2d3 Z@d4d5 ZAd6d7 ZBd8d9 ZCG d:d" d"ZDdS )<zArrow ArrowReader.    N)	dataclass)partial)Path)TYPE_CHECKINGListOptionalUnion)
thread_map   )DownloadConfig)	_split_refilenames_for_dataset_split)InMemoryTableMemoryMappedTableTableconcat_tables)logging)tqdm)cached_path)DatasetInfo)Split	SplitInfoz=https://storage.googleapis.com/huggingface-nlp/cache/datasetsz
^
 (?P<split>z)
 (\[
    ((?P<from>-?\d+)
     (?P<from_pct>%)?)?
    :
    ((?P<to>-?\d+)
     (?P<to_pct>%)?)?
 \])?(\((?P<rounding>[^\)]*)\))?
$
z\s*\+\s*c                   @      e Zd ZdZdS )DatasetNotOnHfGcsErrorz?When you can't get the dataset from the Hf google cloud storageN__name__
__module____qualname____doc__ r    r    L/var/www/html/lang_env/lib/python3.10/site-packages/datasets/arrow_reader.pyr   D       r   c                   @   r   )MissingFilesOnHfGcsErrorz9When some files are missing on the Hf oogle cloud storageNr   r    r    r    r!   r#   J   r"   r#   T)frozenc                   @   s&   e Zd ZU dZeed< ee ed< dS )FileInstructionsa}  The file instructions associated with a split ReadInstruction.

    Attributes:
        num_examples: `int`, The total number of examples
        file_instructions: List[dict(filename, skip, take)], the files information.
            The filenames contains the relative path, not absolute.
            skip/take indicates which example read in the file: `ds.slice(skip, take)`
    num_examplesfile_instructionsN)r   r   r   r   int__annotations__r   dictr    r    r    r!   r%   P   s   
 	r%   namesplit_infosr   instructionReadInstructionfiletype_suffixprefix_pathreturnc                    s  t tstdtj stddd |D }dd |D  fdd|D }t |ts9t|}||}g }d}	|D ]}
||
j	 }||
j	 }|
j	 }|
j
du r\dn|
j
}|
jdu rf|n|
j}|du r|D ]}|| }|dkrzqo|	|7 }	||||d	 qoqDd}d}t||D ]D\}}||7 }||k r||kr||kr|| nd}||k r|| | nd
}|dkrq||||d	 |	|d
kr|| n|7 }	||7 }qqDt|	|dS )a  Returns instructions of the split dict.

    Args:
        name (`str`): Name of the dataset.
        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.

    Returns:
        [`FileInstructions`]
    zExpected str 'name', but got: zExpected non-empty str 'name'c                 S      i | ]}|j |jqS r    )r+   r&   .0infor    r    r!   
<dictcomp>v       z*make_file_instructions.<locals>.<dictcomp>c                 S   r2   r    )r+   shard_lengthsr3   r    r    r!   r6   w   r7   c              
      s*   i | ]}|j t|j  |j  d qS ))pathZdataset_namesplitr/   r8   )r+   r   r3   r/   r+   Zname2shard_lengthsr0   r    r!   r6   x   s    r   N)filenameskiptaker   )r&   r'   )
isinstancestr	TypeErrortyper   
ValueErrorr.   	from_specto_absolute	splitnamefrom_toappendzipr%   )r+   r,   r-   r/   r0   name2lenZname2filenamesZabsolute_instructionsr'   r&   Z	abs_instrZsplit_length	filenamesr8   rG   rH   r<   r>   Zindex_startZ	index_endZshard_lengthr=   r    r;   r!   make_file_instructions_   sZ   









rM   c                   @   s   e Zd ZdZdeded fddZddefd	d
ZddefddZ	dd Z
	dddZ		ddee ded fddZdefddZdS )
BaseReaderz@
    Build a Dataset object out of Instruction instance(s).
    r9   r5   r   c                 C   s   || _ || _d| _dS )zInitializes ArrowReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        N)_path_info_filetype_suffixselfr9   r5   r    r    r!   __init__   s   
zBaseReader.__init__Fr1   c                 C   s   t )=Returns a Dataset instance from given (filename, skip, take).)NotImplementedError)rS   filename_skip_take	in_memoryr    r    r!   _get_table_from_filename   s   z#BaseReader._get_table_from_filenamec                 C   s   t |dkstdd |D stdt|}|D ]}tj| j|d |d< qt	t
| j|d|tdt |dkp:d	d
}dd |D }|sU| jd	u sQ| jjd	u rUtd|pdtjg t| jjjdg}t |dkrqt|}|S |d }|S )a  Returns Dataset for given file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contain the absolute path, not relative.
                skip/take indicates which example read in the file: `ds.slice(skip, take)`
            in_memory (bool, default False): Whether to copy the data in-memory.
        r   c                 s   s    | ]}t |tV  qd S N)r?   r*   )r4   fr    r    r!   	<genexpr>   s    z)BaseReader._read_files.<locals>.<genexpr>z&please provide valid file informationsr<   rX   zLoading dataset shards   N)Z
tqdm_classZdescdisablec                 S   s   g | ]
}t |d kr|qS )r   )len)r4   tr    r    r!   
<listcomp>   s    z*BaseReader._read_files.<locals>.<listcomp>zqTried to read an empty table. Please specify at least info.features to create an empty table with the right type.)schemar
   )r`   allrC   copydeepcopyosr9   joinrO   r	   r   rY   hf_tqdmrP   featuresr   Zfrom_batchesparc   rB   r   )rS   filesrX   r[   Z	pa_tablespa_tabler    r    r!   _read_files   s,   	
 zBaseReader._read_filesc                 C   s    t |||| j| jd}|j}|S )z?Return list of dict {'filename': str, 'skip': int, 'take': int})r/   r0   )rM   rQ   rO   r'   )rS   r+   r-   r,   r'   rl   r    r    r!   get_file_instructions   s
   z BaseReader.get_file_instructionsc                 C   s6   |  |||}|sd| d}t|| j|||dS )a  Returns Dataset instance(s).

        Args:
            name (str): name of the dataset.
            instructions (ReadInstruction): instructions to read.
                Instruction can be string and will then be passed to the Instruction
                constructor as it.
            split_infos (list of SplitInfo proto): the available splits for dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
             kwargs to build a single Dataset instance.
        zInstruction "z" corresponds to no data!)rl   original_instructionsrX   )ro   rC   
read_files)rS   r+   Zinstructionsr,   rX   rl   msgr    r    r!   read   s
   zBaseReader.readNrl   rp   )Nr.   r   c                 C   sF   | j ||d}|durddlm} |t|}nd}|| j|d}|S )aJ  Returns single Dataset instance for the set of file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contains the relative path, not absolute.
                skip/take indicates which example read in the file: `ds.skip().take()`
            original_instructions: store the original instructions used to build the dataset split in the dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a Dataset instance.
        r]   Nr
   )r   )Zarrow_tabler5   r:   )rn   splitsr   r@   rP   )rS   rl   rp   rX   rm   r   r:   Zdataset_kwargsr    r    r!   rq     s   zBaseReader.read_filesdownload_configc              
   C   sD  t d |tjd }z0tj|d}t|tjd|d}t|tj| j	d | j
dur:| j
| j
| j	 W n tyL } zt|dd}~ww zC| j
jD ];}| j| j
j|| j
j d}|D ](}	tt|	d | j	}
tj||
}t|tjd|d}t||	d  qdqRW dS  ty } zt|dd}~ww )a%  
        Download the dataset files from the Hf GCS

        Args:
            dl_cache_dir: `str`, the local cache directory used to download files
            relative_data_dir: `str`, the relative directory of the remote files from
                the `datasets` directory on GCS.

        /zdataset_info.json)ru   N)r+   r-   r,   r<   )HF_GCP_BASE_URLreplacerg   sepr9   rh   r   shutilmoverO   rP   updatefrom_directoryFileNotFoundErrorr   rt   ro   Zbuilder_namevaluesr@   r   relative_tor#   )rS   ru   Zrelative_data_dirZremote_cache_dirZremote_dataset_infoZdownloaded_dataset_infoerrr:   r'   Zfile_instructionZfile_to_downloadZremote_prepared_filenameZdownloaded_prepared_filenamer    r    r!   download_from_hf_gcs  sD   




zBaseReader.download_from_hf_gcsF)NF)r   r   r   r   r@   r   rT   r   rY   rn   ro   rs   r   r*   r   rq   r   r   r    r    r    r!   rN      s      

rN   c                       sR   e Zd ZdZdeded f fddZddefd	d
Ze	ddefddZ
  ZS )ArrowReaderz
    Build a Dataset object out of Instruction instance(s).
    This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
    r9   r5   r   c                       t  || d| _dS )zInitializes ArrowReader.

        Args:
            path (str): path where Arrow files are stored.
            info (DatasetInfo): info about the dataset.
        arrowNsuperrT   rQ   rR   	__class__r    r!   rT   L     
zArrowReader.__init__Fr1   c                 C   s   |d d|v r|d ndd|v r|d nd}}}t j||d}|dkr+t|| }|durC|durC|dkr=|t|ksC|||}|S )rU   r<   r=   Nr>   r]   r   r   )r   
read_tabler`   slice)rS   rW   rX   r<   r=   r>   tabler    r    r!   rY   V  s   
$z$ArrowReader._get_table_from_filenamec                 C   s   |rt nt}|| S )z
        Read table from file.

        Args:
            filename (str): File name of the table.
            in_memory (bool, default=False): Whether to copy the data in-memory.

        Returns:
            pyarrow.Table
        )r   r   	from_file)r<   rX   Z	table_clsr    r    r!   r   e  s   
zArrowReader.read_tabler   )r   r   r   r   r@   r   rT   r   rY   staticmethodr   __classcell__r    r    r   r!   r   F  s    
r   c                       s6   e Zd ZdZdeded f fddZdd Z  ZS )	ParquetReaderzv
    Build a Dataset object out of Instruction instance(s).
    This Reader uses memory mapping on parquet files.
    r9   r5   r   c                    r   )zInitializes ParquetReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        parquetNr   rR   r   r    r!   rT   {  r   zParquetReader.__init__c                 K   sv   |d d|v r|d ndd|v r|d nd}}}t j|dd}|dur9|dur9|dkr3|t|ks9|||}|S )rU   r<   r=   Nr>   T)Z
memory_mapr   )pqr   r`   r   )rS   rW   kwargsr<   r=   r>   rm   r    r    r!   rY     s   
$z&ParquetReader._get_table_from_filename)	r   r   r   r   r@   r   rT   rY   r   r    r    r   r!   r   u  s    
r   c                   @   s*   e Zd ZU dZeed< eed< eed< dS )_AbsoluteInstructionz?A machine friendly slice: defined absolute positive boundaries.rF   rG   rH   N)r   r   r   r   r@   r)   r(   r    r    r    r!   r     s
   
 r   c                   @   sb   e Zd ZU dZeed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed< dd	 ZdS )
_RelativeInstructionzHRepresents a single parsed slicing instruction, can use % and negatives.rF   NrG   rH   unitroundingc                 C   s   | j d ur| j dvrtd| jd ur| jdvrtd| j dkr*| jd ur*td| j dkr?| jd ur?t| jdkr?td| j dkrT| jd urTt| jdkrTtd| jd u r`| j dkr`d	n| j| jd
< d S )N)%abszunit must be either % or abs)closestZpct1_dropremainderz5rounding must be either closest or pct1_dropremainderr   zAIt is forbidden to specify rounding if not using percent slicing.d   z2Percent slice boundaries must be > -100 and < 100.r   r   )r   rC   r   rG   r   rH   __dict__rS   r    r    r!   __post_init__  s   ""(z"_RelativeInstruction.__post_init__)r   r   r   r   r@   r)   rG   r   r(   rH   r   r   r   r    r    r    r!   r     s   
 r   c                 C   s   t | }|std|  |ds|drdnd}t|d|d|dr1t|dnd	|d
rAt|d
|dS d	|dS )z)Returns ReadInstruction for given string.z!Unrecognized instruction format: Zfrom_pctZto_pctr   r   r:   r   fromNrH   )
split_namer   rG   rH   r   )_SUB_SPEC_REmatchrC   groupr.   r(   )specresr   r    r    r!   _str_to_read_instruction  s   
r   c                 C   s&   |dk r
d}t || t|d  S )Nr   zUsing "pct1_dropremainder" rounding on a split with less than 100 elements is forbidden: it always results in an empty dataset.      Y@)rC   mathtrunc)boundaryr&   rr   r    r    r!   _pct_to_abs_pct1  s
   r   c                 C   s   t t| | d S )Nr   )r(   round)r   r&   r    r    r!   _pct_to_abs_closest  s   r   c                 C   s   | j dkrtnt}| j}||vrtd| dt| d|| }| j}| j}| jdkrC|du r2dn|||}|du r=|n|||}n|du rIdn|}|du rQ|n|}|dk r^t	|| d}|dk rit	|| d}t
||}t
||}t|||S )zReturns _AbsoluteInstruction instance for given RelativeInstruction.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.
    r   zUnknown split "z". Should be one of .r   Nr   )r   r   r   rF   rC   listrG   rH   r   maxminr   )	rel_instrrK   Z
pct_to_absr:   r&   rG   rH   r    r    r!   _rel_to_abs_instr  s&   


r   c                   @   sb   e Zd ZdZdd Zedd ZdddZed	d
 Zdd Z	dd Z
dd Zdd Zdd ZdS )r.   a  Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
          for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
          for k in range(0, 100, 10)])

    c                 C   s
   || _ d S rZ   _relative_instructions)rS   relative_instructionsr    r    r!   _init  s   
zReadInstruction._initc                 C   s   |  | }|| |S )zCReturns ReadInstruction obj initialized with relative_instructions.)__new__r   )clsr   resultr    r    r!   ,_read_instruction_from_relative_instructions  s   

z<ReadInstruction._read_instruction_from_relative_instructionsNc                 C   s   |  t|||||g dS )a  Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        N)r   r   )rS   r   r   rG   rH   r   r    r    r!   rT   $  s   zReadInstruction.__init__c                 C   sL   t |}t|}|std| t|d }tdd |dd D |S )aM  Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 10% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        z&No instructions could be built out of r   c                 s   s    | ]}t |V  qd S rZ   )r   )r4   subr    r    r!   r\   ^  s    z,ReadInstruction.from_spec.<locals>.<genexpr>r
   N)r@   _ADDITION_SEP_REr:   rC   r   sum)r   r   subsr-   r    r    r!   rD   A  s   
zReadInstruction.from_specc           
      C   s   g }| j D ]c}|j}|jd us|jd urc|j}|j}|j}|j}|dkr&|nd}|d ur2t|| nd}|d ur>t|| nd}d| d| d}|dkr[|d ur[|dkr[d| dnd}	|||	 7 }|| qd	|S )
Nr    [:]r   ()+)	r   rF   rG   rH   r   r   r@   rI   rh   )
rS   Zrel_instr_specsr   Zrel_instr_specrG   rH   r   r   Z	slice_strZrounding_strr    r    r!   to_spec`  s"   
&
zReadInstruction.to_specc                 C   sj   t |tsd}t|| j}|j}|d jdkr.|d jdkr.| jd j|d jkr.td| || S )zEReturns a new ReadInstruction obj, result of appending other to self.zAReadInstruction can only be added to another ReadInstruction obj.r   r   zPIt is forbidden to sum ReadInstruction instances with different rounding values.)r?   r.   rA   r   r   r   rC   r   )rS   otherrr   Zself_risZ	other_risr    r    r!   __add__t  s   
zReadInstruction.__add__c                 C   s   |   S rZ   )r   r   r    r    r!   __str__  s   zReadInstruction.__str__c                 C   s   d| j  dS )NzReadInstruction(r   r   r   r    r    r!   __repr__  s   zReadInstruction.__repr__c                    s    fdd| j D S )aZ  Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        c                    s   g | ]}t | qS r    )r   )r4   r   rK   r    r!   rb     r7   z/ReadInstruction.to_absolute.<locals>.<listcomp>r   )rS   rK   r    r   r!   rE     s   zReadInstruction.to_absolute)NNNN)r   r   r   r   r   classmethodr   rT   rD   r   r   r   r   rE   r    r    r    r!   r.     s    &


)NN)Er   re   r   rg   rerz   dataclassesr   	functoolsr   pathlibr   typingr   r   r   r   Zpyarrowrk   Zpyarrow.parquetr   r   Ztqdm.contrib.concurrentr	   Zdownload.download_configr   Znamingr   r   r   r   r   r   r   utilsr   r   ri   Zutils.file_utilsr   r5   r   rt   r   r   Z
get_loggerr   loggerrw   compileXr   r   ConnectionErrorr   r#   r%   r@   rM   rN   r   r   r   r   r   r   r   r   r.   r    r    r    r!   <module>   s~   




K /