o
    +ifdj                     @   sF  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
mZ ddlmZmZmZmZ ddlZddlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZm Z m!Z! ddl"m#Z# ddl"m$Z% ddl&m'Z' ddl(m)Z) erddl*m+Z+ ddl,m-Z-m.Z. e#/e0Z1dZ2e3dedd  dej4Z5e3dZ6G dd de7Z8G dd de7Z9eddG dd dZ:		d<de;d ed! d"ee;d#f d$ee; d%ee; d&e:fd'd(Z<G d)d* d*Z=G d+d, d,e=Z>G d-d. d.e=Z?eddG d/d0 d0Z@eddG d1d2 d2ZAd3d4 ZBd5d6 ZCd7d8 ZDd9d: ZEG d;d# d#ZFdS )=zArrow ArrowReader.    N)	dataclass)partial)Path)TYPE_CHECKINGListOptionalUnion)
thread_map   )DownloadConfig)	_split_refilenames_for_dataset_split)InMemoryTableMemoryMappedTableTableconcat_tables)logging)tqdm)
deprecated)cached_path)DatasetInfo)Split	SplitInfoz=https://storage.googleapis.com/huggingface-nlp/cache/datasetsz
^
 (?P<split>z)
 (\[
    ((?P<from>-?\d+)
     (?P<from_pct>%)?)?
    :
    ((?P<to>-?\d+)
     (?P<to_pct>%)?)?
 \])?(\((?P<rounding>[^\)]*)\))?
$
z\s*\+\s*c                   @      e Zd ZdZdS )DatasetNotOnHfGcsErrorz?When you can't get the dataset from the Hf google cloud storageN__name__
__module____qualname____doc__ r!   r!   N/var/www/html/corbot_env/lib/python3.10/site-packages/datasets/arrow_reader.pyr   E       r   c                   @   r   )MissingFilesOnHfGcsErrorz9When some files are missing on the Hf oogle cloud storageNr   r!   r!   r!   r"   r$   K   r#   r$   T)frozenc                   @   s&   e Zd ZU dZeed< ee ed< dS )FileInstructionsa}  The file instructions associated with a split ReadInstruction.

    Attributes:
        num_examples: `int`, The total number of examples
        file_instructions: List[dict(filename, skip, take)], the files information.
            The filenames contains the relative path, not absolute.
            skip/take indicates which example read in the file: `ds.slice(skip, take)`
    num_examplesfile_instructionsN)r   r   r   r    int__annotations__r   dictr!   r!   r!   r"   r&   Q   s   
 	r&   namesplit_infosr   instructionReadInstructionfiletype_suffixprefix_pathreturnc                    s  t tstdtj stddd |D }dd |D  fdd|D }t |ts9t|}||}g }d}	|D ]}
||
j	 }||
j	 }|
j	 }|
j
du r\dn|
j
}|
jdu rf|n|
j}|du r|D ]}|| }|dkrzqo|	|7 }	||||d	 qoqDd}d}t||D ]D\}}||7 }||k r||kr||kr|| nd}||k r|| | nd
}|dkrq||||d	 |	|d
kr|| n|7 }	||7 }qqDt|	|dS )a  Returns instructions of the split dict.

    Args:
        name (`str`): Name of the dataset.
        split_infos (`list` of `[SplitInfo]`): Dataset splits information.
        instruction ([`ReadInstruction`] or `str`): Reading instruction for a dataset.
        filetype_suffix (`str`, *optional*): Suffix of dataset files, e.g. 'arrow' or 'parquet'.
        prefix_path (`str`, *optional*): Prefix of dataset files, e.g. directory name.

    Returns:
        [`FileInstructions`]
    zExpected str 'name', but got: zExpected non-empty str 'name'c                 S      i | ]}|j |jqS r!   )r,   r'   .0infor!   r!   r"   
<dictcomp>w       z*make_file_instructions.<locals>.<dictcomp>c                 S   r3   r!   )r,   shard_lengthsr4   r!   r!   r"   r7   x   r8   c              
      s*   i | ]}|j t|j  |j  d qS ))pathdataset_namesplitr0   r9   )r,   r   r4   r0   r,   name2shard_lengthsr1   r!   r"   r7   y   s    r   N)filenameskiptaker   )r'   r(   )
isinstancestr	TypeErrortyper   
ValueErrorr/   	from_specto_absolute	splitnamefrom_toappendzipr&   )r,   r-   r.   r0   r1   name2lenname2filenamesabsolute_instructionsr(   r'   	abs_instrsplit_length	filenamesr9   rJ   rK   r?   rA   index_start	index_endshard_lengthr@   r!   r=   r"   make_file_instructions`   sZ   









rW   c                   @   s   e Zd ZdZdeded fddZddefd	d
ZddefddZ	dd Z
	dddZ		ddee ded fddZe defddZdS )
BaseReaderz@
    Build a Dataset object out of Instruction instance(s).
    r:   r6   r   c                 C   s   || _ || _d| _dS )zInitializes ArrowReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        N)_path_info_filetype_suffixselfr:   r6   r!   r!   r"   __init__   s   
zBaseReader.__init__Fr2   c                 C   s   t )=Returns a Dataset instance from given (filename, skip, take).)NotImplementedError)r]   filename_skip_take	in_memoryr!   r!   r"   _get_table_from_filename   s   z#BaseReader._get_table_from_filenamec                 C   s   t |dkstdd |D stdt|}|D ]}tj| j|d |d< qt	t
| j|d|tdt |dkp:d	d
}dd |D }|sU| jd	u sQ| jjd	u rUtd|pdtjg t| jjjdg}t |dkrqt|}|S |d }|S )a  Returns Dataset for given file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contain the absolute path, not relative.
                skip/take indicates which example read in the file: `ds.slice(skip, take)`
            in_memory (bool, default False): Whether to copy the data in-memory.
        r   c                 s   s    | ]}t |tV  qd S N)rB   r+   )r5   fr!   r!   r"   	<genexpr>   s    z)BaseReader._read_files.<locals>.<genexpr>z&please provide valid file informationsr?   rb   zLoading dataset shards   N)
tqdm_classdescdisablec                 S   s   g | ]
}t |d kr|qS )r   )len)r5   tr!   r!   r"   
<listcomp>   s    z*BaseReader._read_files.<locals>.<listcomp>zqTried to read an empty table. Please specify at least info.features to create an empty table with the right type.)schemar
   )rl   allrF   copydeepcopyosr:   joinrY   r	   r   rc   hf_tqdmrZ   featuresr   from_batchesparo   rE   r   )r]   filesrb   re   	pa_tablespa_tabler!   r!   r"   _read_files   s,   	
 zBaseReader._read_filesc                 C   s    t |||| j| jd}|j}|S )z?Return list of dict {'filename': str, 'skip': int, 'take': int})r0   r1   )rW   r[   rY   r(   )r]   r,   r.   r-   r(   ry   r!   r!   r"   get_file_instructions   s
   z BaseReader.get_file_instructionsc                 C   s6   |  |||}|sd| d}t|| j|||dS )a  Returns Dataset instance(s).

        Args:
            name (str): name of the dataset.
            instructions (ReadInstruction): instructions to read.
                Instruction can be string and will then be passed to the Instruction
                constructor as it.
            split_infos (list of SplitInfo proto): the available splits for dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
             kwargs to build a single Dataset instance.
        zInstruction "z" corresponds to no data!)ry   original_instructionsrb   )r}   rF   
read_files)r]   r,   instructionsr-   rb   ry   msgr!   r!   r"   read   s
   zBaseReader.readNry   r~   )Nr/   r   c                 C   sF   | j ||d}|durddlm} |t|}nd}|| j|d}|S )aJ  Returns single Dataset instance for the set of file instructions.

        Args:
            files: List[dict(filename, skip, take)], the files information.
                The filenames contains the relative path, not absolute.
                skip/take indicates which example read in the file: `ds.skip().take()`
            original_instructions: store the original instructions used to build the dataset split in the dataset.
            in_memory (bool, default False): Whether to copy the data in-memory.

        Returns:
            kwargs to build a Dataset instance.
        rg   Nr
   )r   )arrow_tabler6   r<   )r|   splitsr   rC   rZ   )r]   ry   r~   rb   r{   r   r<   dataset_kwargsr!   r!   r"   r     s   zBaseReader.read_filesdownload_configc              
   C   sD  t d |tjd }z0tj|d}t|tjd|d}t|tj| j	d | j
dur:| j
| j
| j	 W n tyL } zt|dd}~ww zC| j
jD ];}| j| j
j|| j
j d}|D ](}	tt|	d | j	}
tj||
}t|tjd|d}t||	d  qdqRW dS  ty } zt|dd}~ww )a%  
        Download the dataset files from the Hf GCS

        Args:
            dl_cache_dir: `str`, the local cache directory used to download files
            relative_data_dir: `str`, the relative directory of the remote files from
                the `datasets` directory on GCS.

        /zdataset_info.json)r   N)r,   r.   r-   r?   )HF_GCP_BASE_URLreplacers   sepr:   rt   r   shutilmoverY   rZ   updatefrom_directoryFileNotFoundErrorr   r   r}   builder_namevaluesrC   r   relative_tor$   )r]   r   relative_data_dirremote_cache_dirremote_dataset_infodownloaded_dataset_infoerrr<   r(   file_instructionfile_to_downloadremote_prepared_filenamedownloaded_prepared_filenamer!   r!   r"   download_from_hf_gcs   sD   



zBaseReader.download_from_hf_gcsF)NF)r   r   r   r    rC   r   r^   r   rc   r|   r}   r   r   r+   r   r   r   r   r   r!   r!   r!   r"   rX      s"     

rX   c                       sR   e Zd ZdZdeded f fddZddefd	d
Ze	ddefddZ
  ZS )ArrowReaderz
    Build a Dataset object out of Instruction instance(s).
    This Reader uses either memory mapping or file descriptors (in-memory) on arrow files.
    r:   r6   r   c                       t  || d| _dS )zInitializes ArrowReader.

        Args:
            path (str): path where Arrow files are stored.
            info (DatasetInfo): info about the dataset.
        arrowNsuperr^   r[   r\   	__class__r!   r"   r^   N     
zArrowReader.__init__Fr2   c                 C   s   |d d|v r|d ndd|v r|d nd}}}t j||d}|dkr+t|| }|durC|durC|dkr=|t|ksC|||}|S )r_   r?   r@   NrA   rg   r   r   )r   
read_tablerl   slice)r]   ra   rb   r?   r@   rA   tabler!   r!   r"   rc   X  s   
$z$ArrowReader._get_table_from_filenamec                 C   s   |rt nt}|| S )z
        Read table from file.

        Args:
            filename (str): File name of the table.
            in_memory (bool, default=False): Whether to copy the data in-memory.

        Returns:
            pyarrow.Table
        )r   r   	from_file)r?   rb   	table_clsr!   r!   r"   r   g  s   
zArrowReader.read_tabler   )r   r   r   r    rC   r   r^   r   rc   staticmethodr   __classcell__r!   r!   r   r"   r   H  s    
r   c                       s6   e Zd ZdZdeded f fddZdd Z  ZS )	ParquetReaderzv
    Build a Dataset object out of Instruction instance(s).
    This Reader uses memory mapping on parquet files.
    r:   r6   r   c                    r   )zInitializes ParquetReader.

        Args:
            path (str): path where tfrecords are stored.
            info (DatasetInfo): info about the dataset.
        parquetNr   r\   r   r!   r"   r^   }  r   zParquetReader.__init__c                 K   sv   |d d|v r|d ndd|v r|d nd}}}t j|dd}|dur9|dur9|dkr3|t|ks9|||}|S )r_   r?   r@   NrA   T)
memory_mapr   )pqr   rl   r   )r]   ra   kwargsr?   r@   rA   r{   r!   r!   r"   rc     s   
$z&ParquetReader._get_table_from_filename)	r   r   r   r    rC   r   r^   rc   r   r!   r!   r   r"   r   w  s    
r   c                   @   s*   e Zd ZU dZeed< eed< eed< dS )_AbsoluteInstructionz?A machine friendly slice: defined absolute positive boundaries.rI   rJ   rK   N)r   r   r   r    rC   r*   r)   r!   r!   r!   r"   r     s
   
 r   c                   @   sb   e Zd ZU dZeed< dZee ed< dZ	ee ed< dZ
ee ed< dZee ed< dd	 ZdS )
_RelativeInstructionzHRepresents a single parsed slicing instruction, can use % and negatives.rI   NrJ   rK   unitroundingc                 C   s   | j d ur| j dvrtd| jd ur| jdvrtd| j dkr*| jd ur*td| j dkr?| jd ur?t| jdkr?td| j dkrT| jd urTt| jdkrTtd| jd u r`| j dkr`d	n| j| jd
< d S )N)%abszunit must be either % or abs)closestpct1_dropremainderz5rounding must be either closest or pct1_dropremainderr   zAIt is forbidden to specify rounding if not using percent slicing.d   z2Percent slice boundaries must be > -100 and < 100.r   r   )r   rF   r   rJ   r   rK   __dict__r]   r!   r!   r"   __post_init__  s   ""(z"_RelativeInstruction.__post_init__)r   r   r   r    rC   r*   rJ   r   r)   rK   r   r   r   r!   r!   r!   r"   r     s   
 r   c                 C   s   t | }|std|  |ds|drdnd}t|d|d|dr1t|dnd	|d
rAt|d
|dS d	|dS )z)Returns ReadInstruction for given string.z!Unrecognized instruction format: from_pctto_pctr   r   r<   r   fromNrK   )
split_namer   rJ   rK   r   )_SUB_SPEC_REmatchrF   groupr/   r)   )specresr   r!   r!   r"   _str_to_read_instruction  s   
r   c                 C   s&   |dk r
d}t || t|d  S )Nr   zUsing "pct1_dropremainder" rounding on a split with less than 100 elements is forbidden: it always results in an empty dataset.      Y@)rF   mathtrunc)boundaryr'   r   r!   r!   r"   _pct_to_abs_pct1  s
   r   c                 C   s   t t| | d S )Nr   )r)   round)r   r'   r!   r!   r"   _pct_to_abs_closest  s   r   c                 C   s   | j dkrtnt}| j}||vrtd| dt| d|| }| j}| j}| jdkrC|du r2dn|||}|du r=|n|||}n|du rIdn|}|du rQ|n|}|dk r^t	|| d}|dk rit	|| d}t
||}t
||}t|||S )zReturns _AbsoluteInstruction instance for given RelativeInstruction.

    Args:
        rel_instr: RelativeInstruction instance.
        name2len: dict {split_name: num_examples}.
    r   zUnknown split "z". Should be one of .r   Nr   )r   r   r   rI   rF   listrJ   rK   r   maxminr   )	rel_instrrN   
pct_to_absr<   r'   rJ   rK   r!   r!   r"   _rel_to_abs_instr  s&   


r   c                   @   sb   e Zd ZdZdd Zedd ZdddZed	d
 Zdd Z	dd Z
dd Zdd Zdd ZdS )r/   a  Reading instruction for a dataset.

    Examples::

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec('test[:33%]'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction('test', to=33, unit='%'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%'))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%]+train[1:-1]')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%]+train[1:-1]'))
      ds = datasets.load_dataset('mnist', split=(
          datasets.ReadInstruction('test', to=33, unit='%') +
          datasets.ReadInstruction('train', from_=1, to=-1, unit='abs')))

      # The following lines are equivalent:
      ds = datasets.load_dataset('mnist', split='test[:33%](pct1_dropremainder)')
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction.from_spec(
          'test[:33%](pct1_dropremainder)'))
      ds = datasets.load_dataset('mnist', split=datasets.ReadInstruction(
          'test', from_=0, to=33, unit='%', rounding="pct1_dropremainder"))

      # 10-fold validation:
      tests = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', from_=k, to=k+10, unit='%')
          for k in range(0, 100, 10)])
      trains = datasets.load_dataset(
          'mnist',
          [datasets.ReadInstruction('train', to=k, unit='%') + datasets.ReadInstruction('train', from_=k+10, unit='%')
          for k in range(0, 100, 10)])

    c                 C   s
   || _ d S rd   _relative_instructions)r]   relative_instructionsr!   r!   r"   _init  s   
zReadInstruction._initc                 C   s   |  | }|| |S )zCReturns ReadInstruction obj initialized with relative_instructions.)__new__r   )clsr   resultr!   r!   r"   ,_read_instruction_from_relative_instructions  s   

z<ReadInstruction._read_instruction_from_relative_instructionsNc                 C   s   |  t|||||g dS )a  Initialize ReadInstruction.

        Args:
            split_name (str): name of the split to read. Eg: 'train'.
            rounding (str, optional): The rounding behaviour to use when percent slicing is
                used. Ignored when slicing with absolute indices.
                Possible values:
                 - 'closest' (default): The specified percentages are rounded to the
                     closest value. Use this if you want specified percents to be as
                     much exact as possible.
                 - 'pct1_dropremainder': the specified percentages are treated as
                     multiple of 1%. Use this option if you want consistency. Eg:
                         len(5%) == 5 * len(1%).
                     Using this option, one might not be able to use the full set of
                     examples, if the number of those is not a multiple of 100.
            from_ (int):
            to (int): alternative way of specifying slicing boundaries. If any of
                {from_, to, unit} argument is used, slicing cannot be specified as
                string.
            unit (str): optional, one of:
                '%': to set the slicing unit as percents of the split size.
                'abs': to set the slicing unit as absolute numbers.
        N)r   r   )r]   r   r   rJ   rK   r   r!   r!   r"   r^   &  s   zReadInstruction.__init__c                 C   sL   t |}t|}|std| t|d }tdd |dd D |S )aM  Creates a `ReadInstruction` instance out of a string spec.

        Args:
            spec (`str`):
                Split(s) + optional slice(s) to read + optional rounding
                if percents are used as the slicing unit. A slice can be specified,
                using absolute numbers (`int`) or percentages (`int`).

        Examples:

            ```
            test: test split.
            test + validation: test split + validation split.
            test[10:]: test split, minus its first 10 records.
            test[:10%]: first 10% records of test split.
            test[:20%](pct1_dropremainder): first 10% records, rounded with the pct1_dropremainder rounding.
            test[:-5%]+train[40%:60%]: first 95% of test + middle 20% of train.
            ```

        Returns:
            ReadInstruction instance.
        z&No instructions could be built out of r   c                 s   s    | ]}t |V  qd S rd   )r   )r5   subr!   r!   r"   rf   `  s    z,ReadInstruction.from_spec.<locals>.<genexpr>r
   N)rC   _ADDITION_SEP_REr<   rF   r   sum)r   r   subsr.   r!   r!   r"   rG   C  s   
zReadInstruction.from_specc           
      C   s   g }| j D ]c}|j}|jd us|jd urc|j}|j}|j}|j}|dkr&|nd}|d ur2t|| nd}|d ur>t|| nd}d| d| d}|dkr[|d ur[|dkr[d| dnd}	|||	 7 }|| qd	|S )
Nr    [:]r   ()+)	r   rI   rJ   rK   r   r   rC   rL   rt   )
r]   rel_instr_specsr   rel_instr_specrJ   rK   r   r   	slice_strrounding_strr!   r!   r"   to_specb  s"   
&
zReadInstruction.to_specc                 C   sj   t |tsd}t|| j}|j}|d jdkr.|d jdkr.| jd j|d jkr.td| || S )zEReturns a new ReadInstruction obj, result of appending other to self.zAReadInstruction can only be added to another ReadInstruction obj.r   r   zPIt is forbidden to sum ReadInstruction instances with different rounding values.)rB   r/   rD   r   r   r   rF   r   )r]   otherr   self_ris	other_risr!   r!   r"   __add__v  s   
zReadInstruction.__add__c                 C   s   |   S rd   )r   r   r!   r!   r"   __str__  s   zReadInstruction.__str__c                 C   s   d| j  dS )NzReadInstruction(r   r   r   r!   r!   r"   __repr__  s   zReadInstruction.__repr__c                    s    fdd| j D S )aZ  Translate instruction into a list of absolute instructions.

        Those absolute instructions are then to be added together.

        Args:
            name2len (`dict`):
                Associating split names to number of examples.

        Returns:
            list of _AbsoluteInstruction instances (corresponds to the + in spec).
        c                    s   g | ]}t | qS r!   )r   )r5   r   rN   r!   r"   rn     r8   z/ReadInstruction.to_absolute.<locals>.<listcomp>r   )r]   rN   r!   r   r"   rH     s   zReadInstruction.to_absolute)NNNN)r   r   r   r    r   classmethodr   r^   rG   r   r   r   r   rH   r!   r!   r!   r"   r/     s    &


)NN)Gr    rq   r   rs   rer   dataclassesr   	functoolsr   pathlibr   typingr   r   r   r   pyarrowrx   pyarrow.parquetr   r   tqdm.contrib.concurrentr	   download.download_configr   namingr   r   r   r   r   r   r   utilsr   r   ru   utils.deprecation_utilsr   utils.file_utilsr   r6   r   r   r   r   
get_loggerr   loggerr   compileXr   r   ConnectionErrorr   r$   r&   rC   rW   rX   r   r   r   r   r   r   r   r   r/   r!   r!   r!   r"   <module>   s   




K /