o
    TZhb                     @   s   d dl Z d dlmZmZmZ d dlZd dlmZ	 ddl
mZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZm Z  ddl!m"Z" dedee# fddZ$G dd de"Z%G dd dZ&dS )    N)BinaryIOOptionalUnion   )AudioDatasetFeaturesImage
NamedSplitValueconfig)FeatureType_visit)query_table)_PACKAGED_DATASETS_MODULES)Parquet)tqdm)NestedDataStructureLikePathLike   )AbstractDatasetReaderfeaturesreturnc                    s8   t j dtddf fdd}t| |  t ju rdS  S )a  
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is 1,000 but we lower it to 100 for image datasets.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        ds_config_info (`datasets.info.DatasetInfo`):
            Dataset info from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default.
    featurer   Nc                    s`   t | trt tj d S t | trt tj d S t | tr,| jdkr.t tj	 d S d S d S )Nbinary)

isinstancer	   minr   Z)PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETSr   Z)PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETSr   ZdtypeZ*PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS)r   
batch_size J/var/www/html/lang_env/lib/python3.10/site-packages/datasets/io/parquet.pyset_batch_size&   s   

z-get_writer_batch_size.<locals>.set_batch_size)npinfr   r   )r   r!   r   r   r    get_writer_batch_size   s   
	r$   c                       s`   e Zd Z						ddee dee dee dede	de	d	ee
 f fd
dZdd Z  ZS )ParquetDatasetReaderNFpath_or_pathssplitr   	cache_dirkeep_in_memory	streamingnum_procc           
   	      sd   t  j|f||||||d| t|tr|n| j|i}td d }	td||||	d|| _d S )N)r'   r   r(   r)   r*   r+   parquetr   )r(   Z
data_filesr   hashr   )super__init__r   dictr'   r   r   builder)
selfr&   r'   r   r(   r)   r*   r+   kwargsr-   	__class__r   r    r/   5   s,   
zParquetDatasetReader.__init__c                 C   s\   | j r| jj| jd}|S d }d }d }d }| jj||||| jd | jj| j|| jd}|S )N)r'   )download_configdownload_modeverification_mode	base_pathr+   )r'   r8   Z	in_memory)r*   r1   Zas_streaming_datasetr'   Zdownload_and_preparer+   Z
as_datasetr)   )r2   datasetr6   r7   r8   r9   r   r   r    readT   s$   
zParquetDatasetReader.read)NNNFFN)__name__
__module____qualname__r   r   r   r
   r   strboolintr/   r;   __classcell__r   r   r4   r    r%   4   s.    r%   c                   @   sV   e Zd Z	ddedeeef dee fddZ	defdd	Z
d
ededefddZdS )ParquetDatasetWriterNr:   path_or_bufr   c                 K   s&   || _ || _|pt|j| _|| _d S N)r:   rD   r$   r   r   parquet_writer_kwargs)r2   r:   rD   r   rF   r   r   r    r/   n   s   
zParquetDatasetWriter.__init__r   c                 C   s   | j r| j ntj}t| jtttjfr8t	| jd}| j
d||d| j}W d    |S 1 s1w   Y  |S | j
d| j|d| j}|S )Nzwb+)file_objr   r   )r   r   ZDEFAULT_MAX_BATCH_SIZEr   rD   r?   bytesosr   open_writerF   )r2   r   bufferwrittenr   r   r    writez   s   
zParquetDatasetWriter.writerG   c           
      K   s   d}| dd}| jjj}tj|fd|i|}ttdt| j|dddD ]}t	| jj
t||| | jjd}	||	 ||	j7 }q&|  |S )	zWrites the pyarrow table as Parquet to a binary file handle.

        Caller is responsible for opening and closing the handle.
        r   rD   Nschemabaz"Creating parquet from Arrow format)unitZdesc)tablekeyindices)popr:   r   Zarrow_schemapqZParquetWriterhf_tqdmrangelenr   _datasliceZ_indicesZwrite_tablenbytesclose)
r2   rG   r   rF   rM   _rO   writeroffsetbatchr   r   r    rK      s$   


zParquetDatasetWriter._writerE   )r<   r=   r>   r   r   r   r   r   rA   r/   rN   rK   r   r   r   r    rC   m   s    


rC   )'rI   typingr   r   r   numpyr"   Zpyarrow.parquetr,   rV    r   r   r   r	   r
   r   r   Zfeatures.featuresr   r   Z
formattingr   Zpackaged_modulesr   Z packaged_modules.parquet.parquetr   utilsr   rW   Zutils.typingr   r   abcr   rA   r$   r%   rC   r   r   r   r    <module>   s    $#9