o
    +if                     @   s   d dl Z d dlmZmZmZ d dlZd dlZd dlm	Z
 ddlmZmZmZmZmZmZmZ ddlmZmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lm Z m!Z! ddl"m#Z# dedee$ fddZ%G dd de#Z&G dd dZ'dS )    N)BinaryIOOptionalUnion   )AudioDatasetFeaturesImage
NamedSplitValueconfig)FeatureType_visit)query_table)_PACKAGED_DATASETS_MODULES)Parquet)tqdm)NestedDataStructureLikePathLike   )AbstractDatasetReaderfeaturesreturnc                    s8   t j dtddf fdd}t| |  t ju rdS  S )a  
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is 1,000 but we lower it to 100 for image datasets.
    This allows to optimize random access to parquet file, since accessing 1 row requires
    to read its entire row group.

    This can be improved to get optimized size for querying/iterating
    but at least it matches the dataset viewer expectations on HF.

    Args:
        ds_config_info (`datasets.info.DatasetInfo`):
            Dataset info from `datasets`.
    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default.
    featurer   Nc                    s`   t | trt tj d S t | trt tj d S t | tr,| jdkr.t tj	 d S d S d S )Nbinary)

isinstancer	   minr   )PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETSr   )PARQUET_ROW_GROUP_SIZE_FOR_AUDIO_DATASETSr   dtype*PARQUET_ROW_GROUP_SIZE_FOR_BINARY_DATASETS)r   
batch_size L/var/www/html/corbot_env/lib/python3.10/site-packages/datasets/io/parquet.pyset_batch_size'   s   

z-get_writer_batch_size.<locals>.set_batch_size)npinfr   r   )r   r%   r#   r!   r$   get_writer_batch_size   s   
	r(   c                       s`   e Zd Z						ddee dee dee dede	de	d	ee
 f fd
dZdd Z  ZS )ParquetDatasetReaderNFpath_or_pathssplitr   	cache_dirkeep_in_memory	streamingnum_procc           
   	      sd   t  j|f||||||d| t|tr|n| j|i}td d }	td||||	d|| _d S )N)r+   r   r,   r-   r.   r/   parquetr   )r,   
data_filesr   hashr#   )super__init__r   dictr+   r   r   builder)
selfr*   r+   r   r,   r-   r.   r/   kwargsr2   	__class__r#   r$   r4   6   s,   
zParquetDatasetReader.__init__c                 C   s\   | j r| jj| jd}|S d }d }d }d }| jj||||| jd | jj| j|| jd}|S )N)r+   )download_configdownload_modeverification_mode	base_pathr/   )r+   r=   	in_memory)r.   r6   as_streaming_datasetr+   download_and_preparer/   
as_datasetr-   )r7   datasetr;   r<   r=   r>   r#   r#   r$   readU   s$   
zParquetDatasetReader.read)NNNFFN)__name__
__module____qualname__r   r   r   r
   r   strboolintr4   rD   __classcell__r#   r#   r9   r$   r)   5   s.    r)   c                
   @   s`   e Zd Z		ddedeeef dee dee	 fddZ
defd	d
ZdededefddZdS )ParquetDatasetWriterNrC   path_or_bufr"   storage_optionsc                 K   s0   || _ || _|pt|j| _|pi | _|| _d S )N)rC   rM   r(   r   r"   rN   parquet_writer_kwargs)r7   rC   rM   r"   rN   rO   r#   r#   r$   r4   n   s
   

zParquetDatasetWriter.__init__r   c                 C   s   | j r| j ntj}t| jtttjfr@t	j
| jdfi | jpi }| jd||d| j}W d    |S 1 s9w   Y  |S | jd| j|d| j}|S )Nwb)file_objr"   r#   )r"   r   DEFAULT_MAX_BATCH_SIZEr   rM   rH   bytesosr   fsspecopenrN   _writerO   )r7   r"   bufferwrittenr#   r#   r$   write|   s   
zParquetDatasetWriter.writerQ   c           
      K   s   d}| dd}| jjj}tj|fd|i|}ttdt| j|dddD ]}t	| jj
t||| | jjd}	||	 ||	j7 }q&|  |S )	zWrites the pyarrow table as Parquet to a binary file handle.

        Caller is responsible for opening and closing the handle.
        r   rM   Nschemabaz"Creating parquet from Arrow format)unitdesc)tablekeyindices)poprC   r   arrow_schemapqParquetWriterhf_tqdmrangelenr   _dataslice_indiceswrite_tablenbytesclose)
r7   rQ   r"   rO   rY   _r[   writeroffsetbatchr#   r#   r$   rW      s$   


zParquetDatasetWriter._write)NN)rE   rF   rG   r   r   r   r   r   rJ   r5   r4   rZ   rW   r#   r#   r#   r$   rL   m   s    


rL   )(rT   typingr   r   r   rU   numpyr&   pyarrow.parquetr0   rd    r   r   r   r	   r
   r   r   features.featuresr   r   
formattingr   packaged_modulesr    packaged_modules.parquet.parquetr   utilsr   rf   utils.typingr   r   abcr   rJ   r(   r)   rL   r#   r#   r#   r$   <module>   s     $#8