o
    TZh@"                     @   sT  d dl Z d dlZd dlmZ d dlmZmZmZmZ d dl	Z
d dlZd dlZejjeZG dd dejZg dZee_g dZee_defd	d
ZdefddZdefddZdefddZdefddZdefddZi dededededededededejdejded ed!ed"ed#ed$ed%eZ e e_ dS )&    N)islice)AnyCallableDictListc                   @   st   e Zd ZU dZee ed< ee ed< eeee	ge	f f ed< dZ
edd Zdejfd	d
Zdd Zdd ZdS )
WebDatasetd   IMAGE_EXTENSIONSAUDIO_EXTENSIONSDECODERS   c                 c   s    i }|D ]=\}}d|v rB| dd\}}|r"|d |kr"|V  i }||d< ||d< | || < || jv rB| j| || ||< q|rJ|V  d S d S )N.   __key____url__)splitreadlowerr   )clstar_pathtar_iteratorZcurrent_examplefilenamefZexample_key
field_name r   f/var/www/html/lang_env/lib/python3.10/site-packages/datasets/packaged_modules/webdataset/webdataset.py_get_pipeline_from_tar   s"   

z!WebDataset._get_pipeline_from_tarreturnc                 C   s   t  S )N)datasetsDatasetInfo)selfr   r   r   _info'   s   zWebDataset._infoc                    s  | j jstd| j j  | j j}t|tttfr=|}t|tr&|g} fdd|D }tj	tj
j||ddg}n)g }| D ]"\}}t|trO|g} fdd|D }|tj	|||dd qC| jjs| |d |d }tt|| jtfdd	D rtd
dd D }tj jjdk rtj|ddj}	ntj|ddj}	tj|	}
d D ]}|ddd }|| jv rt |
|< qd D ]}|ddd }|| jv rt |
|< q|
| j_|S )z-We handle string, list and dicts in datafilesz=At least one data file must be specified, but got data_files=c                       g | ]}  |qS r   Ziter_archive.0r   
dl_managerr   r   
<listcomp>4       z0WebDataset._split_generators.<locals>.<listcomp>)	tar_pathstar_iterators)nameZ
gen_kwargsc                    r"   r   r#   r$   r&   r   r   r(   ?   r)   r   c                 3   s$    | ]}|   d    kV  qdS )r   N)keysr%   example)first_examplesr   r   	<genexpr>I   s   " z/WebDataset._split_generators.<locals>.<genexpr>zThe TAR archives of the dataset should be in WebDataset format, but the files in the archive don't share the same prefix or the same types.c                 S   s   g | ]	}t j|gqS r   )paTableZfrom_pylistr.   r   r   r   r(   N   s       T)Zpromotedefault)Zpromote_optionsr   r   ) config
data_files
ValueErrordownload
isinstancestrlisttupler   ZSplitGeneratorZSplitZTRAINitemsappendinfofeaturesr   r   #NUM_EXAMPLES_FOR_FEATURES_INFERENCEanyZPYARROW_VERSIONmajorr2   Zconcat_tablesZschemaZFeaturesZfrom_arrow_schemarsplitr	   Imager
   Audio)r    r'   r8   r*   r+   ZsplitsZ
split_nameZpipelineZ	pa_tablesZinferred_arrow_schemarB   r   	extensionr   )r'   r0   r   _split_generators*   s\   




zWebDataset._split_generatorsc                 c   s    dd | j j D }dd | j j D }tt||D ]3\}\}}t| ||D ]$\}}	|| D ]}
|	d d |
 |	|
 d|	|
< q4| d| |	fV  q,qd S )Nc                 S       g | ]\}}t |tjr|qS r   )r;   r   rG   r%   r   featurer   r   r   r(   d   
    z1WebDataset._generate_examples.<locals>.<listcomp>c                 S   rK   r   )r;   r   rH   rL   r   r   r   r(   g   rN   r   r   )pathbytes_)rA   rB   r?   	enumeratezipr   )r    r*   r+   Zimage_field_namesZaudio_field_namesZtar_idxr   r   Zexample_idxr/   r   r   r   r   _generate_examplesc   s   

 zWebDataset._generate_examplesN)__name__
__module____qualname__ZDEFAULT_WRITER_BATCH_SIZEr   r<   __annotations__r   r   r   rC   classmethodr   r   r   r!   rJ   rT   r   r   r   r   r      s   
 
9r   )?ZblpZbmpZdibZbufrcurZpcxZdcxZddsZpsZepsfitZfitsZfliZflcZftcZftuZgbrZgifZgribZh5ZhdfZpngZapngZjp2Zj2kZjpcZjpfZjpxZj2cZicnsZicoZimZiimZtifZtiffZjfifZjpeZjpgZjpegZmpgZmpegZmspZpcdZpxrZpbmZpgmppmZpnmZpsdbwrgbZrgbaZsgiZrasZtgaZicbZvdaZvstZwebpZwmfZemfZxbmZxpm)ZaiffauZavrZcafZflacZhtkZsvxZmat4Zmat5Zmpc2kZoggZpafZpvfrawZrf64Zsd2ZsdsZircamZvocZw64ZwavZnistZwavexZwvexiZmp3Zopusdatac                 C   s
   |  dS )Nzutf-8)decoderb   r   r   r   
text_loads   s   
re   c                 C   s   ddl m} || S )Nr   )_tenbin) rf   Zdecode_buffer)rb   rf   r   r   r   tenbin_loads   s   
rh   c                 C      dd l }|| S Nr   )msgpackunpackb)rb   rk   r   r   r   msgpack_loads      
rm   c                 C   s$   dd l }t| }|jjj|ddS )Nr   FZallow_pickle)Znumpy.lib.formatioBytesIOlibformatZ
read_array)rb   numpystreamr   r   r   	npy_loads   s   
rv   c                 C   s   t jt| ddS )NFro   )nploadrp   rq   rd   r   r   r   	npz_loads  s   ry   c                 C   ri   rj   )cborloads)rb   rz   r   r   r   
cbor_loads  rn   r|   txttextZ
transcriptr   Zcls2indexZinxidjsonZjsntentbmpmsgZnpyZnpzrz   )!rp   r   	itertoolsr   typingr   r   r   r   rt   rw   Zpyarrowr2   r   utilsloggingZ
get_loggerrU   loggerZGeneratorBasedBuilderr   r	   r
   rP   re   rh   rm   rv   ry   r|   intr{   r   r   r   r   r   <module>   sn    nA	

