o
    TZhBZ                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlm  mZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZm Z  d dl!m"Z" ee#Z$dZ%dd Z&G dd deZ'edG dd deZ(dS )    N)ArgumentParser)Path)Optional)config)BaseDatasetsCLICommand)DownloadConfig)DownloadManager)MockDownloadManager)dataset_module_factoryimport_main_class)
deprecated)
get_loggerset_verbosity_warning)
map_nestedzutf-8c              
   C   s*   t | j| j| j| j| j| j| j| j| j		S N)
DummyDataCommandpath_to_datasetauto_generaten_lines
json_fieldxml_tagmatch_text_fileskeep_uncompressed	cache_direncoding)args r   S/var/www/html/lang_env/lib/python3.10/site-packages/datasets/commands/dummy_data.pydummy_data_command_factory   s   r   c                       s   e Zd Z fddZ fddZ fddZ					dd	ed
ee dee dee dee de	fddZ
				ddeded	ed
ee dee dee dee defddZedefddZdd Z  ZS )!DummyDataGeneratorDownloadManagerc                    s(   t  j|i | || _g | _g | _d S r   )super__init__mock_download_managerdownloaded_dummy_pathsexpected_dummy_paths)selfr"   r   kwargs	__class__r   r   r!   *   s   
z*DummyDataGeneratorDownloadManager.__init__c                    s@   t  |}| j|}t| jj|dd t| jj|dd |S NT)Z	map_tuple)r    downloadr"   r   r#   appendr$   r%   Zurl_or_urlsoutputZdummy_outputr'   r   r   r*   0   s
   z*DummyDataGeneratorDownloadManager.downloadc                    sH   t  t  |}| j|}t| jj|dd t| jj|dd |S r)   )r    extractr*   r"   r   r#   r+   r$   r,   r'   r   r   download_and_extract7   s
   z6DummyDataGeneratorDownloadManager.download_and_extract   Nr   r   r   r   r   returnc           
      C   s   t jt j| jj| jj| jjddd d}d| j_t	| j
| jD ]!\}}t j| jj| jj| jj|}	|| j||	|||||d7 }q!|dkrLtd |dkS )N
dummy_dataTexist_okr   Fr   r   r   r   r   zDummy data generation failed: no dummy files were created. Make sure the data files format is supported by the auto-generation.)osmakedirspathjoinr"   datasets_scripts_dirdataset_namedummy_data_folderload_existing_dummy_datazipr#   r$   _create_dummy_dataloggererror)
r%   r   r   r   r   r   totalsrc_pathZrelative_dst_pathdst_pathr   r   r   auto_generate_dummy_data_folder>   s@   	
	zADummyDataGeneratorDownloadManager.auto_generate_dummy_data_folderrC   rD   c                    sX  |pt }tj|rftd|  t|j g d}t fdd|D }	|d urAtj	|}
|
dD ]
}|	t|
|O }	q6|	rt|jjddd t||dH}t|d	|d(}g }t|D ]\}}|krl n|| qb|d
|  W d    n1 sw   Y  W d    dS W d    dS 1 sw   Y  dS d v r7t||d|}t|}|d ur|| }t|trtdd | D stdt|  dfdd| D }n|d  }|d ur||i}t|jjddd t|d	|d}t|| W d    n1 sw   Y  W d    dS W d    dS 1 s0w   Y  dS t fdddD r[|d u rOtd dS | j ||||d dS td| d dS tj!|rd}t"|D ]3\}}}|D ]*}|#dstj||}tj|t|$|}|| j%||||||d7 }q{qt|S d S )Nz#Trying to generate dummy data file )z.txtz.csvz.jsonlz.tsvc                 3       | ]}| v V  qd S r   r   .0	extensiondst_path_extensionsr   r   	<genexpr>w       zGDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<genexpr>,Tr4   parentsr   w    z.jsonc                 s   s    | ]}t |tV  qd S r   )
isinstancelist)rH   vr   r   r   rL      s    zCouldn't parse columns z\. Maybe specify which json field must be used to read the data with --json_field <my_field>.c                    s   i | ]\}}||d   qS r   r   )rH   krW   )r   r   r   
<dictcomp>   s    zHDummyDataGeneratorDownloadManager._create_dummy_data.<locals>.<dictcomp>c                 3   rF   r   r   rG   rJ   r   r   rL      rM   )z.xmlz.txmzEFound xml file but 'xml_tag' is set to None. Please provide --xml_tag)r   r   zCouldn't generate dummy file 'z9'. Ignore that if this file is not useful for dummy data.r   .r5   )&DEFAULT_ENCODINGr6   r8   isfiler@   debugr   suffixesanybasenamesplitfnmatchparentmkdiropen	enumerater+   writer9   stripjsonloadrU   dictallvalues
ValueErrorrV   keysitemsdumpwarning_create_xml_dummy_dataisdirwalk
startswithrelative_tor?   )r%   rC   rD   r   r   r   r   r   Zline_by_line_extensionsZis_line_by_line_text_file	file_namepatternsrc_fileZdst_fileZfirst_linesilineZ	json_dataZfirst_json_datarB   r8   _filesnameZsrc_file_pathZdst_file_pathr   )rK   r   r   r?   h   s   






 




z4DummyDataGeneratorDownloadManager._create_dummy_datac                 C   s   t |jjddd t| |dI}d}g }tj|ddD ])\}}	|dkr*||	 q| }
|	j|krE||k r<|d7 }q|rE|d	 	|	 qtj
|	d
j||d W d    d S 1 s\w   Y  d S )NTrO   rQ   r   )startend)eventsr   rT   )element)r   rc   rd   re   ETZ	iterparser+   poptagremoveElementTreerg   )rC   rD   r   r   r   rz   Zn_linerP   eventelemr}   r   r   r   rs      s    

"z8DummyDataGeneratorDownloadManager._create_xml_dummy_datac                 C   sT   t j|| jj}t j|d}d}td| d t|d|| t	| d S )Nr2   z"Compressing dummy data folder to 'z.zip'r>   )
r6   r8   r9   r"   r<   r@   infoshutilmake_archivermtree)r%   r   root_dir	base_namebase_dirr   r   r   !compress_autogenerated_dummy_data   s   zCDummyDataGeneratorDownloadManager.compress_autogenerated_dummy_data)r0   NNNN)NNNN)__name__
__module____qualname__r!   r*   r/   intr   strboolrE   r?   staticmethodr[   rs   r   __classcell__r   r   r'   r   r   )   s\    	
/	
Tr   zThe `datasets` repository does not host the dataset scripts anymore. Therefore, dummy data is no longer needed to test their loading with CI.c                   @   s   e Zd ZedefddZdededede	e de	e d	e	e d
ede	e de	e fddZ
dd Zde	e fddZdd ZdS )r   parserc                 C   s   | j ddd}|jdddd |jdtd	d
d |jdtd dd |jdtd dd |jdtd dd |jdddd |jdtd dd |jdtd dt d |jdtdd |jtd d S )Nr2   zGenerate dummy data.)helpz--auto_generate
store_truez!Automatically generate dummy data)actionr   z	--n_linesr0   zBNumber of lines or samples to keep when auto-generating dummy data)typedefaultr   z--json_fieldzOptional, json field to read the data from when auto-generating dummy data. In the json data files, this field must point to a list of samples as json objects (ex: the 'data' field for squad-like files)z	--xml_tagz[Optional, xml tag name of the samples inside the xml files when auto-generating dummy data.z--match_text_fileszOptional, a comma separated list of file patterns that looks for line-by-line text files other than *.txt or *.csv. Example: --match_text_files *.labelz--keep_uncompressedzWhether to leave the dummy data folders uncompressed when auto-generating dummy data. Useful for debugging for to do manual adjustements before compressing.z--cache_dirzKCache directory to download and cache files when auto-generating dummy dataz
--encodingz=Encoding to use when auto-generating dummy data. Defaults to r   z/Path to the dataset (example: ./datasets/squad))r   r   )func)
add_parseradd_argumentr   r   r[   set_defaultsr   )r   Ztest_parserr   r   r   register_subcommand   sT   z$DummyDataCommand.register_subcommandr   r   r   r   r   r   r   r   r   c
           
      C   s   || _ tj|r|tjddd | _n|tjddd | _tj|p+t	j
}|| _|| _|| _|| _|| _|| _|| _|	| _d S )N/r   )_path_to_datasetr6   r8   rt   replacesepra   _dataset_name
expanduserr   ZHF_DATASETS_CACHE_auto_generate_n_lines_json_field_xml_tag_match_text_files_keep_uncompressed
_cache_dir	_encoding)
r%   r   r   r   r   r   r   r   r   r   r   r   r   r!   	  s   
zDummyDataCommand.__init__c              	   C   s>  t   t| j}t|j}|jpd g}g }t |}|D ];}|r#|jnd }|||j	|d}|r2|j
n|jj
}	t| j||	ddd}
| jrP|| j||
| jd q| j||
d q| jr}| jst|rltd| j d n!td	| j d W d    d S W d    d S W d    d S W d    d S 1 sw   Y  d S )
N)config_namehashr   TF)r;   r   versionZuse_local_dummy_datar=   )dataset_buildermock_dl_managerr   )r   r   z>Automatic dummy data generation succeeded for all configs of ''z<Automatic dummy data generation failed for some configs of ')r   r
   r   r   module_pathZBUILDER_CONFIGStempfileTemporaryDirectoryr   r   r   r   r	   r   r   r+   _autogenerate_dummy_datar   _print_dummy_data_instructionsrl   print)r%   Zdataset_moduleZbuilder_clsZbuilder_configsZauto_generate_resultsZtmp_dirZbuilder_configr   r   r   r   r   r   r   run$  sN   


"zDummyDataCommand.runr1   c              
      s  | j rtj| j tjntj}t|d}t| j	||d}|
| d|_|j| j| j| j| j| jd |stj|j|j}|| d|_i  tj|j dd z|
|}|D ]}	|j|	dd |	jj |	j< qXW n" ty }
 ztd|jj d	t|
  W Y d }
~
dS d }
~
ww td
d   D rt d|jj d dS  fdd D }t d| d|jj d dS tj| j!|j"}t#d| d d S )N)r   )r;   r"   download_configFr5   Tr3   )Zcheck_duplicate_keysz&Failed to load dummy data for config 'z''.
Original error:
c                 s   s    | ]}|d kV  qdS )r   Nr   )rH   Z
n_examplesr   r   r   rL   q  rM   z<DummyDataCommand._autogenerate_dummy_data.<locals>.<genexpr>zEDummy data generation done and dummy data test succeeded for config 'z''.c                    s   g | ]
} | d kr|qS )r   r   )rH   Z
split_nameZn_examples_per_splitr   r   
<listcomp>w  s    z=DummyDataCommand._autogenerate_dummy_data.<locals>.<listcomp>zCDummy data generation done but dummy data test failed since splits z have 0 examples for config 'z#Dummy data generated in directory 'zg' but kept uncompressed. Please compress this directory into a zip file to use it for dummy data tests.)$r   r6   r8   r9   r   ZDOWNLOADED_DATASETS_DIRZDOWNLOADED_DATASETS_PATHr   r   r   _split_generatorsr=   rE   r   r   r   r   r   r:   r;   r   r7   Z_prepare_splitZ
split_infoZnum_examplesr   OSErrorr@   rA   r   rl   rm   rr   r   r<   r   )r%   r   r   r   Zdl_cache_dirr   Z
dl_managerZpath_do_datasetZsplit_generatorsZsplit_generatoreZempty_splitsZgenerated_dummy_data_dirr   r   r   r   K  sl   





z)DummyDataCommand._autogenerate_dummy_datac                 C   s  t j| j|j}td| d t j|dd z||}W n# t	yC } zt
d| j d|j d|j d W Y d }~nd }~ww t }g }|j}|D ]h}	td	|	j  ||	j |	j}
|jd5i |
}z4d
}|jd urzd|jj dnd}|d| | j d| d| d 7 }|D ]\}}q|d| d7 }W qN t	y } z||j W Y d }~qNd }~ww d|}t|dkrt|dkrtt||kr|dtt| d| d| d7 }|}nd|}|d| d| d7 }|d| d7 }|d| d 7 }t|dkr>tt||kr>|d!| d"| d#| d$7 }|d%| d&| d$7 }|d'| d(| d)| d*7 }n'|d+| d,| d#| d-7 }|d.| d/| d$7 }|d0| d(| d)| d*7 }|d1| d2| d37 }|d47 }t
| d S )6Nz$Creating dummy folder structure for z... Tr3   zDataset z with config a   seems to already open files in the method `_split_generators(...)`. You might consider to instead only open files in the method `_generate_examples(...)` instead. If this is not possible the dummy data has to be created with less guidance. Make sure you create the file rZ   z/Collecting dummy data file paths to create for zU
==============================DUMMY DATA INSTRUCTIONS==============================
zconfig z of rS   z(- In order to create the dummy data for z, please go into the folder 'z' with `cd z` . 

za- It appears that the function `_generate_examples(...)` expects one or more files in the folder z using the function `glob.glob(...)`. In this case, please refer to the `_generate_examples(...)` method to see under which filename the dummy data files should be created. 

z, r   rT   z1- Please create a single dummy data file called 'z' from the folder 'zV'. Make sure that the dummy data file provides at least one example for the split(s) 'z' 

z0- Please create the following dummy data files 'z'

z- For each of the splits 'zU', make sure that one or more of the dummy data files provide at least one example 

z- If the method `_generate_examples(...)` includes multiple `open()` statements, you might have to create other files in addition to 'zG'. In this case please refer to the `_generate_examples(...)` method 

z@- After the dummy data file is created, it should be zipped to 'z.zip' with the command `zip z.zip z` 

z- You can now delete the file 'z' with the command `rm z- To get the file 'z;' back for further changes to the dummy data, simply unzip z.zip with the command `unzip z.zip` 

zP- After all dummy data files are created, they should be zipped recursively to 'z.zip' with the command `zip -r z/` 

z!- You can now delete the folder 'z' with the command `rm -r z- To get the folder 'z'- Make sure you have created the file 'z
.zip' in 'z' 
zT===================================================================================
r   )r6   r8   r9   r   r<   r@   r   r7   r   FileNotFoundErrorr   r   r   filenamesetdummy_file_namer   r+   
gen_kwargsZ_generate_examplesaddlennextiter)r%   r   r   r<   Zgenerator_splitsr   Zfiles_to_createZsplit_namesr   ra   r   	generatorZdummy_data_guidance_printZconfig_stringkeyrecordZfiles_stringr   r   r   r     s|   
$
 z/DummyDataCommand._print_dummy_data_instructionsN)r   r   r   r   r   r   r   r   r   r   r!   r   r   r   r   r   r   r   r      s2    ,	

':r   ))rb   ri   r6   r   r   Zxml.etree.ElementTreeetreer   r   argparser   pathlibr   typingr   Zdatasetsr   Zdatasets.commandsr   Z!datasets.download.download_configr   Z"datasets.download.download_managerr   Z'datasets.download.mock_download_managerr	   Zdatasets.loadr
   r   Z datasets.utils.deprecation_utilsr   Zdatasets.utils.loggingr   r   Zdatasets.utils.py_utilsr   r   r@   r[   r   r   r   r   r   r   r   <module>   s6     0