o
    TZhc7                     @   s~  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ eeZG d
d dZG dd de
ZG dd dee
ZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&G d d! d!eZ'G d"d# d#Z(dS )$    N)ABCabstractmethod)Path)DictListOptionalTypeUnion   )config   )FileLock)
get_loggerc                   @   s`   e Zd Zddee fddZdedefddZd	ed
edefddZdded
edefddZ	dS )ExtractManagerN	cache_dirc                 C   s&   |r
t j|tjntj| _t| _d S N)	ospathjoinr   ZEXTRACTED_DATASETS_DIRZEXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr    r   M/var/www/html/lang_env/lib/python3.10/site-packages/datasets/utils/extract.py__init__   s   
zExtractManager.__init__r   returnc                 C   s,   ddl m} tj|}tj| j||S )Nr   )hash_url_to_filename)Z
file_utilsr   r   r   abspathr   r   )r   r   r   abs_pathr   r   r   _get_output_path   s   zExtractManager._get_output_pathoutput_pathforce_extractc                 C   s*   |pt j| ot j|ot | S r   )r   r   isfileisdirlistdir)r   r!   r"   r   r   r   _do_extract%   s   $zExtractManager._do_extractF
input_pathc                 C   s>   | j |}|s
|S | |}| ||r| j ||| |S r   )r   infer_extractor_formatr    r&   extract)r   r'   r"   extractor_formatr!   r   r   r   r)   *   s   
zExtractManager.extractr   F)
__name__
__module____qualname__r   strr   r    boolr&   r)   r   r   r   r   r      s
    r   c                   @   s\   e Zd Zeedeeef defddZ	e
edeeef deeef ddfdd	ZdS )
BaseExtractorr   r   c                 K      d S r   r   clsr   kwargsr   r   r   is_extractable5      zBaseExtractor.is_extractabler'   r!   Nc                 C   r2   r   r   )r'   r!   r   r   r   r)   9   r7   zBaseExtractor.extract)r,   r-   r.   classmethodr   r	   r   r/   r0   r6   staticmethodr)   r   r   r   r   r1   4   s    .r1   c                   @   s`   e Zd ZU g Zee ed< edee	e
f defddZeddee	e
f dedefd	d
ZdS )MagicNumberBaseExtractormagic_numbersr   magic_number_lengthc                 C   s8   t | d}||W  d    S 1 sw   Y  d S )Nrb)openread)r   r<   fr   r   r   read_magic_numberA   s   $z*MagicNumberBaseExtractor.read_magic_number    magic_numberr   c                    sV    st dd | jD }z| || W n
 ty   Y dS w t fdd| jD S )Nc                 s   s    | ]}t |V  qd S r   )len.0Zcls_magic_numberr   r   r   	<genexpr>I   s    z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>Fc                 3   s    | ]}  |V  qd S r   )
startswithrE   rC   r   r   rG   N   s    )maxr;   rA   OSErrorany)r4   r   rC   r<   r   rI   r   r6   F   s   z'MagicNumberBaseExtractor.is_extractableNrB   )r,   r-   r.   r;   r   bytes__annotations__r9   r	   r   r/   intrA   r8   r0   r6   r   r   r   r   r:   >   s   
 &r:   c                   @   s`   e Zd Zedeeef defddZe	dd Z
e	deeef deeef dd	fd
dZd	S )TarExtractorr   r   c                 K   s
   t |S r   )tarfile
is_tarfiler3   r   r   r   r6   R   s   
zTarExtractor.is_extractablec                 #   s    dt dt fdddt dt dtffdd dt dtf fdd	}|}| D ]D} |j|r<td
|j d q)| rS|||rStd
|j d|j  q)| rj|||rjtd
|j d|j  q)|V  q)dS )a  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309
        r   r   c                 S   s   t jt j| S r   )r   r   realpathr   )r   r   r   r   resolvedb   s   z*TarExtractor.safemembers.<locals>.resolvedbasec                    s    t j|| | S r   )r   r   r   rH   )r   rV   )rU   r   r   badpathe   s   z)TarExtractor.safemembers.<locals>.badpathc                    s*   t j|t j| j} | j|dS )N)rV   )r   r   r   dirnamenamelinkname)inforV   ZtiprW   rU   r   r   badlinki   s   z)TarExtractor.safemembers.<locals>.badlinkzExtraction of z is blocked (illegal path)z is blocked: Symlink to z is blocked: Hard link to N)r/   r0   rY   loggererrorissymrZ   islnk)membersr!   r]   rV   Zfinfor   r\   r   safemembersV   s   zTarExtractor.safemembersr'   r!   Nc                 C   s:   t j|dd t| }|j|t||d |  d S )NTexist_ok)rb   )r   makedirsrR   r>   
extractallrQ   rc   close)r'   r!   Ztar_filer   r   r   r)   z   s   
zTarExtractor.extract)r,   r-   r.   r8   r	   r   r/   r0   r6   r9   rc   r)   r   r   r   r   rQ   Q   s    
#,rQ   c                   @   <   e Zd ZdgZedeeef deeef ddfddZdS )GzipExtractors   r'   r!   r   Nc              	   C   x   t | d,}t|d}t|| W d    n1 sw   Y  W d    d S W d    d S 1 s5w   Y  d S Nr=   wb)gzipr>   shutilcopyfileobj)r'   r!   Z	gzip_fileextracted_filer   r   r   r)         "zGzipExtractor.extract	r,   r-   r.   r;   r9   r	   r   r/   r)   r   r   r   r   rj          ,rj   c                       sj   e Zd Zg dZeddeeef dede	f fddZ
edeeef d	eeef dd
fddZ  ZS )ZipExtractor)s   PKs   PKs   PKrB   r   rC   r   c                    s  t  j||dr
dS zddlm}m}m}m}m}m}m	}	m
}
m}m} t|d}|	|}|r|| dkrK|| dkrK|| dkrK	 W d    W dS || || kr|||  | || kr|| |
kr||
}t||
krt||}|| |kr	 W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS 1 sw   Y  W dS  ty   Y dS w )NrI   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirr=   F)superr6   zipfilerv   rw   rx   ry   rz   r{   r|   r}   r~   r   r>   seektellr?   rD   structunpack	Exception)r4   r   rC   rv   rw   rx   ry   rz   r{   r|   r}   r~   r   fpendrecdatacentdir	__class__r   r   r6      sT   0$






zZipExtractor.is_extractabler'   r!   Nc                 C   sR   t j|dd t| d}|| |  W d    d S 1 s"w   Y  d S )NTrd   r)r   rf   r   ZipFilerg   rh   )r'   r!   zip_filer   r   r   r)      s
   

"zZipExtractor.extractrM   )r,   r-   r.   r;   r8   r	   r   r/   rN   r0   r6   r9   r)   __classcell__r   r   r   r   ru      s    &$0ru   c                   @   ri   )XzExtractors   7zXZ r'   r!   r   Nc              	   C   sv   t | ,}t|d}t|| W d    n1 sw   Y  W d    d S W d    d S 1 s4w   Y  d S )Nrm   )lzmar>   ro   rp   r'   r!   compressed_filerq   r   r   r   r)      s   "zXzExtractor.extractrs   r   r   r   r   r      rt   r   c                   @   s>   e Zd ZddgZedeeef deeef ddfddZdS )	RarExtractors   Rar! s   Rar! r'   r!   r   Nc                 C   sD   t jstddd l}tj|dd || }|| |  d S )NzPlease pip install rarfiler   Trd   )	r   ZRARFILE_AVAILABLEImportErrorrarfiler   rf   ZRarFilerg   rh   )r'   r!   r   rfr   r   r   r)      s   

zRarExtractor.extractrs   r   r   r   r   r      s    ,r   c                   @   ri   )ZstdExtractors   (/r'   r!   r   Nc              	   C   s   t jstddd l}| }t| d,}t|d}||| W d    n1 s+w   Y  W d    d S W d    d S 1 sCw   Y  d S )NzPlease pip install zstandardr   r=   rm   )r   ZZSTANDARD_AVAILABLEr   Z	zstandardZZstdDecompressorr>   copy_stream)r'   r!   zstdZdctxZifhZofhr   r   r   r)      s   PzZstdExtractor.extractrs   r   r   r   r   r      rt   r   c                   @   ri   )Bzip2Extractors   BZhr'   r!   r   Nc              	   C   rk   rl   )bz2r>   ro   rp   r   r   r   r   r)      rr   zBzip2Extractor.extractrs   r   r   r   r   r      rt   r   c                   @   ri   )SevenZipExtractors   7z'r'   r!   r   Nc                 C   s`   t jstddd l}tj|dd || d}|| W d    d S 1 s)w   Y  d S )NzPlease pip install py7zrr   Trd   r   )r   ZPY7ZR_AVAILABLEr   py7zrr   rf   ZSevenZipFilerg   )r'   r!   r   archiver   r   r   r)      s   "zSevenZipExtractor.extractrs   r   r   r   r   r      rt   r   c                   @   ri   )Lz4Extractors   "Mr'   r!   r   Nc              	   C   s   t jstddd l}|j| d,}t|d}t|| W d    n1 s)w   Y  W d    d S W d    d S 1 sAw   Y  d S )NzPlease pip install lz4r   r=   rm   )r   ZLZ4_AVAILABLEr   Z	lz4.frameframer>   ro   rp   )r'   r!   lz4r   rq   r   r   r   r)     s   "zLz4Extractor.extractrs   r   r   r   r   r      rt   r   c                   @   s   e Zd ZU eeeeeee	e
ed	Zeeee f ed< edd Zedeeef defddZeddeeef d
edefddZedeeef defddZe		ddeeef deeef dee dee ddf
ddZdS )r   )	tarrn   zipxzZrarr   r   Z7zr   
extractorsc                 C   s   t dd | j D S )Nc                 s   s.    | ]}t |tr|jD ]}t|V  qqd S r   )
issubclassr:   r;   rD   )rF   r   Zextractor_magic_numberr   r   r   rG     s    z9Extractor._get_magic_number_max_length.<locals>.<genexpr>)rJ   r   values)r4   r   r   r   _get_magic_number_max_length  s   z&Extractor._get_magic_number_max_lengthr   r<   c                 C   s&   zt j| |dW S  ty   Y dS w )N)r<   rB   )r:   rA   rK   )r   r<   r   r   r   _read_magic_number$  s
   zExtractor._read_magic_numberFreturn_extractorr   c                 C   s>   t jdtd | |}|r|sdS d| j| fS |sdS dS )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.categoryTF)FN)warningswarnFutureWarningr(   r   )r4   r   r   r*   r   r   r   r6   +  s   
zExtractor.is_extractablec                 C   sB   |   }| ||}| j D ]\}}|j||dr|  S qd S )NrI   )r   r   r   itemsr6   )r4   r   Zmagic_number_max_lengthrC   r*   r   r   r   r   r(   7  s   z Extractor.infer_extractor_formatN
deprecatedr'   r!   r*   r   c                 C   s  t jt j|dd tt|d}t|d tj	|dd |s&|dkrQ|dks/t
|ts?tjdtd |dkr<|n|}n| j| }|||W  d    S tjdtd | j D ]}||rs|||  W  d    S q]W d    d S 1 sw   Y  d S )	NTrd   z.lock)ignore_errorsr   zsParameter 'extractor' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'extractor_format' instead.r   ztParameter 'extractor_format' was made required in version 2.4.0 and not passing it will raise an exception in 3.0.0.)r   rf   r   rX   r/   r   with_suffixr   ro   rmtree
isinstancer   r   r   r   r)   r   r6   )r4   r'   r!   r*   r   Z	lock_pathr   r   r   r)   ?  s2   



"zExtractor.extractr+   )Nr   )r,   r-   r.   rQ   rj   ru   r   r   r   r   r   r   r   r   r/   r   r1   rO   r8   r   r9   r	   r   rP   r   r0   r6   r(   r   r)   r   r   r   r   r     sB   
 
"

r   ))r   rn   r   r   ro   r   rR   r   r   abcr   r   pathlibr   typingr   r   r   r   r	    r   Z	_filelockr   loggingr   r,   r^   r   r1   r:   rQ   rj   ru   r   r   r   r   r   r   r   r   r   r   r   <module>   s:    
1
4

