o
    +ifm7                     @   s~  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	m
Z
mZ d dlmZ d dlmZmZmZmZmZ ddlmZ ddlmZ dd	lmZ eeZG d
d dZG dd de
ZG dd dee
ZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%G dd deZ&G d d! d!eZ'G d"d# d#Z(dS )$    N)ABCabstractmethod)Path)DictListOptionalTypeUnion   )config   )FileLock)
get_loggerc                   @   s`   e Zd Zddee fddZdedefddZd	ed
edefddZdded
edefddZ	dS )ExtractManagerN	cache_dirc                 C   s&   |r
t j|tjntj| _t| _d S N)	ospathjoinr   EXTRACTED_DATASETS_DIREXTRACTED_DATASETS_PATHextract_dir	Extractor	extractor)selfr    r   O/var/www/html/corbot_env/lib/python3.10/site-packages/datasets/utils/extract.py__init__   s   
zExtractManager.__init__r   returnc                 C   s,   ddl m} tj|}tj| j||S )Nr   )hash_url_to_filename)
file_utilsr   r   r   abspathr   r   )r   r   r   abs_pathr   r   r   _get_output_path   s   zExtractManager._get_output_pathoutput_pathforce_extractc                 C   s*   |pt j| ot j|ot | S r   )r   r   isfileisdirlistdir)r   r$   r%   r   r   r   _do_extract%   s   $zExtractManager._do_extractF
input_pathc                 C   s>   | j |}|s
|S | |}| ||r| j ||| |S r   )r   infer_extractor_formatr#   r)   extract)r   r*   r%   extractor_formatr$   r   r   r   r,   *   s   
zExtractManager.extractr   F)
__name__
__module____qualname__r   strr   r#   boolr)   r,   r   r   r   r   r      s
    r   c                   @   s\   e Zd Zeedeeef defddZ	e
edeeef deeef ddfdd	ZdS )
BaseExtractorr   r   c                 K      d S r   r   clsr   kwargsr   r   r   is_extractable5      zBaseExtractor.is_extractabler*   r$   Nc                 C   r5   r   r   )r*   r$   r   r   r   r,   9   r:   zBaseExtractor.extract)r/   r0   r1   classmethodr   r	   r   r2   r3   r9   staticmethodr,   r   r   r   r   r4   4   s    .r4   c                   @   s`   e Zd ZU g Zee ed< edee	e
f defddZeddee	e
f dedefd	d
ZdS )MagicNumberBaseExtractormagic_numbersr   magic_number_lengthc                 C   s8   t | d}||W  d    S 1 sw   Y  d S )Nrb)openread)r   r?   fr   r   r   read_magic_numberA   s   $z*MagicNumberBaseExtractor.read_magic_number    magic_numberr   c                    sV    st dd | jD }z| || W n
 ty   Y dS w t fdd| jD S )Nc                 s   s    | ]}t |V  qd S r   )len.0cls_magic_numberr   r   r   	<genexpr>I   s    z:MagicNumberBaseExtractor.is_extractable.<locals>.<genexpr>Fc                 3   s    | ]}  |V  qd S r   )
startswithrH   rF   r   r   rK   N   s    )maxr>   rD   OSErrorany)r7   r   rF   r?   r   rM   r   r9   F   s   z'MagicNumberBaseExtractor.is_extractableNrE   )r/   r0   r1   r>   r   bytes__annotations__r<   r	   r   r2   intrD   r;   r3   r9   r   r   r   r   r=   >   s   
 &r=   c                   @   s`   e Zd Zedeeef defddZe	dd Z
e	deeef deeef dd	fd
dZd	S )TarExtractorr   r   c                 K   s
   t |S r   )tarfile
is_tarfiler6   r   r   r   r9   R   s   
zTarExtractor.is_extractablec                 #   s    dt dt fdddt dt dtffdd dt dtf fdd	}|}| D ]D} |j|r<td
|j d q)| rS|||rStd
|j d|j  q)| rj|||rjtd
|j d|j  q)|V  q)dS )a  
        Fix for CVE-2007-4559
        Desc:
            Directory traversal vulnerability in the (1) extract and (2) extractall functions in the tarfile
            module in Python allows user-assisted remote attackers to overwrite arbitrary files via a .. (dot dot)
            sequence in filenames in a TAR archive, a related issue to CVE-2001-1267.
        See: https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2007-4559
        From: https://stackoverflow.com/a/10077309
        r   r   c                 S   s   t jt j| S r   )r   r   realpathr!   )r   r   r   r   resolvedb   s   z*TarExtractor.safemembers.<locals>.resolvedbasec                    s    t j|| | S r   )r   r   r   rL   )r   rZ   )rY   r   r   badpathe   s   z)TarExtractor.safemembers.<locals>.badpathc                    s*   t j|t j| j} | j|dS )N)rZ   )r   r   r   dirnamenamelinkname)inforZ   tipr[   rY   r   r   badlinki   s   z)TarExtractor.safemembers.<locals>.badlinkzExtraction of z is blocked (illegal path)z is blocked: Symlink to z is blocked: Hard link to N)r2   r3   r]   loggererrorissymr^   islnk)membersr$   rb   rZ   finfor   ra   r   safemembersV   s   zTarExtractor.safemembersr*   r$   Nc                 C   s:   t j|dd t| }|j|t||d |  d S )NTexist_ok)rg   )r   makedirsrV   rA   
extractallrU   ri   close)r*   r$   tar_filer   r   r   r,   z   s   
zTarExtractor.extract)r/   r0   r1   r;   r	   r   r2   r3   r9   r<   ri   r,   r   r   r   r   rU   Q   s    
#,rU   c                   @   <   e Zd ZdgZedeeef deeef ddfddZdS )GzipExtractors   r*   r$   r   Nc              	   C   x   t | d,}t|d}t|| W d    n1 sw   Y  W d    d S W d    d S 1 s5w   Y  d S Nr@   wb)gziprA   shutilcopyfileobj)r*   r$   	gzip_fileextracted_filer   r   r   r,         "zGzipExtractor.extract	r/   r0   r1   r>   r<   r	   r   r2   r,   r   r   r   r   rq          ,rq   c                       sj   e Zd Zg dZeddeeef dede	f fddZ
edeeef d	eeef dd
fddZ  ZS )ZipExtractor)s   PKs   PKs   PKrE   r   rF   r   c                    s  t  j||dr
dS zddlm}m}m}m}m}m}m	}	m
}
m}m} t|d}|	|}|r|| dkrK|| dkrK|| dkrK	 W d    W dS || || kr|||  | || kr|| |
kr||
}t||
krt||}|| |kr	 W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS W d    W dS 1 sw   Y  W dS  ty   Y dS w )NrM   Tr   )
_CD_SIGNATURE_ECD_DISK_NUMBER_ECD_DISK_START_ECD_ENTRIES_TOTAL_ECD_OFFSET	_ECD_SIZE_EndRecDatasizeCentralDirstringCentralDirstructCentralDirr@   F)superr9   zipfiler~   r   r   r   r   r   r   r   r   r   rA   seektellrB   rG   structunpack	Exception)r7   r   rF   r~   r   r   r   r   r   r   r   r   r   fpendrecdatacentdir	__class__r   r   r9      sT   0$






zZipExtractor.is_extractabler*   r$   Nc                 C   sR   t j|dd t| d}|| |  W d    d S 1 s"w   Y  d S )NTrj   r)r   rl   r   ZipFilerm   rn   )r*   r$   zip_filer   r   r   r,      s
   

"zZipExtractor.extractrQ   )r/   r0   r1   r>   r;   r	   r   r2   rR   r3   r9   r<   r,   __classcell__r   r   r   r   r}      s    &$0r}   c                   @   rp   )XzExtractors   7zXZ r*   r$   r   Nc              	   C   sv   t | ,}t|d}t|| W d    n1 sw   Y  W d    d S W d    d S 1 s4w   Y  d S )Nrt   )lzmarA   rv   rw   r*   r$   compressed_filery   r   r   r   r,      s   "zXzExtractor.extractr{   r   r   r   r   r      r|   r   c                   @   s>   e Zd ZddgZedeeef deeef ddfddZdS )	RarExtractors   Rar! s   Rar! r*   r$   r   Nc                 C   sD   t jstddd l}tj|dd || }|| |  d S )NzPlease pip install rarfiler   Trj   )	r   RARFILE_AVAILABLEImportErrorrarfiler   rl   RarFilerm   rn   )r*   r$   r   rfr   r   r   r,      s   

zRarExtractor.extractr{   r   r   r   r   r      s    ,r   c                   @   rp   )ZstdExtractors   (/r*   r$   r   Nc              	   C   s   t jstddd l}| }t| d,}t|d}||| W d    n1 s+w   Y  W d    d S W d    d S 1 sCw   Y  d S )NzPlease pip install zstandardr   r@   rt   )r   ZSTANDARD_AVAILABLEr   	zstandardZstdDecompressorrA   copy_stream)r*   r$   zstddctxifhofhr   r   r   r,      s   PzZstdExtractor.extractr{   r   r   r   r   r      r|   r   c                   @   rp   )Bzip2Extractors   BZhr*   r$   r   Nc              	   C   rr   rs   )bz2rA   rv   rw   r   r   r   r   r,      rz   zBzip2Extractor.extractr{   r   r   r   r   r      r|   r   c                   @   rp   )SevenZipExtractors   7z'r*   r$   r   Nc                 C   s`   t jstddd l}tj|dd || d}|| W d    d S 1 s)w   Y  d S )NzPlease pip install py7zrr   Trj   r   )r   PY7ZR_AVAILABLEr   py7zrr   rl   SevenZipFilerm   )r*   r$   r   archiver   r   r   r,      s   "zSevenZipExtractor.extractr{   r   r   r   r   r      r|   r   c                   @   rp   )Lz4Extractors   "Mr*   r$   r   Nc              	   C   s   t jstddd l}|j| d,}t|d}t|| W d    n1 s)w   Y  W d    d S W d    d S 1 sAw   Y  d S )NzPlease pip install lz4r   r@   rt   )r   LZ4_AVAILABLEr   	lz4.frameframerA   rv   rw   )r*   r$   lz4r   ry   r   r   r   r,     s   "zLz4Extractor.extractr{   r   r   r   r   r      r|   r   c                   @   s   e Zd ZU eeeeeee	e
ed	Zeeee f ed< edd Zedeeef defddZeddeeef d
edefddZedeeef dee fddZe		ddeeef deeef dee dee ddf
ddZdS )r   )	tarru   zipxzrarr   r   7zr   
extractorsc                 C   s   t dd | j D S )Nc                 s   s.    | ]}t |tr|jD ]}t|V  qqd S r   )
issubclassr=   r>   rG   )rI   r   extractor_magic_numberr   r   r   rK     s    z9Extractor._get_magic_number_max_length.<locals>.<genexpr>)rN   r   values)r7   r   r   r   _get_magic_number_max_length  s   z&Extractor._get_magic_number_max_lengthr   r?   c                 C   s&   zt j| |dW S  ty   Y dS w )N)r?   rE   )r=   rD   rO   )r   r?   r   r   r   _read_magic_number$  s
   zExtractor._read_magic_numberFreturn_extractorr   c                 C   s>   t jdtd | |}|r|sdS d| j| fS |sdS dS )Nz{Method 'is_extractable' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'infer_extractor_format' instead.categoryTF)FN)warningswarnFutureWarningr+   r   )r7   r   r   r-   r   r   r   r9   +  s   
zExtractor.is_extractablec                 C   sB   |   }| ||}| j D ]\}}|j||dr|  S qd S )NrM   )r   r   r   itemsr9   )r7   r   magic_number_max_lengthrF   r-   r   r   r   r   r+   7  s   z Extractor.infer_extractor_formatN
deprecatedr*   r$   r-   r   c                 C   s  t jt j|dd tt|d}t|d tj	|dd |s&|dkrQ|dks/t
|ts?tjdtd |dkr<|n|}n| j| }|||W  d    S tjdtd | j D ]}||rs|||  W  d    S q]W d    d S 1 sw   Y  d S )	NTrj   z.lock)ignore_errorsr   zsParameter 'extractor' was deprecated in version 2.4.0 and will be removed in 3.0.0. Use 'extractor_format' instead.r   ztParameter 'extractor_format' was made required in version 2.4.0 and not passing it will raise an exception in 3.0.0.)r   rl   r   r\   r2   r   with_suffixr   rv   rmtree
isinstancer   r   r   r   r,   r   r9   )r7   r*   r$   r-   r   	lock_pathr   r   r   r,   ?  s2   



"zExtractor.extractr.   )Nr   )r/   r0   r1   rU   rq   r}   r   r   r   r   r   r   r   r   r2   r   r4   rS   r;   r   r<   r	   r   rT   r   r3   r9   r   r+   r,   r   r   r   r   r     sB   
 
" 

r   ))r   ru   r   r   rv   r   rV   r   r   abcr   r   pathlibr   typingr   r   r   r   r	    r   	_filelockr   loggingr   r/   rc   r   r4   r=   rU   rq   r}   r   r   r   r   r   r   r   r   r   r   r   <module>   s:    
1
4

