o
    Zh                    @   s~  d dl Z d dlZd dlZd dlZd dlZd dlZd dlZd dlZd dlZd dl	Z	d dl
mZ d dlmZ d dlZd dlZd dlZd dlmZ d dlZd dlZd dlmZ d dlZd dlmZmZmZmZm Z  zd dl!Z"W n e#yw   dZ"Y nw zd dl$m%Z& W n e#y   dZ&Y nw zd dl'm(Z) W n e#y   dZ)Y nw ej*j%Z+dd Z,dd Z-d	d
 Z.ej/dd Z0ej/dd Z1ej/dddd Z2ej/dd Z%ej/ddgddgddd Z3ej*j(dd Z4dd Z5ej*j(d d! Z6ej*j(d"d# Z7ej*j(d$d% Z8ej*j(d&d' Z9ej*j(d(d) Z:ej*j(d*d+ Z;ej*j(d,d- Z<d.d/ Z=d0d1 Z>d2d3 Z?d4d5 Z@d6d7 ZAej*j(d8d9 ZBej*j(d:d; ZCej*j(d<d= ZDd>d? ZEd@dA ZFej*GdBejHdCddDdEdFggej*GdGddgej*j(dHdI ZIej*j(dJdK ZJej*j(ej*jKdLdM ZLdNdO ZMdPdQ ZNej*j(dRdS ZOej*j(ddTdUZPej*j(dVdW ZQej*j!ej*j(dXdY ZRej*j(dZd[ ZSej*j(d\d] ZTej*j(d^d_ ZUej*j!ej*j(d`da ZVej*j(dbdc ZWej*j(ddde ZXddfdgZYej*j!ej*j(dhdi ZZej*j(djdk Z[ej*j!ej*j(dldm Z\ej*j(dndo Z]ej*j(dpdq Z^ej*j(drds Z_ej*j(dtdu Z`ej*j(dvdw Zaej*j(dxdy Zbej*j!ej*j(dzd{ Zcej*j(ej*Gd|d}d~ dd~ gdd Zdej*j(ej*Gdddgej*Gd|dd~ dd~ gdd Zeej*Gd|dd~ dd~ gdd Zfej*Gd|dd~ dd~ gdd Zgdd Zhdd Ziej*j(ej*j!dd Zjdd Zkdd Zldd Zmdd Zndd Zodd Zpdd Zqej*j(dd Zrdd ZsdddZtdd Zudd Zvdd Zwej*j(dd Zxej*j(dd Zyej*j(dd Zzej*j(dd Z{ej*j(dd Z|ej*j(dd Z}ej*j(dd Z~ej*j(dd Zej*j(dd Zej*j(dd ZddĄ ZddƄ ZddȄ Zddʄ Zej*Gdddgdd̈́ Zddτ Zej*j(ddф Zej*j(ddӄ Zej*j(ddՄ Zej*j(ddׄ Zddل Zddۄ Zej*j(ej*Gdddgej*Gdddgej*Gdddgej*Gdg dg dfg dg dfg dg dfg dg dfg dg dfg dg dfg dg dfgdd Zej*j!dd Zej/dd Zej*j(ej*jKdd Zej*j(ej*jKdd Zej*j(ej*jKdd Zej*j(dd Zej*j(dd Zej*j!dd Zej*j(dd Zej*j(dd Zej*j(dd  Zdd Zdd Zdd Zdd Zej*j(d	d
 Zej*j(dd Zdd Zej*jdd Zej*jdd Zdd Zej*jdd Zej*j!dd Zej*j!ej*Gdg ddd Zdd Zdd  Zd!d" Zej*j!d#d$ Zej*j!d%d& Zej*j!d'd( Zd)d* Zd+d, Zd-d. Zej*j!ej*Gdg d/d0d1 Zd2d3 Zej*j(ej*j!d4d5 Zej*j(ej*j!ej*jejd6kd7d8d9d: Zej*j(ej*j!d;d< Zej*j(d=d> Zej*j(ej*j!d?d@ ZdAdB ZdCdD Zej*j(ej*j!dEdF Zej*j(ej*j!dGdH Zej*j(ej*j!dIdJ Zej*j(ej*j!dKdL Zej*j(dMdN Zej*j(ej*j!dOdP Zej*j(ej*j!dQdR ZdSdT Zej*j!ej*j(dUdV Zej*j(ej*j!dWdX ZŐdYdZ Z	dd[d\Zej*j(d]d^ Zej*j(ej*j!d_d` Zɐdadb Zʐdcdd Zːdedf Zej*j(dgdh Z͐didj Zej*j!dkdl ZϐdĐdpdqZАdrds Zѐdtdu Zej*j(dvdw Zej*j(dxdy Zej*j(dzd{ Zej*j(d|d} Zej*j(ej*j!d~d Zej*j(ej*j!dd Zؐdd Zِdd Zڐdd Zېdd Zܐdd Zej*j(dd Zސdd Zej*j(dd Zej*j(dd Zej*j(ej*j!dd Zdd Zej*j(dd Zej*j(ej*jKdd ZdZej*j(ej*jKdd Zej*j(dd Zej*j%dd Zej*j%dd Zej*j%dd Zej*Gdddgdd Zej*Gdddgdd Zdd Zdd Zej*Gdddd Zdd Zej*j(dd Zdd Zdd Zej*Gdddgdd Zdd ZdS (      N)copytreequote)FSProtocolClassProxyHandler_configure_s3_limited_user_filesystem_uri
change_cwdc                 C   s~   dd l }dd l}| ddd}|jdd}|g d}g }t| D ]}|||t|t|f ||7 }q"tj	|g ddS )	Nr   i        )days)greenblueyellowredZorange)dateindexvaluecolorcolumns)
datetime	itertools	timedeltacyclerangeappendfloatnextpd	DataFrame)nr   r   dayintervalcolorsdatai r'   Q/var/www/html/lang_env/lib/python3.10/site-packages/pyarrow/tests/test_dataset.py_generate_data@   s   
r)   c              
   C   s\   t t dt  t dt  t dt  t dt  g}t jj| |dd}|	 S )Nr   r   r   r   F)schemaZpreserve_index)
par*   fielddate32int64float64stringTableZfrom_pandasreplace_schema_metadata)dfr*   tabler'   r'   r(   _table_from_pandasP   s   r5   c              	   C   sx   |   D ]5}| '}t|tjsJ |jrJ | sJ | s$J | r*J W d    n1 s4w   Y  qd S N)	get_fragmentsopen
isinstancer+   Z
NativeFileclosedseekablereadablewritable)datasetfragmentnfr'   r'   r(   +assert_dataset_fragment_convenience_methods[   s   

rA   c            
      C   s   t  } ddg}t|D ]\}}d||}| | | |e}ttdttt	tdttt
td|gd dd tdD g}tdt fdt fd	t fd
t fdtt t dfg}tj||d}tj|g}	t|	| W d    n1 sw   Y  q| S )Nzsubdir/1/xxxzsubdir/2/yyyz{}/file{}.parquetr   c                 S   "   g | ]}|d  t |d  dqS    abstr).0jr'   r'   r(   
<listcomp>x      " zmockfs.<locals>.<listcomp>i64f64rI   conststructrE   r*   )fs_MockFileSystem	enumerateformat
create_diropen_output_streamlistr   mapr   rI   r+   r*   r.   r/   r0   rQ   record_batchr1   from_batchespqwrite_table)
mockfsdirectoriesr&   	directorypathoutr%   r*   batchr4   r'   r'   r(   r_   f   s6   





r_   c                    sx   ddl m}m} ddlm} |   fddt  fdd}| |d	| || }tjfd
d}||fS )Nr   )LocalFileSystemPyFileSystemr
   )r   c                    s    fdd| D S )Nc                    s   h | ]	}  t|qS r'   )normalize_pathrI   rJ   plocalfsr'   r(   	<setcomp>       z6open_logging_fs.<locals>.normalized.<locals>.<setcomp>r'   )pathsrj   r'   r(   
normalized   s   z#open_logging_fs.<locals>.normalizedc                    s$     t|}| | j|S r6   )rg   rI   add_fsopen_input_file)selfrb   )rk   openedr'   r(   rr      s   
z(open_logging_fs.<locals>.open_input_filerr   c              	   3   sB       zd V  W   | ksJ d S   | ks J w r6   )clear)Zexpected_opened)ro   rt   r'   r(   assert_opens   s
   .z%open_logging_fs.<locals>.assert_opens)	
pyarrow.fsre   rf   Ztest_fsr   setsetattr
contextlibcontextmanager)Zmonkeypatchre   rf   r   rr   rS   rv   r'   )rk   ro   rt   r(   open_logging_fs   s   r|   module)scopec              	   C   s,  | j jd | j jd td}t }t|d\}}}}|d t	t|dD ]&\}}d
|}	||	}
tt||
 W d    n1 sNw   Y  q-|d ||jjj|jgD ]0\}}d	j
| }d

|}	|| ||	}
tt||
 W d    n1 sw   Y  qd|d ||jjj|jjjgD ]0\}}dj
| }d

|}	|| ||	}
tt||
 W d    n1 sw   Y  q|d |dD ]1\}}d
|}d

|}	|| ||	}
tt||
 W d    n	1 sw   Y  q|S )Npandasparquet     plain
   zplain/chunk-{}.parquetr*   zschema/{}/{}z{}/chunk.parquethivezhive/year={}/month={}Z
hive_colorr   zhive_color/color={})configpyarrowrequiresr)   rS   rT   npZarray_splitrW   rU   rV   rX   r]   r^   r5   groupbyr   dtZ	dayofweekr   yearmonth)requestr3   r_   Zdf_aZdf_bZdf_cZdf_dr&   chunkrb   rc   partfolderr'   r'   r(   multisourcefs   sP   






"






r   c              
   C   sf   t  }tjddd}t d}t ttdt	 tdt
 g|_t | |||}| S )NsubdirT	recursivegroupkey)dsParquetFileFormatrS   FileSelectorFileSystemFactoryOptionsDirectoryPartitioningr+   r*   r,   int32r0   partitioningFileSystemDatasetFactoryfinish)r_   rV   selectoroptionsfactoryr'   r'   r(   r>      s   
r>   TFZthreadedserial)paramsZidsc                    s   | j  G  fddd}| S )z]
    Fixture which allows dataset scanning operations to be
    run with/without threads
    c                       sT   e Zd Z fddZ fddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dS )zdataset_reader.<locals>.readerc                    s
    | _ d S r6   use_threads)rs   r   r'   r(   __init__   s   
z'dataset_reader.<locals>.reader.__init__c                    s   d|v rt d |d< d S )Nr   z9Invalid use of dataset_reader, do not specify use_threads)	Exception)rs   kwargsr   r'   r(   _patch_kwargs   s
   z,dataset_reader.<locals>.reader._patch_kwargsc                 [      |  | |jdi |S Nr'   )r   to_tablers   r>   r   r'   r'   r(   r        
z'dataset_reader.<locals>.reader.to_tablec                 [   r   r   )r   
to_batchesr   r'   r'   r(   r     r   z)dataset_reader.<locals>.reader.to_batchesc                 [   r   r   )r   scannerr   r'   r'   r(   r   
  r   z&dataset_reader.<locals>.reader.scannerc                 [      |  | |j|fi |S r6   )r   head)rs   r>   num_rowsr   r'   r'   r(   r        
z#dataset_reader.<locals>.reader.headc                 [   r   r6   )r   take)rs   r>   indicesr   r'   r'   r(   r     r   z#dataset_reader.<locals>.reader.takec                 [   r   r   )r   
count_rowsr   r'   r'   r(   r     r   z)dataset_reader.<locals>.reader.count_rowsN)__name__
__module____qualname__r   r   r   r   r   r   r   r   r'   r   r'   r(   reader   s    r   )param)r   r   r'   r   r(   dataset_reader   s   	$r   c                    sX  t t dt  g}t  ddg}dd tddD } fddt||D }td	td
k}tj	|| |d}tj	j
|| ||d}||fD ]}t|tj	sYJ t|jtjsbJ |j|sjJ t|jt|ksuJ t| }t|||D ]q\}	}
}|	j|
sJ |	j|ksJ t|	jtjsJ t|	tjsJ |	jdgksJ |	jdksJ t|	 }|	jt|  krdksJ  J t|d tjsJ |d j|ksJ |d jdgksJ |d jdksJ qt|jtddkd}t|dks	J qOtj	|| d}|jtdsJ tj	j
|| d}|jtds5J | D ]}	|	jtdsGJ q9tjtdd t	| | W d    n	1 scw   Y  tjtdd tj	|| dd W d    n	1 sw   Y  tjtdd tj	j
| d W d    d S 1 sw   Y  d S )NrP   subdir/1/xxx/file0.parquetsubdir/2/yyy/file1.parquetc                 S   s   g | ]	}t d |kqS )r   r   r,   rJ   xr'   r'   r(   rL   $  rm   z+test_filesystem_dataset.<locals>.<listcomp>r
   rD   c                    s   g | ]\}}  ||qS r'   make_fragment)rJ   rb   r   file_formatr_   r'   r(   rL   %  s    leveli9  )r*   rV   
filesystemroot_partition)r*   rV   r   
partitionsr   r   filter   r*   rV   r   Tzincorrect typematch)r*   rV   r   rV   )r+   r*   r,   r.   r   r   r   zipscalarFileSystemDataset
from_pathsr9   rV   partition_expressionequalsrx   filesrY   r7   rb   ParquetFileFragment
row_groupsnum_row_groupssplit_by_row_grouplenpytestraises	TypeError)r_   r*   rn   r   	fragmentsr   Zdataset_from_fragmentsZdataset_from_pathsr>   r?   	partitionrb   row_group_fragmentsr'   r   r(   test_filesystem_dataset  sx   "$r   c                 C   s   t t dt  g}t }dg}tjj|||t	 d}|
  tt | | W d    d S 1 s9w   Y  d S )Nf1znonexistingfile.arrowr   )r+   r*   r,   r.   r   IpcFileFormatr   r   rS   re   r7   r   r   FileNotFoundErrorr   )r   r*   r   rn   r>   r'   r'   r(   1test_filesystem_dataset_no_filesystem_interactione  s   "r   c           	      C   s  t | tjsJ t | jtjsJ tjg dt d}tjg dt d}|	| D ]}t |tj
s6J |d|s@J |d|sJJ q,||  D ]}t |tjs\J t |jtjseJ qR|| }t |tjssJ t|dks{J tddk}| jd|d}|d	 }|d ddgksJ |d
 ddgksJ t|d	 ddgksJ t|d ddgksJ tddk}| jd|d}|d	 }|d g dksJ |d
 g dksJ |d	 g dksJ |d g dksJ tdtd
tddkd}| jd|d}|d }t|g dks#J |d g dks.J |d
 g dks9J |d g dksDJ t|  d S )Nr   r
   r   rD   r   typer   r
   r   rN   T)r   r   r   rO         ?r   r   xxxyyy)rQ   rG   1)r
   r   r
   r   )r         @r   r   )r
   r
   r   r   )r   r   r   r   )rN   rO   new)r   r   )
r   r   r
   r
   r   r   rD   rD   r   r   )
        r   r   r          @r         @r   r   r   r   )
FFTTFFFFTT)r9   r   Datasetr*   r+   Schemaarrayr.   r/   r   RecordBatchcolumnr   r   scan_batchesZTaggedRecordBatchr?   ZFragmentr   r1   r   r,   sort_by	to_pydictsortedrY   rA   )	r>   r   expected_i64expected_f64rd   r4   	conditionresultZ
projectionr'   r'   r(   test_dataset{  sN   
r  c                 C   s(   | j ddd}t|}|jdksJ d S )N      )Zfragment_readaheadZbatch_readahead   )r   r   num_columns)r>   r   rd   r'   r'   r(   test_scanner_options  s   r	  c           	      C   sV  |j | t d}t|tjsJ ttj |j | dgd W d    n1 s*w   Y  |j | dgt d}|j	| j
ksBJ |jt
dt fgksQJ t|tjsYJ | }| D ]}|j
|jkskJ |jdksrJ qa||  ks}J |j
|jksJ t|jD ]}t|g}||||ksJ qttj |t|jg W d    n1 sw   Y  |j| ksJ |j | g dt d}| }g d}|j|ksJ |d}|d	  d
gd dgd  ksJ |d  dgd dgd  ksJ |d  dgd ksJ |d  dgd ks)J d S )N)memory_poolunknownr   rN   )r   r
  r
   )
__filename__fragment_index__batch_index__last_in_fragmentr  r  r   r   r   r   r  r   r  T)r   r+   default_memory_poolr9   r   Scannerr   r   ArrowInvalidZdataset_schemar*   projected_schemar.   r   r   r  	to_readerread_allr   r   r   r   ZArrowIndexErrorr   column_namesr   	to_pylist)	r>   r   r   r4   rd   r&   r   Zexpected_namesZsorted_tabler'   r'   r(   test_scanner  sR   


& r  c              	   C   sd   t  }t  }t | z| }tj| }| }| |ks$J W t | d S t | w r6   )	r+   r  Zsystem_memory_poolZset_memory_poolZbytes_allocatedr   r  Zfrom_datasetr   )r>   old_poolpoolZallocated_beforer   _r'   r'   r(   test_scanner_memory_pool  s   
r  c                 C   s  | | d}|tjjg | jdksJ |j | ddgd }|ddgiks'J |j | ddgtddkd }|dddgiksBJ |j | d	dgd }|dtt	d
d iks[J t
|  }|j ddgd }|ddgikstJ |j d	dgd }|dtt	d
iksJ d S )Nr   rR   r
   rN   r   r   r   r   rD      r   )r   r+   r1   r\   r*   r   r   r,   rY   r   r   r7   )r>   r   r  r?   r'   r'   r(   	test_head  s"   r  c                 C   s
  t |  }ddgtddgfD ]}|||}||||ks%J qtt ||tdg W d    n1 s@w   Y  ddgtddgfD ]}|| ||| |ksbJ qPtt || tdg W d    d S 1 s~w   Y  d S )Nr
   rD   r   r  r   )	r   r7   r+   r   r   r   r   r   
IndexError)r>   r   r?   r   expectedr'   r'   r(   	test_take  s    
"r"  c                 C   s   t |  }||dksJ |j|tddkddksJ || dks(J |j| tddkddks8J |j| tddkddksHJ |j| tdd	k dd	ksXJ d S )
Nr   rN   r   r   r
   r   r   rD   r   )r   r7   r   r   r,   )r>   r   r?   r'   r'   r(   test_count_rows)  s    $r#  c               	   C   sN   t jt jt jg} | D ]}tt |  W d    n1 sw   Y  q
d S r6   )r   Z
FileFormatr  Partitioningr   r   r   )classesklassr'   r'   r(   test_abstract_classes9  s   r'  c                  C   s  t t dt  t dt  g} tjtjtjfD ]}|| }t	|tj
s)J ||| ks1J |dks7J qt t dt  t dt  g} t| }t|jdksYJ tdd |jD seJ |d	}t	|tjsrJ tdd
ktddk@ }||sJ tt j |d W d    n1 sw   Y  |d}tdd
k}||sJ |tj| ddksJ t t dt  t dt  g} tj| dd}t|jdksJ tdd |jD sJ |d}tdtdktdtd
k@ }||sJ |d}td tdtd
k@ }||s.J dD ]}tt j || W d    n	1 sIw   Y  q0|tj| ddks\J t t dt  t dt  g} t| }t|jdks~J tdd |jD sJ |d}t	|tjsJ tdd
ktddk@ }||sJ tt j |d W d    n	1 sw   Y  |tj| ddksJ t t dt  t dt t  t  g} tj| dt g did}|jd d u sJ |jd  g dksJ |tj| d dks"J tjt t dt  t dt t  t  gdt g did}|jd d u sQJ |jd  g dks_J t jt td t tjd t t d!d"gd#gg d$d%}t d&t  fg}tjtjtjfD ]9}t!" )}||}tj#||d'|d( tj$|d'|d(}	|	% }
|
|sJ W d    n	1 sw   Y  qd S ))NrN   rO   zother objectr   r   r   c                 s       | ]}|d u V  qd S r6   r'   r   r'   r'   r(   	<genexpr>V      z$test_partitioning.<locals>.<genexpr>z/3/3.14/rD   gQ	@z/prefix/3/aaaz/3/nonesegment_encodingalphabetaxyz)null_fallbackc                 s   r(  r6   r'   r   r'   r'   r(   r)  l  r*  z/alpha=0/beta=3/r   z/alpha=xyz/beta=3/)z/alpha=one/beta=2/z/alpha=one/z
/beta=two/otherc                 s   r(  r6   r'   r   r'   r'   r(   r)    r*  z3_3.14_Zprefix_3_aaa_)firstsecondthirddictionariesr
      rF   rG   r   r   f2r   namesr   ipcrV   r   )&r+   r*   r,   r.   r/   r   r   HivePartitioningFilenamePartitioningr9   r$  r   r7  allparse
Expressionr   r   r   r  r   is_null
dictionaryint8r0   r   r  r4   r   r   randomrandnrepeattempfileTemporaryDirectorywrite_datasetr>   r   )r*   r&  r   exprr!  Z
shouldfailr4   partitioning_schematempdir	load_backload_back_tabler'   r'   r(   test_partitioningD  s   




 


rR  c              
   C   s   t t dt  t dt  g}t|t|t|tj|ddtj|ddtj|dddg}|D ]}| 	| 
||ksDJ q6d S )NrN   rO   r+  r,  r0  )r-  r1  )r+   r*   r,   r.   r/   r   r   r?  r@  loadsdumps)pickle_moduler*   partsr   r'   r'   r(   test_partitioning_pickling  s   	rW  c                  C   s   t tg dg dd} t d}t d}| j|d || |d |d| d	d
}tg dg dg dg dd	}||sHJ d S )Nr
   r   rD   )r   r   r   rE   rF   rG   r
   r   r/   )za+1zb-aza*2za/br   r   rD   r   )r
   r   )r   r      )      ?r   g      ?)r   r>   r+   r4   r,   r   castr   )r>   rF   rG   r  r!  r'   r'   r(   $test_expression_arithmetic_operators  s   


r^  c                  C   s   dd dD \} }}t | ddiksJ t | t | ks!J t | |@ |@ dd dD ks3J t ddk}t |i ksCJ t | |@ ddiksPJ t d }t |dd iksbJ d S )	Nc                 S   s   g | ]	}t ||kqS r'   r   rJ   fr'   r'   r(   rL     rm   z'test_partition_keys.<locals>.<listcomp>abcrF   c                 S   s   i | ]}||qS r'   r'   r_  r'   r'   r(   
<dictcomp>  s    z'test_partition_keys.<locals>.<dictcomp>drD   )r   Zget_partition_keysZ_get_partition_keysr,   rD  )rF   rG   cZnopenullr'   r'   r(   test_partition_keys  s   $rf  c                  C   s   t  } t jddgd}t jdd}| jt ksJ |jddhks#J | jdks*J |jdks1J | | ks7J | |ks=J | |ksCJ d S )NrF   rG   dictionary_columnsmscoerce_int96_timestamp_unitns)r   ParquetReadOptionsrh  rx   rk  )opts1opts2opts3r'   r'   r(   test_parquet_read_options  s   rq  c                  C   sf   t  } t jdhd}t jdd}| jt  ksJ |jt jdgdks&J |jt jddks1J d S )NrF   rg  srj  )r   r   read_optionsrm  )Zpff1Zpff2Zpff3r'   r'   r(   %test_parquet_file_format_read_options   s   rt  c                  C   s  t  } t jdd}t jddd}t jddd}t jdd	d
}t jdd}tjdddd}t jd|d}| jdu s;J | jdksBJ | jdu sIJ | jdksPJ | jdksWJ | j	du s^J |jdu seJ |jdkslJ |jdu ssJ |jdu szJ |jdksJ |jdu sJ |jdu sJ |jdksJ |jdu sJ |jdksJ |jd	ksJ |j	du sJ |jdu sJ |j
|ksJ |j
| j
ksJ | | ksJ | |ksJ ||ksJ ||ksJ || ksJ || ksJ || ksJ d S )N   buffer_sizei    T)rw  use_buffered_streamF)rw  
pre_bufferi@ i )thrift_string_size_limitthrift_container_size_limitpage_checksum_verificationr  )Zhole_size_limitZrange_size_limitZlazy)ry  cache_optionsi @B )r   ParquetFragmentScanOptionsr+   ZCacheOptionsrx  rw  ry  rz  r{  r}  r~  )rn  ro  rp  Zopts4Zopts5Zopts6Z
cache_optsZopts7r'   r'   r(   test_parquet_scan_options  sZ   r  c                 C   s  t  t  t tjjdddt jtjjddgddt jtjjddd	dt  t jtjjdd
ddt jtjjddddg}z	|	t 
  W n	 tyT   Y nw td urt|t  t jdhdt jddt jdddddg |D ]}| | ||ksJ qvd S )N	T)	delimiterZignore_empty_linesrD   foo)	skip_rowsr  rs  i   )r  
block_sizeignoreZnewlines_in_valuesZunexpected_field_behavior)Zparse_optionsF   r   r  rF   rg  )rx  ru  {   i  )rx  rw  rz  r{  )r   r   CsvFileFormatr+   csvParseOptionsReadOptionsJsonFileFormatjsonr   OrcFileFormatImportErrorr]   extendr   rS  rT  )rU  formatsr   r'   r'   r(   test_file_format_picklingA  sR   



r  c              
   C   s   t  t jtjjdddt jtjjdddt  t tjjddd	t jtjjdd
ddg}t	d urD|
t jddt jddg |D ]}| | ||ksTJ qFd S )NT)strings_can_be_nullconvert_options   r  r  Ferrorr  i   r  ru  rv  )ry  )r   CsvFragmentScanOptionsr+   r  ConvertOptionsr  JsonFragmentScanOptionsr  r  r]   r  r  rS  rT  )rU  r   optionr'   r'   r(   #test_fragment_scan_options_picklingh  s2   

r  paths_or_selectorr   r   r   r   ry  c                 C   sj  t jt jdhd|d}t d}t ttdt tdt	 g|_
|jdks/J |jddgks8J |jd	u s?J t | |||}| }| jttd
t tdt tdtt t	 tdt tdtt t	 dtdt tdt	 gd	dsJ t| tsJ t||t jsJ |jt dsJ | }t|t jsJ | }tjg dt d}	tjg dt d}
tjtjg dt dtjd  t	 d}tdd t!dD }|" }t#|ddgddgD ]\\}}}}tj|gd t d}tj|gd t	 d}tj|d gd t d}|j$d usEJ |j%dksMJ |d |	sWJ |d |
saJ |d |skJ |d |suJ |d |sJ |d |sJ |d |sJ q|& }t|tj'sJ t(|d ksJ |j%dksJ d S )!NrI   rg  )rs  ry  r   r   r   .r  FrN   rO   rP   rQ   rE   Zcheck_metadataTr   r   z	0 1 2 3 4c                 S   rB   rC   rH   rJ   r&   r'   r'   r(   rL     s    z+test_filesystem_factory.<locals>.<listcomp>r   r
   r   r   r   r  r   rD   r   r[  r   ))r   r   rm  r   r   r+   r*   r,   r   r0   r   Zpartition_base_dirZselector_ignore_prefixesZexclude_invalid_filesr   inspectr   r.   r/   rE  rQ   r9   inspect_schemasrY   r   r   r   r   r   r   ZDictionaryArrayfrom_arrayssplitr   r   r   r   r  r   r1   r   )r_   r  ry  rV   r   r   inspected_schemar>   r   r   r  Zexpected_strZexpected_structiteratorrd   r?   r   r   Zexpected_groupZexpected_keyZexpected_constr4   r'   r'   r(   test_filesystem_factory  s   

	


"r  c                 C   s   t  }t jd| |d}|jD ]A}||| }|jdgksJ |j|| dgd}||fD ]}t|t js6J |j|ks=J t|j	t
| sGJ q,|jdgksPJ qd S )N/plainr   rV   r   r   )r   r   r>   r   r   r   r9   r   rb   r   r   )r   parquet_formatr>   rb   r?   Zrow_group_fragmentr`  r'   r'   r(   test_make_fragment  s    
r  c                    s  | \}}}}}}}t  |g}fdd|D }	t j|	|jd   }
|
|s0J  fdd jD }fddt||D }t j||jd}  }
|
|s\J dd |D }fddt||D }t j||jd}tj	t
jjdd	 | }W d
   n1 sw   Y  dd |D }fddt||D }t j||jd}tj	tdd	 | }W d
   d
S 1 sw   Y  d
S )z
    Test passing file_size to make_fragment. Not all FS implementations make use
    of the file size (by implementing an OpenInputFile that takes a FileInfo), but
    s3 does, which is why it's used here.
    c                    s   g | ]}  |qS r'   r   rJ   rb   r   rS   r'   r(   rL     s    z0test_make_fragment_with_size.<locals>.<listcomp>)rV   r*   r   c                    s   g | ]	} j |jqS r'   )r   Zget_file_infosizer   r>   r'   r(   rL     rm   c                        g | ]\}} j ||d qS )	file_sizer   rJ   rb   r  r  r'   r(   rL         c                 S      g | ]}d qS )r
   r'   r  r'   r'   r(   rL         c                    r  r  r   r  r  r'   r(   rL     r  zParquet file size is 1 bytesr   Nc                 S   r  )r  r'   r  r'   r'   r(   rL     r  c                    r  r  r   r  r  r'   r(   rL     r  zHTTP status 416)r   r   r   r*   r   r   r   r   r   r   r   libr  OSError)s3_example_simpler4   rb   urihostport
access_key
secret_keyrn   r   ZtblZ
sizes_trueZfragments_with_sizeZdataset_with_sizeZsizes_toosmallZsizes_toolarger'   )r>   r   rS   r(   test_make_fragment_with_size  sP   





"r  c                 C   s   t d}t|d}t }||}t|	 tj
s J tjg dg dg dgg dd}| ||s<J |||}| || sPJ d S )NzT
        alpha,num,animal
        a,12,dog
        b,11,cat
        c,10,rabbit
    utf-8rF   rG   rd        r   dogcatrabbitr.  numanimalr;  )textwrapdedentr+   	py_bufferencoder   r  r   r9   r8   BufferReaderr4   r   r   rS  rT  )r   rU  contentbuffer
csv_formatr?   r!  pickledr'   r'   r(   "test_make_csv_fragment_from_buffer  s   


r  c                 C   s   d}t |d}t }||}t| t jsJ t j	g dg dg dgg dd}| 
||s9J |||}| 
||
 sMJ d S )Nz{"alpha" : "a", "num": 12, "animal" : "dog"}
{"alpha" : "b", "num": 11, "animal" : "cat"}
{"alpha" : "c", "num": 10, "animal" : "rabbit"}
r  r  r  r  r  r;  )r+   r  r  r   r  r   r9   r8   r  r4   r   r   rS  rT  )r   rU  r  r  Zjson_formatr?   r!  r  r'   r'   r(   #test_make_json_fragment_from_buffer8  s   

r  c                 C   s   t g dt g dt g dg}|d  |d |d  g}tjtjddgd	d
dd}|t f||fg}|D ]<\}}t j|g dd}t  }t	|| |
 }	||	}
| |
|sgJ |||
}| ||syJ q=d S )Nr  r  r  r   r
   r   r.  r  rg  Tru  )rs  rx  rw  r  r;  )r+   r   dictionary_encoder   r   rm  r4   ZBufferOutputStreamr]   r^   getvaluer   r   r   rS  rT  )r   rU  arraysZdictionary_arraysZdictionary_formatcasesZformat_r4   rc   r  r?   r  r'   r'   r(   &test_make_parquet_fragment_from_bufferN  s8   


	
r  c                 C   sl   t jtddgd dgd dgd  gg dd}t| d }tj||d	g|d
 tj|dd|d}||fS )Nr  r
   rF   r   rG   r9  r;  test_parquet_datasetr   )partition_cols
chunk_sizer   r   )rV   r   r   )r+   r4   r   rI   r]   write_to_datasetr   r>   )rO  r  r   r4   rb   r>   r'   r'   r(   _create_dataset_for_fragmentst  s   "r  c                 C   s2  t | \}}t| }t|dksJ |d }ddg}|jj|ks$J |j|j|j	|jks2J |j
tddks?J ||}|j|ksKJ ||dddsYJ |j||jd}|jg d	ksjJ ||ddsuJ |j|jdksJ |j||jtddk d
}|jg d	ksJ d S )Nr   r   r   r:  r   rF   r   rR   r9  )r*   r   )r  rY   r7   r   physical_schemar<  rV   r  rb   r   r   r   r   r,   r   r  Zremove_columnslicer*   remove)rO  r   r4   r>   r   r`  Zphysical_namesr  r'   r'   r(   test_fragments  s&   
r  c                 C   s   t jtddgd dgd  gddgd}t| d }tj||dgd	 tjt d
gdd}tj	|d|d}|j
tddkd}tt|dksLJ d S )Nr  r
   r   r   colr   r;  r  r  )r   rF  r   flavorr   r>  r   )r+   r4   r   rI   r]   r  r   r   r*   r>   r7   r,   r   rY   )rO  r4   rb   r   r>   r   r'   r'   r(   test_fragments_implicit_cast  s   *r  c           
         s  t | \ }	 d fdd	}t| d }|j}|||}||||ks-J |j|j|j	|j
d}||||sEJ ||d |j|j|j	|j
d}||dtddk d	 |j|j|j	|j
d}||ddgtdd
k d |j|j|j	|j
d}||dtddkd	 d|jddd }	tjt|	d  |j|j|j	|j
d}|j|tddkd	 W d    d S 1 sw   Y  d S )Nc                    sP   | j  j||d}|r|n j}|j|ksJ  j| |}||s&J d S )Nr*   r   r   )r   r*   r  r  selectr   )r?   Z	row_slicer   r   actualr  r!  r4   r'   r(   assert_yields_projected  s   z;test_fragments_reconstruct.<locals>.assert_yields_projectedr   )r   )r   r   )r   r   r   r   r   r   r  r   rF   z&No match for FieldRef.Name\(part\) in Fr   NN)r  rY   r7   rV   rS  rT  r   r   rb   r   r   r   r   r,   r  Z	to_stringr   r   
ValueError)
rO  r   rU  r>   r  r?   r  pickled_fragmentnew_fragmentpatternr'   r  r(   test_fragments_reconstruct  s`   


"r  c                 C   s^  t | dd\}}t| d }t| }t||j  kr$dks'J  J |j|d |jd}|jg dks:J t|dksBJ |	|
ddsMJ |d jd usVJ |d jdks_J |d jd jdddddddkstJ t|jtd	dk d
d }t|td	dk }t|dksJ |j|d td	dk d
}t|dksJ d S )Nr   r  r   rR   r9  r
   minmaxr   r:  r   r   )r  rY   r7   r   r   r   r   r*   r  r   r  r   
statisticsr   r,   )rO  r   r4   r>   r?   r   r  r'   r'   r(   !test_fragments_parquet_row_groups  s.   "
r  c                 C   s   t dtdi}tj|| d dd tj| d dd}t| d }|j	j
|j|jd	d
gd}|jdks8J |  |jdksCJ t|jdksLJ d S )NrF   r  test.parquetr   row_group_sizer   r   r   r
   rD   r  )r+   r4   r   r]   r^   r   r>   rY   r7   rV   r   rb   r   r   ensure_complete_metadatar   r   )rO  r4   r>   Zoriginal_fragmentr?   r'   r'   r(   %test_fragments_parquet_num_row_groups  s   r  c                 C   s   t tddgddgd}|d d|d< tt|| d  d	d lm	} |	| d }|j
||ddkd
}|jd	 | k  sIJ d S )NrF   rG   r
   r   )col1col2r  categoryztest_filter_dictionary.parquetr   r   )r   r    dictastyper]   r^   r+   r4   pyarrow.datasetr>   r   r,   Ziloc	to_pandasrA  )rO  r   r3   r   r>   r  r'   r'   r(   ,test_fragments_parquet_row_groups_dictionary"  s   "r  c                 C   s  |\}}t | d|d\}}t| d }||jg |  W d    n1 s*w   Y  |jddgks8J |g  |  W d    n1 sKw   Y  t|jtj	sYJ |j
j|j|jddgd}|j|jksnJ |  |jd }	|	jdks~J |	jdksJ |	jd usJ |||}
||jg% |
jddgksJ |
jd }	|	jdksJ |	jd usJ W d    d S 1 sw   Y  d S )Nr   r  r   r   r
   r  )r  rY   r7   rb   r  r   r9   metadatar]   ZFileMetaDatarV   r   r   idr   r  rS  rT  )rO  r|   rU  rS   rv   r  r>   r?   r  	row_groupr  r'   r'   r(   &test_fragments_parquet_ensure_metadata1  s:   





"r  c           
      C   s   |\}}t | |d\}}t| d }|g  |||}W d    n1 s+w   Y  ||jg |j}	W d    n1 sDw   Y  |	dgksPJ d S )Nr   r
   r   )r  rY   r7   rS  rT  rb   r   )
rO  r|   rU  rS   rv   r  r>   r?   r  r   r'   r'   r(   )test_fragments_parquet_pickle_no_metadataZ  s   
r  c                 C   s  t jt g dt  t g dt  t g dt  t g dt  t g dt  t g dt  t g dt 	 t g dt 
 t g dt  t g dt  t g dt  t g dt  t g dt  t g dt dt g dt dt g dt dt g dt  t g dt  t g dt dt g dt dgg d	d
}t| d }tj|||d |tj|dddfS )N)TNF)r
   r   *   )r   g      $@      E@)rF   Nzrr  ri  us)r
   r   l    jt )booleanrF  uint8int16uint16r   uint32r.   uint64r   doubleutf8binaryts[s]ts[ms]ts[us]r-   date64time32time64r;  Ztest_parquet_dataset_all_typesr  r   r   r>  )r+   r4   r   Zbool_rF  r  r  r  r   r  r.   r   Zfloat32r/   r"  r#  	timestampr-   r'  r(  r)  rI   r]   r  r   r>   )rO  r  r4   rb   r'   r'   r(   _create_dataset_all_typesl  s6   /r+  c              
      s  t | \}}t| d }dd l  fdd} fdd} fdd} j} j}t| }	|	d jd us9J |	d jd }
|
jdksGJ |
j	d	ksNJ |
j
i d
ddddddddddddddddddddddddddddddddddddddddddddddddd d!dd"|d|ddd#|d|ddd$|d|ddd%|d&dd'|d&d'd(d|d&dd|d&d'd)d|ddd|dddd|dddd|dddddd*ksJ d S )+Nr   c                    s     ddddd| S N  r
   r   r   r   r.  r'   r(   dt_s  s    z.test_parquet_fragment_statistics.<locals>.dt_sc              
      s     dddddd| d S )Nr-  r
   r   r   r.  r/  r.  r'   r(   dt_ms  rm   z/test_parquet_fragment_statistics.<locals>.dt_msc              	      s     dddddd| S r,  r.  r/  r.  r'   r(   dt_us      z/test_parquet_fragment_statistics.<locals>.dt_usrD   r   r  FTr  rF  r
   r  r  r  r  r   r  r.   r   r   r   r  r!  r"  rF   r  r#     a   zr$  r%  r&  r-   r-  r   r     )r'  r(  r)  )r+  rY   r7   r   r   timer   r   r   Ztotal_byte_sizer  )rO  r4   r>   r?   r0  r1  r2  r   r7  r   r  r'   r.  r(    test_parquet_fragment_statistics  sh   








	




r8  c                 C   sn   t g dg dd}tj|| d dd tj| d dd}t| d	  }|d
 j	d	 j
i ks5J d S )N)r   r
   NN)rF   rG   NNrE   r  r   r  r   r   r   r
   )r+   r4   r]   r^   r   r>   rY   r7   r   r   r  )rO  r4   r>   r   r'   r'   r(   &test_parquet_fragment_statistics_nulls  s
   r9  c                 C   st   t g dg ddd d }|j| d dd tj| d dd	}t| d  }|d jd j	i ks8J d S )
N)rF   rG   rG   r   r   r[  rE   r   r  r   Zenginer   r   )
r   r    
to_parquetr   r>   rY   r7   r   r   r  )rO  r3   r>   r   r'   r'   r(   'test_parquet_empty_row_group_statistics  s
    r=  c                 C   s   t | dd\}}t| d }|jtddksJ t|jtddk|jd}t	|dks4J t|jtddk|jd}t	|dksKJ d S )Nr   r  r   r   rF   r   r*   rG   )
r  rY   r7   r   r   r   r,   r   r*   r   )rO  r4   r>   r?   r   r'   r'   r(   +test_fragments_parquet_row_groups_predicate  s   r?  c                 C   sL  t | dd\}}t| d }|j}t| }|||}||||ks-J |j|j	|j
|jdgd}	||	}
|
||d sKJ |j|j	|j
|jdhd}	|j|	|jddgtddk d	}
|
jddgksrJ t|
dkszJ |j|j	|j
|jdhd}	tjtd
d ||	 W d    d S 1 sw   Y  d S )Nr   r  r   )r   r   r
   r   r   rD   r  zreferences row group 2r   )r  rY   r7   rV   r   rS  rT  r   r   rb   r   r   r   r*   r   r,   r  r   r   r   r   )rO  r   rU  r4   r>   r?   r  r   r  r  r  r'   r'   r(   -test_fragments_parquet_row_groups_reconstruct  sH   
"r@  c           
      C   s  |\}}t | d|d\}}t| d }|jddgd}|g " |jdks)J |jddgks2J |jd jd us<J W d    n1 sFw   Y  ||}	|	 ddgddgdks_J |jg d}|jdkslJ |jg kssJ |j||j	d}	|	j
dksJ |	|d d sJ d S )	Nr
   r  r   rD   Zrow_group_idsr   r   rR   )r  rY   r7   subsetr   r   r  r   r   r*   r   r   
rO  r|   r   rS   rv   r4   r>   r?   subfragr  r'   r'   r(   !test_fragments_parquet_subset_ids(  s&   


rE  c           
      C   sR  |\}}t | d|d\}}t| d }|tddk}|g " |jdks+J t|jdks4J |jd j	d us>J W d    n1 sHw   Y  |
|}	|	 g dg ddksaJ |tdd	k}|jdksrJ |jg ksyJ |j
||jd
}	|	jdksJ |	|d d sJ |jtddk|jd
}|jdksJ d S )Nr
   r  r   r   rD   rX  )r
   r
   r
   r   r   rR   r   rF   r   )r  rY   r7   rB  r   r,   r   r   r   r  r   r   r*   r   r   rC  r'   r'   r(   $test_fragments_parquet_subset_filterD  s*   


rF  c                 C   s   t | dd\}}t| d }tt |jtddkddgd W d    n1 s.w   Y  tt |  W d    d S 1 sHw   Y  d S )Nr
   r  r   r   r   rA  )	r  rY   r7   r   r   r  rB  r   r,   )rO  r  r>   r?   r'   r'   r(   %test_fragments_parquet_subset_invalidd  s   
"rG  c           
      C   s  t g d}t g d}t g d}t jj||gddgd}t jj||gddgd}t d	|i}tj|| d
 dd tj| d
 dd}t	|
 d }|jdksVJ |td	ddk}	|	jdkshJ |td	ddk}	|	jdkszJ |td	dddk}	|	jdksJ |td	dddk}	|	jdksJ tjt jdd |td	ddk W d    n1 sw   Y  tjtdd |td	ddk W d    d S 1 sw   Y  d S )N)r   r
   r   rD   )皙?皙?333333?皙?r
   r   rD   r   f21f22r;  r   r:  r  zdata_struct.parquetr   r  r   r   r   r
   r   zNo match for FieldRef.Nestedr   f3z)Function 'greater' has no kernel matching)r+   r   ZStructArrayr  r4   r]   r^   r   r>   rY   r7   r   rB  r,   r   r   r  NotImplementedError)
rO  r   rM  rN  r:  Z
struct_colr4   r>   r?   rD  r'   r'   r(   0test_fragments_parquet_subset_with_nested_fieldsq  s4   "rQ  c                 C   s   t | d }t|dkst|dksJ t| \}}tj|dd}t | d }t|d|jt	|ks<J | d }t
j|| tj|dd}t | d }t|d	|jt	|ksgJ d S )
Nr   zb<pyarrow.dataset.ParquetFileFragment path=subdir/1/xxx/file0.parquet partition=[key=xxx, group=1]>zb<pyarrow.dataset.ParquetFileFragment path=subdir/1/xxx/file0.parquet partition=[group=1, key=xxx]>r   r   z-<pyarrow.dataset.ParquetFileFragment path={}>data.featherfeatherz/<pyarrow.dataset.FileFragment type=ipc path={}>)rY   r7   repr_create_single_filer   r>   rV   r   rg   rI   r+   rS  write_feather)rO  r>   r?   r4   rb   r'   r'   r(   test_fragments_repr  s0   rW  r  c                 C      | S r6   r'   r   mr'   r'   r(   <lambda>      r[  c                 C      | || S r6   rS  rT  rY  r'   r'   r(   r[    r  c                 C   s   t jddd}t }td}tjddg}|||}t|tjs%J ||_	t
| |||}| }tdt fdt fdt fd	t fd
tt t dfdt fdt fg}	||	skJ tj }
t|
tjsxJ d S )Nr   Tr   r   r   rN   rO   rI   rP   rQ   rE   )rS   r   r   r   r   r   discoverr9   PartitioningFactorypartitioning_factoryr   r  r+   r*   r.   r/   r0   rQ   r   r   r?  )r_   r  rU  r  rV   r   ra  r   r  expected_schemaZhive_partitioning_factoryr'   r'   r(   test_partitioning_factory  s.   







	
rc  infer_dictionaryc                 C   rX  r6   r'   rY  r'   r'   r(   r[    r\  c                 C   r]  r6   r^  rY  r'   r'   r(   r[    r  c                 C   s4  t jddd}t }td}tjjddg|d}||||_t| |||}|	 }	|rt
t
 t
 }
|	dj|
ksBJ |   }|dd}t
dgd	 d
gd	   }||shJ | jtddkd}|dd}|dd	}||sJ d S |	djt
 ksJ d S )Nr   Tr   r   r   rd  r   r   r   r   r   )rS   r   r   r   r   r   r_  ra  r   r  r+   rE  r   r0   r,   r   r   r   combine_chunksr   r   r   r  r   r  )r_   rd  r  rU  r  rV   r   ra  r   inferred_schemaexpected_typer4   r  r!  r'   r'   r(   $test_partitioning_factory_dictionary  s.   
ri  c                 C   rX  r6   r'   rY  r'   r'   r(   r[     r\  c                 C   r]  r6   r^  rY  r'   r'   r(   r[     r  c              
   C   s  t  }t }tdt fg}tjtt	dg|d}tdt
dfdt fg}tdt fdt fg}tt|t| }dD ]>}	||	 ||	d (}
tj|
|}|| |  W d    n1 suw   Y  W d    n1 sw   Y  qKt jd	d
d}td	}tjj|d}| |||_t||||}| }||ksJ | jdtdt id}|d d  dksJ tjjddgdd}| |||_t||||}t|  }|d j !tddktddk@ s
J tj|dd}| |||_"t||||}t|  }|d j !tddktddk@ s>J tjj|dd}| |||_t||||}t#j$tj%dd | }W d    n	1 slw   Y  t jdd
d}td}tj&j|d}| |||_t||||}| }||ksJ | jdtdt id}|d d  dksJ tj&jdd}| |||_t||||}t|  }|d j !tddktddk@ sJ tj&|dd|_"t||||}t|  }|d j !tddktddk@ sJ tj&j|dd}| |||_t||||}t#j$tj%dd | }W d    d S 1 sNw   Y  d S )NrN   r   rR   r   rr  r0   )z%directory/2021-05-04 00%3A00%3A00/%24z,hive/date=2021-05-04 00%3A00%3A00/string=%24
/0.featherra   Tr   date_intr   r   逎`r+  r,  2021-05-04 00%3A00%3A00%24r*   r-  +Could not cast segments for partition fieldr   r   )'rS   rT   r   r   r+   r*   r.   r4   r   r   r*  r0   rY   rW   rX   r=  new_filer^   closer   r   r   r_  ra  r   r  r   r   r,   r]  as_pyr7   r   r   r   r   r   r  r?  )r  rU  r_   rV   r*   r4   partition_schemastring_partition_schemafull_schemara   sinkwriterr   r   ra  r   rg  r  r   r   r'   r'   r(   *test_partitioning_factory_segment_encoding  s   














$ry  c                 C   rX  r6   r'   rY  r'   r'   r(   r[  f  r\  c                 C   r]  r6   r^  rY  r'   r'   r(   r[  f  r  c              	   C   s  t  }t }tdt fg}tjtt	dg|d}tdt
dfdt fg}tdt fdt fg}tt|t| }tdt
dfdt fg}	tdt fdt fg}
d	}|| ||d
 (}tj||}|| |  W d    n1 sw   Y  W d    n1 sw   Y  t jddd}td}tjj|d}| |||_t||||}| }||ksJ | jdtdt id}|d d  dksJ tjjdd}| |||_t||||}t|  }|d j !tddktddk@ s#J tj|dd}| |||_"t||||}t|  }|d j !tddktddk@ sWJ tjjdd}| |||_t||||}t|  }|d j !tddktddk@ sJ tj|
dd}| |||_"t||||}t|  }|d j !tddktddk@ sJ tjj|	dd}| |||_t||||}t#j$tj%dd | }W d    d S 1 sw   Y  d S )NrN   r   rR   ztest'; daterr  ztest';[ string'ztest%27%3B%20dateztest%27%3B%5B%20string%27zLhive/test%27%3B%20date=2021-05-04 00%3A00%3A00/test%27%3B%5B%20string%27=%24rj  r   Tr   rk  r   r   rl  r  r,  z2021-05-04 00:00:00$r+  rm  rn  ro  rp  r   )&rS   rT   r   r   r+   r*   r.   r4   r   r   r*  r0   rY   rW   rX   r=  rq  r^   rr  r   r   r?  r_  ra  r   r  r   r   r,   r]  rs  r7   r   r   r   r   r   r  )r  rU  r_   rV   r*   r4   rt  ru  rv  Zpartition_schema_enZstring_partition_schema_enra   rw  rx  r   r   ra  r   rg  r  r   r   r'   r'   r(   ;test_partitioning_factory_hive_segment_encoding_key_encodede  s   















$r{  c              
   C   s   t g dg dd}tt t dt  t dt  g}tt j	 tj
|| d|d W d    d S 1 s=w   Y  d S )Nr   yNr   r}  r  rE   rF   rG   r=  r>  )r+   r4   r   r   r*   r,   r0   r   r   r  rL  rO  r4   r   r'   r'   r(   /test_dictionary_partitioning_outer_nulls_raises  s   $"r  c                 C   sV   t g dg dd}tt t|| d W d    d S 1 s$w   Y  d S )Nr|  r~  rE   zbasename-{i}.arrow)r+   r4   r   r   r   r   rL  )rO  r4   r'   r'   r(   test_positional_keywords_raises  s   "r  c                 C   s   d}t t d|d t|d d}tj|d | | d dgd tj|d |d  | d dgd tj| d dgd	}|d jdksHJ tj| d dd
gd	}|d jdks\J tj| d dgd	}|d jdksoJ d S )Ni   r   r
   )r   r   oner   r  twor   r   r   )	r+   r4   rI  r   aranger]   r  
read_tableZ
num_chunks)rO  Z
BATCH_SIZEr4   r'   r'   r(   test_read_partition_keys_only  s&   


r  c                    s    t  }t fdd|D S )Nc                    s"   g | ]}t jt j |qS r'   )osrb   isdirjoin)rJ   elbasedirr'   r(   rL     rM   z _has_subdirs.<locals>.<listcomp>)r  listdirany)r  elementsr'   r  r(   _has_subdirs  s   
r  c                 C   sZ   t | D ]%}t j| |}t j|r*t||}t|r%t||| q|| qd S r6   )	r  r  rb   r  r  	posixpathr  _do_list_all_dirsr   )r  Zpath_so_farr  r`  Ztrue_nestedZnorm_nestedr'   r'   r(   r    s   
r  c                 C   s   g }t | d| |S )N )r  )r  r  r'   r'   r(   _list_all_dirs  s   r  c                 C   s    t t| }|t |ksJ d S r6   )rx   r  )rO  Zexpected_directoriesZactual_directoriesr'   r'   r(   _check_dataset_directories  s   r  c              
   C   sh   t g dg dd}tt t dt  t dt  g}tj|| d|d t| g d d S )	Nr~  r|  rE   rF   rG   r=  r>  )zx/xzy/yr  )	r+   r4   r   r   r*   r,   r0   rL  r  r  r'   r'   r(   (test_dictionary_partitioning_inner_nulls   s   $r  c              
   C   sl   t g dg dd}tt t dt  t dt  gd d}tj|| d|d t| g d	 d S )
N)r   Nr  r|  rE   rF   rG   r0  r=  r>  )za=x/b=xz	a=xyz/b=yz	a=z/b=xyz)	r+   r4   r   r?  r*   r,   r0   rL  r  r  r'   r'   r(   test_hive_partitioning_nulls  s   r  c                  C   s0  t dt  fdt  fg} ddg}t| }t|tjs J tj| dd}t|tjs/J tj|d}t|tjs=J t	
t t  W d    n1 sQw   Y  t	j
tdd tj| d W d    n1 snw   Y  t	j
tdd tj| | d W d    n1 sw   Y  tj| d	d
}t|tjsJ tj| dd	d}t|tjsJ tjd	d
}t|tjsJ t	
t tj|d	d
 W d    n1 sw   Y  t	j
tdd tj|d	d W d    n1 sw   Y  t	
t tj| dd
 W d    d S 1 sw   Y  d S )Nr   r   Zinferr6  )field_nameszExpected listr   zCannot specify bothr   r  )r7  r  zCannot specify 'field_names')r  r  unsupported)r+   r*   r  rF  r   r   r9   r   r`  r   r   r  r?  )r*   r<  r   r'   r'   r(   test_partitioning_function  s@   

$r  c                 C   s   t t dt t  t  t dt t  t  g}tjj	|d}tj
dd| |d}|jj|ks7J | }|dj|jd sIJ |d dgd	 d
gd	  ks\J |dj|jd sjJ |d dgd	 dgd	  ks}J d S )Nr   r   rR   r   r   rV   r   r   r   r
   r   r   r   r   )r+   r*   r,   rE  rF  r   r0   r   r   r_  r>   r   r   r   r   r   typesr  )r_   r*   r   r>   r4   r'   r'   r(   *test_directory_partitioning_dictionary_key5  s   &*r  c           	      C   s.  t t dt t  t  t dt t  t  g}tjj|d}tj	dd| |d}|j
j|ks7J | }ttdd}ttd	d
}|dj|jd sWJ |djD ]}|j }|  ||ksnJ q]|dj|jd	 s}J |djD ]}|j }|  ||ksJ qd S )Nr   r   rR   r   r   r  i  i  r
      r   )r+   r*   r,   rE  rF  r  r   r?  r_  r>   r   r   rY   r   r   r   r   r  chunksr  sort)	r   r*   r   r>   r4   Zyear_dictionaryZmonth_dictionaryr   r  r'   r'   r(   %test_hive_partitioning_dictionary_keyJ  s.   

r  c                 C   sL   |d u rt tddgd dgd  d}| d }tj|||d ||fS )	N	   r   r   r   r   rE   r  r  r+   r4   r   r]   r^   )base_dirr4   r  rb   r'   r'   r(   rU  f  s
   $rU  c                 C   s   t tddgd dgd  d}| d }t|| t tdddgd dgd  d}| d	 }t|| ||f||ffS )
Nr  r   r   r   r   rE   ztest1.parquetr6  ztest2.parquetr  )r  Ztable1path1Ztable2path2r'   r'   r(   _create_directory_of_filesn  s   $&r  c                 C   sD   | | || fD ]}| j|jsJ || |sJ q
d S r6   )rS  rT  r*   r   r   )r>   r4   r   picklerrc  r'   r'   r(   _check_datasetx  s   r  c                 K   s   t | tjsJ | t| | gt| gfD ]}tj| fi |}t |tjs'J t|||| qt| j	" tj| j
fi |}t |tjsGJ t|||| W d    d S 1 sYw   Y  d S r6   )r9   pathlibPathrI   r   r>   r   r  r	   parentname)rb   r4   r   r  r   ri   r>   r'   r'   r(   _check_dataset_from_path  s   "r  c                 C   s   t | \}}t|||| d S r6   rU  r  rO  r   rU  r4   rb   r'   r'   r(   test_open_dataset_single_file  s   r  c                 C   s"   t | dd\}}t|||| d S )Nr
   r  r  r  r'   r'   r(   test_deterministic_row_order  s   r  c                 C   s(   t | \}}t|}t| ||| d S r6   )r  r+   concat_tablesr  )rO  r   rU  tablesr  r4   r'   r'   r(   test_open_dataset_directory  s   
r  c           
         s   t | \}\}}t|}t||gtt|t|gg}| fdd|D 7 }|D ]}|j|js7J ||}	|	|sCJ q,d S )Nc                    s   g | ]
}   |qS r'   r^  )rJ   rc  rU  r'   r(   rL     s    z3test_open_dataset_list_of_files.<locals>.<listcomp>)	r  r+   r  r   r>   rI   r*   r   r   )
rO  r   rU  r  r  r  r4   Zdatasetsr>   r  r'   r  r(   test_open_dataset_list_of_files  s   

r  c                 C   s   t | \}}t|}t|}|j|jsJ tj|t d}|j|js*J t	t
 tj|t d W d    d S 1 sDw   Y  d S )Nr  )rU  r   r   r>   r*   r   rS   re   r   r   r   rT   )rO  r4   rb   fspathdataset1dataset2r'   r'   r(   #test_open_dataset_filesystem_fspath  s   
"r  c                 C   s   | d }|   t|\}}||}t|}tj|t d}tjt|t|d}	|	|
|}
||||  krP||	  krP||
ksSJ  J d S )Nsingle-filer  )mkdirrU  relative_tor   r>   rS   re   rI   r   rS  rT  r   )rO  r   rU  ra   r4   rb   Zrelative_pathd1d2d3d4r'   r'   r(   test_construct_from_single_file  s   


r  c                 C   s   | d }|   t|\}}t|}tj|t d}tj|jt| d}||}	||}
||}|	|
  kr@|ksCJ  J |||fD ]}|	|
|}|||	ks[J qHd S )Nsingle-directoryr  )r  r  r   r>   rS   re   r  r   r   rS  rT  )rO  r   rU  ra   r  rn   r  r  r  t1t2t3rc  restoredr'   r'   r(   $test_construct_from_single_directory  s   



r  c                    s    d }|   t|\}} fdd|D }t  t|}||}t|ttt|ks3J W d    n1 s=w   Y  tj|t	 d}||}	t|}
||
}tj|t
 d}||}||	  krx|  krx|ks{J  J d S )Nzlist-of-filesc                    s   g | ]}|  qS r'   )r  rh   rO  r'   r(   rL     r3  z5test_construct_from_list_of_files.<locals>.<listcomp>r  )r  r  r	   r   r>   r   r   sumrZ   r   rS   re   )rO  r   ra   r  rn   Zrelative_pathsr  r  r  r  r  r  r  Zt4r'   r  r(   !test_construct_from_list_of_files  s    






*r  c                 C   sJ   ddg}t jtdd tj|| d W d    d S 1 sw   Y  d S )Nr   z!subdir/1/xxx/doesnt-exist.parquetzdoesnt-existr   r  )r   r   r   r   r>   )r_   r   r'   r'   r(   -test_construct_from_list_of_mixed_paths_fails	  s   "r  c                 C   s   t jddg| d}t jd| d}t ||g}t|t jsJ tt| dks+J | }t|dks7J |jdks>J t|j	dksGJ |j	D ]}|j
ddgksUJ qJd S )	Nr   r   r  r   r   r8  r   r   )r   r>   r9   UnionDatasetr   rY   r7   r   r  childrenr   )r_   rF   rG   r>   r4   childr'   r'   r(   (test_construct_from_mixed_child_datasets	  s$   
r  c                  C   s6   t jg dd} |  }|jdksJ |jdksJ d S )Nr=  r   r   )r   r>   r   r   r  )emptyr4   r'   r'   r(   test_construct_empty_dataset-	  s   r  c               	   C   sf   t jg dtdt fdt fgd} tjtdd | 	  W d    d S 1 s,w   Y  d S )Nr=  rF   rV   r*   zMultiple matches for .*a.* in r   )
r   r>   r+   r*   r.   r0   r   r   r  r   )r  r'   r'   r(   *test_construct_dataset_with_invalid_schema4	  s   



"r  c                    s|  t j| tdt  d}t j| tdt  d}tjjtt	dgdgd tjjtt	dgdgd}t
jtdd	 t ||g W d    n1 sQw   Y  d
}t
jt|d	 t g d W d    n1 sqw   Y  d}t
jt|d	 t d  W d    n1 sw   Y  d}t
jt|d	 t  fddt	dD  W d    n1 sw   Y  d}t
jt|d	 t g  W d    n1 sw   Y  d}t
jt|d	 t  |g W d    n1 sw   Y  d}t
jt|d	 t  dg W d    n	1 sw   Y  d}t
jt|d	 t  dg W d    d S 1 s7w   Y  d S )Nr  r   /schemar   rF   r;  rG   z"Expected.*FileSystemDatasetFactoryr   zExpected a list of path-like or dataset objects, or a list of batches or tables. The given list contains the following types: intrX  zbExpected a path-like, list of path-likes or a list of Datasets instead of the given type: NoneTypezcExpected a path-like, list of path-likes or a list of Datasets instead of the given type: generatorc                 3       | ]} V  qd S r6   r'   rJ   r  Zbatch1r'   r(   r)  b	      z<test_construct_from_invalid_sources_raise.<locals>.<genexpr>rD   zEMust provide schema to construct in-memory dataset from an empty listzFItem has schema
b: int64
which does not match expected schema
a: int64z}Expected a list of path-like or dataset objects, or a list of batches or tables. The given list contains the following types:r   zCExpected a list of tables or batches. The given list contains a int)r   r   rS   r   r   r+   r   r  r   r   r   r   r   r>   r  InMemoryDataset)r   child1child2Zbatch2r!  r'   r  r(   )test_construct_from_invalid_sources_raise=	  sd   $r  c                 C   s   t jjt tdgdgd}t j|g}tjg dt 	g d
 }|t g ks,J |||g|gfD ]6}t|}| 
||ksDJ tt| dksPJ t| 
 |ks\J t jt| |ksjJ q4d S )Nr   rF   r;  r=  r  r
   )r+   r   r  r   r   r1   r\   r   r>   r*   r   r4   r   rY   r7   r   r   )r   rd   r4   Zdataset_tablesourcer>   r'   r'   r(   test_construct_in_memory	  s   
r  r   c              	      s   t jjt tdgdgd t j g}d} fddd f fdd jffD ]2\}}tj	j| || d}|
 |ks?J tjt j|d	 |
  W d    n1 sVw   Y  q)d S )
Nr   rF   r;  z#OneShotFragment was already scannedc                      s   t j j gS r6   )r+   RecordBatchReaderr\   r*   r'   rd   r'   r(   r[  	  s    z$test_scan_iterator.<locals>.<lambda>c                      s    fddt dD S )Nc                 3   r  r6   r'   r  r  r'   r(   r)  	  r  z7test_scan_iterator.<locals>.<lambda>.<locals>.<genexpr>r
   )r   r'   r  r'   r(   r[  	  r3  r*   r   r   )r+   r   r  r   r   r1   r\   r*   r   r  r   r   r   r  )r   r4   r   r   r*   r   r'   r  r(   test_scan_iterator	  s"   

r  c                 C   s   t tddgd dgd  d}| d }|  tdD ]}|d	| }|  t|d| d|d
  q|dt j	t
g ddt  d}||fS )Nr  r   r   r   r   rE   zdataset-partitionedrD   zpart={}r  r   r   r
   r   r   )r+   r4   r   r  rV   r]   r^   r  append_columnr   r   rI  r   )r  r4   rb   r&   r   
full_tabler'   r'   r(   _create_partitioned_dataset	  s   $r  c           
      C   s\  t | \}}|ddg}t|||| tjt|tjddd}|j|js*J t	|  tjdtjddd}|j|jsCJ W d    n1 sMw   Y  tjt|dd}|j|jsdJ tjt|tjt
dt
 fgddd}|jt
dt
 }|j|sJ | }|dt
jtg dd	t
 d
}	||	sJ d S )NrF   rG   r   r  r   zdataset-partitioned/r   r  rD   r   )r  r  r  r   r>   rI   r   r*   r   r	   r+   rF  r   r,   r   r  r   r   rI  )
rO  r   rU  r  rb   r4   r>   rb  r  r!  r'   r'   r(   'test_open_dataset_partitioned_directory	  s8   

r  c                 C   s   t | \}}tt|}|j|jsJ tjt|t d}|j|js*J t|  tjdt d}W d    n1 sBw   Y  |j|jsPJ t	
t tjt|t d W d    d S 1 slw   Y  d S )Nr  r  )rU  r   r>   rI   r*   r   rS   re   r	   r   r   r   rT   )rO  r4   rb   r  r  dataset3r'   r'   r(   test_open_dataset_filesystem	  s   
"r  c                 C   sP   t | \}}tjtdd tj|gdd W d    d S 1 s!w   Y  d S )Nz format 'blabla' is not supportedr   Zblablar   )rU  r   r   r  r   r>   )rO  r  rb   r'   r'   r(   $test_open_dataset_unsupported_format	  s   "r  c                 C   s`   t | \}}t|}t||g}t|tjsJ |||}||||ks.J d S r6   )rU  r   r>   r9   r  rS  rT  r   )rO  r   rU  r  rb   r>   unionr  r'   r'   r(   test_open_union_dataset	  s   
r  c                 C   sT   t jd| dd}tjtdd t j|gdd W d    d S 1 s#w   Y  d S )Nr  r   r  zcannot pass any additionalr   r   )r   r>   r   r   r  )r   r  r'   r'   r(   .test_open_union_dataset_with_additional_kwargs
  s   "r  c                   C   s|   t t tjddd W d    n1 sw   Y  t jtjdd tjddd W d    d S 1 s7w   Y  d S )Nzi-am-not-existing.arrowr=  r   zcannot be relativer   zfile:i-am-not-existing.arrow)r   r   r   r   r>   r+   r  r'   r'   r'   r(   #test_open_dataset_non_existing_file
  s   "r  r   ra   r   r1  r0  partition_keysABCrX  )DEFr:  )r
   NrD   )r  Nr  )Nr   rD   c                    sl  t tddgd dgd  d}d |d v pd |d v }|d	kr&|r&d S |d	kr9tjjd
dg d}d}d }n|rDtjj |d}ntjj d}d}|rR|}nd}| d }	|	  |\}
}|
D ]!}|D ]}|	||pn||pq| }|jdd t	
||d  qfqbtjt|	|d} fdd}|jt d
||
d t d||d }|j|sJ d S )Nr  r   r   r   r   rE   r   r
   ra   part1part2re  z{0}/{1})rd  r1  zpart1={0}/part2={1}Z__HIVE_DEFAULT_PARTITION__r>   T)parentsr  r  c                    sH    rt | trt nt }tt |S t | tr t S t S r6   )r9   rI   r+   r0   r   rE  )r   Z
value_typere  r'   r(   rh  J
  s   z/test_partition_discovery.<locals>.expected_type)r+   r4   r   r   r   r_  r?  r  rV   r]   r^   r>   rI   r*   r   r,   r   )rO  r   r1  rd  r  r4   Zhas_nullfmtZ
null_valueZbasepathZ
part_keys1Z
part_keys2r  r  rb   r>   rh  rb  r'   re  r(   test_partition_discovery
  sT   $r  c           	      C   sV  t tddgdtdd}tj|dgjdd}tj	|| |d	d
 tj
| d	tjjddd}t |d |d  d}| |sIJ t| d }|j|jd|d d saJ |j}|||}| |suJ |||}|j|jd|d d sJ |j|jd |d d  sJ |j|sJ d S )Nr  r  r   r   r   r  r   r   r  rS  r   rV   Tre  r>  r  )r  r   r   rR   )r+   r4   r   rI  r   r   r   r  r*   rL  r>   r?  r_  r  r   r   rY   r7   r   rS  rT  r  )	rO  rU  r4   r   r>   r!  r?   Z	part_exprr  r'   r'   r(   4test_dataset_partitioned_dictionary_type_reconstructX
  s,      r  c                 C   s   ddl m} | d \}}}}d||||}||\}}|d tdg di}	|d}
t	|	|
 W d    n1 sAw   Y  |	|||||||fS )	Nr   
FileSystem
connectionz_s3://{}:{}@mybucket/data.parquet?scheme=http&endpoint_override={}:{}&allow_bucket_creation=TruemybucketrF   rX  zmybucket/data.parquet)
rw   r  rV   from_urirW   r+   r4   rX   r]   r^   )	s3_serverr  r  r  r  r  r  rS   rb   r4   rc   r'   r'   r(   r  w
  s   
r  c                 C   s^   | \}}}}}}}}t j|dd}|||sJ t j|d|d}|||s-J d S )Nr   r   rV   r   )r   r>   r   r   )r  r   r4   rb   rS   r  r  r>   r'   r'   r(   test_open_dataset_from_uri_s3
  s
   r	  c                 C   s   | \}}}}}}}}t d}ddlm}	m}
 |j||dd||id}tj|d|d}|	 
|s6J |
|	|}tj|d|d}|	 
|sMJ d S )	Ns3fsr   )FSSpecHandlerrf   Zendpoint_urlzhttp://{}:{})r   secretZclient_kwargsr   r  )r   importorskiprw   r  rf   S3FileSystemrV   r   r>   r   r   )r  r4   rb   r  r  r  r  r  r
  r  rf   rS   r>   r'   r'   r(   $test_open_dataset_from_uri_s3_fsspec
  s   
	r  c                 C   s.  ddl m} | d \}}}}d}d}d||||||}||\}	}|dks)J |	| tdg d	i}
|	|}t	|
| W d    n1 sMw   Y  t
j|d
d}| |
sbJ d||||}g d}|D ]\}}||}t
j||d
d}| |
sJ qptjtjdd |d}t
jd|d W d    n1 sw   Y  d}d}||}tt}t
jd|d W d    n1 sw   Y  t|j|d||ksJ d}||}tt}t
jd|d W d    n	1 sw   Y  t|j|d||ksJ d S )Nr   r  r  theirbucketnested/folder/data.parquetzOs3://{}:{}@{}/{}?scheme=http&endpoint_override={}:{}&allow_bucket_creation=truez&theirbucket/nested/folder/data.parquetrF   rX  r   r   3s3://{}:{}@{{}}?scheme=http&endpoint_override={}:{}))ztheirbucket/nested/folder/z/data.parquet)ztheirbucket/nested/folderdata.parquet)ztheirbucket/nested/folder/data.parquet)ztheirbucket/nestedr  )r  z/nested/folder/data.parquet)r  r  r  zMissing bucket namer   /z'/theirbucket/nested/folder/data.parquetr  zThe path component of the filesystem URI must point to a directory but it has a type: `{}`. The path component is `{}` and the given filesystem URI is `{}`ztheirbucket/doesnt/existr  ZNotFoundFile)rw   r  rV   r  rW   r+   r4   rX   r]   r^   r   r>   r   r   r   r   r  r  rI   r   )r  r  r  r  r  r  Zbucketrb   r  rS   r4   rc   r>   templater  prefixr  excr'   r'   r(   -test_open_dataset_from_s3_with_filesystem_uri
  sZ   




"r  c                 C   sD   t | \}}td}|d}tj||d}|j|js J d S )Nfsspecfiler  )rU  r   r  r   r   r>   r*   r   )rO  r4   rb   r  rk   r>   r'   r'   r(   test_open_dataset_from_fsspec
  s
   

r  c           	      C   s   t d}tdg di}| d }t|| |d}|| d ds)J t	
 }tt|}|||}||jsCJ |||}|j|jsRJ d S )Nr  rF   rX  r  r  r   )r   r  r+   r4   r]   r^   r   Zlsendswithr   r   rS   rf   r  r  r   r*   r   r  )	rO  r  r4   rb   	fsspec_fsrV   r   r*   r?   r'   r'   r(   test_file_format_inspect_fsspec  s   

r   c                 C   s  | d }t ddgd tdd}tj|dgjdd	}tj|||d
d tjt dt dfgdd	}tj	|d
|d}t
dtdk}|j||d}|d g dks]J dd l}t
d|dddk}|j||d}|d g dksJ d S )NZtest_partition_timestamps
2012-01-01z
2012-01-02r   r   )datesr  r"  r   r  rS  r   rr  r>  r   r  )r
   rD   r   r  r  r   i  r
   )r+   r4   r   r   r   r  r*   rL  r*  r>   r,   r   	Timestampr   r   r  r   )rO  r   rb   r4   r   r>   r  r   r'   r'   r(   test_filter_timestamp  s$   
r$  c                 C   sh   t dt jg dt  di}t| |\}}tt|}tddk}t	|j
||ddks2J d S )NrF   )r   r
   r   rD   r   r   r   r   r   rD   )r+   r4   r   rF  rU  r   r>   rI   r,   r   r   )rO  r   r4   r  rb   r>   filter_r'   r'   r(   test_filter_implicit_cast:  s
    r&  c                 C   s^   t dg di}t| |\}}tt|}|j|tdtd kd}|j	dks-J d S )Nr  )rF   rG   Nr   r   )
r+   r4   rU  r   r>   rI   r   r,   r   r   )rO  r   r4   r  rb   r>   r'   r'   r(   test_filter_equal_nullE  s   r'  c           	      C   s   t g ddd tdD dd tddD d}t| |\}}tt|}tt	d	t 
d
dg}|j||djdksBJ tt	ddk}|j||djdksXJ tt	dt	d}|j|d|id}|d  g dksyJ d S )N)rF   rG   NrF   rd  c                 S   s   g | ]
}t  d dd|qS i  r
   r.  r  r'   r'   r(   rL   V      z2test_filter_compute_expression.<locals>.<listcomp>r   c                 S   s   g | ]	}t  d d|qS r(  r.  r  r'   r'   r(   rL   W  rm   r
   r[  r  r  rF   rG   r   rD   r  r   r  r   r   r   )r+   r4   r   rU  r   r>   rI   pcZis_inr,   r   r   r   hourZdays_betweenr  )	rO  r   r4   r  rb   r>   r%  r   r  r'   r'   r(   test_filter_compute_expressionR  s   r,  c                 C   s   t j| tdt  d}t |g}t| dksJ tdd | D s*J | d 	|
 s7J |
 	|
 sBJ t| t jsLJ d S )Nr  r   r
   c                 s   s    | ]	}t |tjV  qd S r6   )r9   r+   r   )rJ   rr  r'   r'   r(   r)  p  s    z%test_dataset_union.<locals>.<genexpr>r   )r   r   rS   r   r   ZUnionDatasetFactoryr   r  rA  r   r  r9   r   r   )r   r  r   r'   r'   r(   test_dataset_uniong  s   
r-  c                 C   s  t jd|dd}t jd|dddgd}t jd|dd	d}|j|j  kr*|jks-J  J t |||g}t|t js=J d
}tjt|d t j||g|d W d    n1 sZw   Y  tdt	 fdt
 fdt fdt fdt fdt fdt fg}|j|sJ | j|sJ t ||g}tdt	 fdt
 fdt fdt fdt fdt fg}|j|sJ | j|sJ tdt fdt fdt	 fg}t j||g|d}| j|sJ tdt fdt fdt fg}t j||g|d}| j|s$J tjtddgd dgd  dgg dd}t| |d\}	}
t |
}tjtjdd t ||g W d    d S 1 scw   Y  d S )Nr  r   r  r  weekr   r   rV   r   /hiver   z$cannot pass any additional argumentsr   r  r   r   r   r   r   rR   r  r  r   r   r   r   Z	abcdefghj)r   r   r   r;  r  zUnable to merge)r   r>   r*   r9   r  r   r   r  r+   r-   r.   r/   r0   r   r   r   r4   r   rU  ZArrowTypeError)rO  r   r  r  Zchild3Z	assembledmsgrb  r4   r  rb   Zchild4r'   r'   r(   &test_union_dataset_from_other_datasetsv  st   

"






	











 
$r2  c                 C   sJ   d}t jt|d tjg d| d W d    d S 1 sw   Y  d S )Nz8points to a directory, but only file paths are supportedr   )r  r  r0  r  )r   r   IsADirectoryErrorr   r>   )r   r1  r'   r'   r(   4test_dataset_from_a_list_of_local_directories_raises  s   "r4  c              
   C   s   t t jd| dt jd| dt jd| dg}tdt fdt fdt fdt fg}|j|s8J t t jd| dt jd| dt jd| d	d
g}tdt fdt fdt fdt fdt	 fdt	 fg}|j|s{J d S )Nr  r  r  r0  r   r   r   r   r   )r   r   r   r   )
r   r>   r+   r*   r-   r.   r/   r0   r   r   )r   r>   rb  r'   r'   r(   &test_union_dataset_filesystem_datasets  s4   









r5  c                    s  t g dg dd}t|d  d fdd	}d }|}||||jd |j}|}||| t dd	g}t jg dg dgd
dgd}||| t d	g}t jg dgdgd}||| t d	dg}t jg dt jg dddgddgd}||| t ddg}tjtd |d}t j|d 	d|d
 gdd
gd}||| t dt 
t  fdg}tjtd |d}|j|sJ tjtdd  | W d    d S 1 sw   Y  d S )NrX  rH  rI  rJ  rE   r  c                    s\   t jtd | d}|d ur|j|sJ n|j| s J  |}||s,J d S )Nr  rR   )r   r>   rI   r*   r   r   )r*   r!  rb  r>   r  r   rO  r'   r(   r    s   
z-test_specified_schema.<locals>._check_dataset)rb  )rG   r/   )rF   r.   rG   rF   r;  )rd  r   NNNr   r   rd  )rF   r   rR   z#Unsupported cast from int64 to listr   r6   )r+   r4   r]   r^   r*   r   r   r>   rI   r]  list_r   r   r   r   rP  r   )rO  r   r4   r  r*   r!  r>   r'   r7  r(   test_specified_schema  sL   






"r:  c                 C   s   | d }t dg di}t|| t dt  fg}tjt|gd |d}|j	|s1J |
|}tjtdd | }|  W d    d S 1 sQw   Y  d S )Nr  rF   rX  d   rR   z#Unsupported cast from int64 to nullr   )r+   r4   r]   r^   r*   re  r   r>   rI   r   r   r   r   rP  r  r  )rO  r   fnr4   r*   r>   r   r   r'   r'   r(   test_incompatible_schema_hang  s   

"r=  c           	      C   s   t t jg dddt jg dddd}t| d }t |}t ||j}|| d  |	  W d    n1 s@w   Y  t
j|t
 d	}||}||sZJ t| d
D ]}t
j||d	}||}||suJ q`d S )NrX  rF  r   r6  r/   rE   z
test.arrowr   r   )r=  arrow)r+   r4   r   rI   Zoutput_streamZRecordBatchFileWriterr*   Zwrite_batchr   rr  r   r>   r   r   r   rA   )	rO  r   r4   rb   rw  rx  r>   r  
format_strr'   r'   r(   test_ipc_format*  s$   


r@  c              	   C   s  ddl m} ttjg dddtjg dddd}t| d	 }||| tj|t	 d
}t
| }t|d tjsAJ ||}|jdd ||sSJ t| tj|dd
}||}|jdd ||spJ |j|dgd}|jdd ||dgsJ |j|dtdd id}|jdd |tdtjg dddisJ ||dksJ |j|tddkddksJ d S )Nr   orcrX  rF  r   r6  r/   rE   test.orcr   T)fullrB  rG   r   b2r   )rI  rK  g333333?rD   rF   r   r
   )r   rB  r+   r4   r   rI   r^   r   r>   r  rY   r7   r9   ZFileFragmentr   validater   rA   r  r,   r   )rO  r   rB  r4   rb   r>   r   r  r'   r'   r(   test_orc_format@  s:   

$rG  c                 C   s   ddl m} ttjg dddtjg dddd}t| d	 }||| tj|d
d}t	|
|}t|dks>J |d jdksGJ |d |
 d sTJ d S )Nr   rA  rX  rF  r   r6  r/   rE   rC  rB  r   r
   rD   )r   rB  r+   r4   r   rI   r^   r   r>   rY   r   r   r   r   )rO  r   rB  r4   rb   r>   r  r'   r'   r(   test_orc_scan_optionsg  s   rH  c                  C   sh   z	ddl m}  W d S  ty3   tjtdd tjddd W d    Y d S 1 s+w   Y  Y d S w )Nr   r  z'not built with support for the ORC filer   r  rB  r   )r  r  r  r   r   r  r   r>   rI  r'   r'   r(   test_orc_format_not_supported~  s   &rJ  c                  C   s   t jtdd tjtdtdiddd W d    n1 s!w   Y  t } t jtdd | 	  W d    d S 1 sAw   Y  d S )Nz9Writing datasets not yet implemented for this file formatr   rF   r   rB  z/tmp)rV   r  )
r   r   rP  r   rL  r+   r4   r   r  make_write_options)Zofr'   r'   r(   +test_orc_writer_not_implemented_for_dataset  s   
"rL  c                 C   s   t t jg dddt jg dddd}t| d }| j|dd	 tj|t d
}|	|}|
|s:J t| tj|dd
}|	|}|
|sQJ d S )NrX  r.   r   r6  r/   rE   test.csvFr   r   r  )r+   r4   r   rI   r  to_csvr   r>   r  r   r   rA   )rO  r   r4   rb   r>   r  r'   r'   r(   test_csv_format  s   

rP  compression)bz2gziplz4zstdc                 C   s   t j|std| ttjg dddtjg dddd}t	
 }|dkr.|nd	}t| d
|  }|j||d}| jdd}||d W d    n1 s[w   Y  tj|t d}	||	}
|
|suJ d S )Nz{} support is not builtrX  r.   r   r6  r/   rE   rS  gzz	test.csv.rQ  FrN  r  r   )r   Codecis_availabler   skiprV   r+   r4   r   rS   re   rI   rX   r  rO  writer  r   r>   r  r   r   )rO  rQ  r   r4   r   suffixrb   rw  Zcsv_strr>   r  r'   r'   r(   test_csv_format_compressed  s   
r]  c              	   C   s  t | d }t|d}|d W d    n1 sw   Y  tj|dd}||}|tdt	g dis=J tj|tj
tjjdd	d
d}||}|tdt	ddgiscJ tj|tj
tjjdgdd
d}||}|tdt	g disJ d S )NrM  wzskipped
col0
foo
bar
r  r   skipped)col0r  barr
   )r  r  r`  r  ra  r  )r_  r`  r  ra  )rI   r8   r[  r   r>   r   r   r+   r4   r   r  r  r  )rO  r   rb   rw  r>   r  r'   r'   r(   test_csv_format_options  s*   



"


rc  c              
   C   s   t | d }t|d}|d W d    n1 sw   Y  tj|tjtjjdddd}|	|}g d}|j
|ks@J |ttd	gtd
gtdgtd	gds_J d S )NrM  r^  z1,a,true,1
T)Zautogenerate_column_namesr  r   )Zf0r   r:  rO  r
   rF   )rI   r8   r[  r   r>   r  r+   r  r  r   r  r   r4   r   )rO  r   rb   rw  r>   r  Zexpected_column_namesr'   r'   r(   (test_csv_format_options_generate_columns  s   





rd  c           	   	   C   s*  t | d }t|d}|d W d    n1 sw   Y  tj|dd}tjjdgdd}tj|t	jj
d	d
d}|j||d}|t	dt	g disTJ tj|d}tj||d}||}|t	dt	g diswJ t }|j||d}|t	dt	g disJ d S )NrM  r^  zcol0
foo
spam
MYNULL
r  r   MYNULLT)Znull_valuesr  r  r  )r  rs  )Zfragment_scan_optionsr`  )r  spamNr  )r  rf  re  )rI   r8   r[  r   r>   r   r  r  r  r+   r  r   r   r4   r   r  )	rO  r   rb   rw  r>   r  r   r  r  r'   r'   r(   test_csv_fragment_options  s.   
"
"rg  c                 C   s   t t jg dddt jg dddd}t| d }| jdd	d
d dd}t|d}|| W d    n1 sAw   Y  t	j
|t	 d}||}||s[J t| t	j
|dd}||}||srJ d S )NrX  r.   r   r6  r/   rE   	test.jsonrecordsZorientr
   rZ  },{}
{r^  r   r  )r+   r4   r   rI   r  to_jsonreplacer8   r[  r   r>   r  r   r   rA   rO  r   r4   rb   rc   r`  r>   r  r'   r'   r(   test_json_format  s    

rp  c                 C   s  t t jg dddt jg dddd}t| d }| jdd	d
d dd}t|d}|| W d    n1 sAw   Y  t	j
tdd tj|tjt jjdddd}W d    n1 shw   Y  tj|tjt jjdddd}||}||sJ d S NrX  r.   r   r6  r/   rE   rh  ri  rj  r
   rZ  rk  rl  r^  ztry to increase block sizer   r   r  r  r   @   )r+   r4   r   rI   r  rm  rn  r8   r[  r   r   r  r   r>   r  r  r  r   r   ro  r'   r'   r(   test_json_format_options  s(    



rs  c           	      C   s*  t t jg dddt jg dddd}t| d }| jdd	d
d dd}t|d}|| W d    n1 sAw   Y  t	j
tdd tjt jjddd}tj|t|d}W d    n1 smw   Y  tjt jjddd}tj|t|d}||}||sJ d S rq  )r+   r4   r   rI   r  rm  rn  r8   r[  r   r   r  r   r  r  r  r>   r  r   r   )	rO  r   r4   rb   rc   r`  r   r>   r  r'   r'   r(   test_json_fragment_options0  s,    
rt  c              	   C   s   t | d }dD ]^\}}t|d}|| W d    n1 s!w   Y  tdt fdt fg}tjdgdgd|d	}tjj|d
}t	j
|d}	t	j||	d}
|
j|s]J |
 |sfJ qd S )NrM  ))latin-1s   a,b
un,lphant)utf16s    a , b 
 u n ,  l  p h a n t wbrF   rG   un
   éléphantrE   rR   encodingr  r   )rI   r8   r[  r+   r*   r0   r4   r  r  r   r  r>   r   r   )rO  r   rb   r{  Z
input_rowsrw  rb  expected_tablers  r   dataset_transcodedr'   r'   r(   test_encodingG  s"   r~  c           
      C   s  t | d }t|d}|d W d    n1 sw   Y  tdt fdt fg}tjdgdgd|d	}tj|d
|d}t	j
tjjdd || W d    n1 s\w   Y  tjjdd}tj|d}tj||d}	|	j|s}J |	 |sJ d S )NrM  rw  s   ,b
un,lphant   érG   rx  ry  )r  rG   rR   r  r  zinvalid UTF8r   ru  rz  r  r   )rI   r8   r[  r+   r*   r0   r4   r   r>   r   r   r   r  r  r   r  r  r  r   )
rO  r   rb   rw  rb  r|  r>   rs  r   r}  r'   r'   r(   test_column_names_encoding`  s&   r  c                 C   sT  ddl m} ttjg dddtjg dddd}| d	 }|  ||t|d
  tj|t	 d}|
|}||sBJ t| tj|dd}|
|}||sYJ |j
|ddgd}|jddgkskJ |j
|ddgd}|jddgks}J ||t|d dd tt |
tj|dd W d    d S 1 sw   Y  d S )Nr   )rV  rX  rF  r   r6  r/   rE   Zfeather_datasetrR  r   rS  rG   rF   r   zdata1.featherr
   version)pyarrow.featherrV  r+   r4   r   r  rI   r   r>   r   r   r   rA   r  r   r   r  )rO  r   rV  r4   r  r>   r  r'   r'   r(   test_feather_formatx  s,   

"r  )rT  rU  brotlic                 C   s  t t jdgd ddt jg dd ddd}t j|s#t  | d	 }|  t	 }| d
 }|  tj
|t|d ||jd dd |dkrtjtdd |j|d}W d    n1 sdw   Y  tjtdd t |}|j|d}W d    d S 1 sw   Y  d S |j|d}tj
|t|d ||d tj|t	 d}	||	}
|
|sJ |d d }| j}|d d }| j}||k sJ d S )Nr   i,  rF  r   r6  r;  r/   rE   Zfeather_dataset_compressedZfeather_dataset_uncompressedz
data.arrowrW  rV   file_optionsr  zCompression typer   r   part-0.arrow)r+   r4   r   rX  rY  r   rZ  r  r   r   rL  rI   rK  r   r  r>   r   r   statst_size)rO  rQ  r   r4   r  r   Zuncompressed_basedirwrite_optionscodecr>   r  Zcompressed_fileZcompressed_sizeZuncompressed_fileuncompressed_sizer'   r'   r(   test_feather_format_compressed  sX   







r  c                 C   sj   g }t dD ]}t|gd tjdd}tj|t| |d qt| d }tj	|j
||d ||fS )zO
    Creates a simple (flat files, no nested partitioning) Parquet dataset
    r   r   r   metadata_collector	_metadata)r   r+   r4   r   rG  rH  r]   r  rI   write_metadatar*   )	root_pathr  r&   r4   metadata_pathr'   r'   r(   _create_parquet_dataset_simple  s   
r  c                 C   s\   | d }t |\}}t|}|j|jsJ t|jdks!J | }|jdks,J d S )Nr  r   (   )	r  r   parquet_datasetr*   r   r   r   r   r   )rO  r  r  r4   r>   r  r'   r'   r(   test_parquet_dataset_factory  s   
r  win32z'Results in FileNotFoundError on Windows)reasonc           	      C   s   t d}| d }t|\}}|d}tt|}tj||d}|j	
|j	s,J t|jdks5J | }|jdks@J d S )Nr  r  r  r  r   r  )r   r  r  r   rS   rf   r  r   r  r*   r   r   r   r   r   )	rO  r  r  r  r4   r  r   r>   r  r'   r'   r(   #test_parquet_dataset_factory_fsspec  s   

r  c                 C   s   | d }t dgd tjdd}g }tj|t||d t|d }tj|j	||d t
|}|j	|j	s<J | }|jdksGJ d S )Nr  r   r   r   r  r  )r+   r4   r   rG  rH  r]   r  rI   r  r*   r   r  r   r   r   )rO  r  r4   r  r  r>   r  r'   r'   r(   &test_parquet_dataset_factory_roundtrip  s   	

r  c           	   	   C   s   g }t dD ]-}tdtt |d |d d i}| | d }tj|||d |d | d qt| d }t|j	|| t
|}| }|d }|tt dd	ks]J d S )
Nr   r   r
   z.parquetr  rZ  r  r   r;  )r   r+   r4   rY   r]   r^   set_file_pathrI   r  r*   r   r  r   r   r  )	rO  Z	metadatasr&   r4   Z
table_pathr  r>   Zscanned_tableZscanned_colr'   r'   r(   "test_parquet_dataset_factory_order"  s   
r  c                 C   s   | d }t |\}}t|dd   t|}|j|js#J t|j	dks,J t
t |  W d    d S 1 sAw   Y  d S )NZtest_parquet_dataset_invalid	*.parquetr   r   )r  rY   globunlinkr   r  r*   r   r   r   r   r   r   r   )rO  r  r  r4   r>   r'   r'   r(   $test_parquet_dataset_factory_invalid8  s   

"r  c                 C   sz   t t| d}t|d j }g }|D ]}t|j}|t	|
|  || q| d }tj|||d |S )Nr  r   r  r  )rY   r   rglobr]   ZParquetFiler*   Zto_arrow_schemar  r  rI   r  r   r  )r  Zparquet_pathsr*   r  rb   r  r  r'   r'   r(   _create_metadata_fileF  s   r  c              	   C   sr   t jt tdt tjdt tddgdgg dd}|ddi}t	j
|t| d	gd
 t| |fS )Nr8  rF   rG   r   r9  r;  r   r   r   r  )r+   r4   r   r   r   rG  rH  rI  r2   r]   r  rI   r  )r  r4   r'   r'   r(   #_create_parquet_dataset_partitionedX  s   r  c                 C   s   | d }t |\}}tjdd}tj||d}|j|js J t|jdks)J | }|j	dks4J |
 djdd	}|
 }tj|| d S )
N(test_parquet_dataset_factory_partitionedr   r  r  r   r8  r   TZdrop)r  r   r   r  r*   r   r   r   r   r   r  sort_valuesreset_indexr   ZtestingZassert_frame_equal)rO  r  r  r4   r   r>   r  r!  r'   r'   r(   r  c  s   r  c                 C   sh   | d }t |\}}tj|dd}|j|jsJ d|jjv s"J t| }d|d jjv s2J d S )N%test_parquet_dataset_factory_metadatar   r     keyr   )	r  r   r  r*   r   r  rY   r7   r  )rO  r  r  r4   r>   r   r'   r'   r(   r  w  s   r  c           
      C   sX  |\}}| d }t |\}}||g tj|tjdd|d}W d    n1 s*w   Y  |g  t| }W d    n1 sDw   Y  |g  t|tddk W d    n1 sdw   Y  |g  |d tddk W d    n1 sw   Y  |g  |d  }	|	d   W d    d S 1 sw   Y  d S )N#test_parquet_dataset_lazy_filteringr   r  )r   r   r      r   )	r  r   r  r   rY   r7   r,   r   r  )
rO  r|   rS   rv   r  r  r  r>   r   Zrg_fragmentsr'   r'   r(   r    s.   




"r  c                 C   sp   t dg di}| d }|| t|}||j}|j|dgdj}d|jv s-J |j|dds6J d S )NrF   rX  r  r   s   pandasTr  )	r   r    r<  r   r>   r   r*   r  r   )rO  r   r3   rb   r>   r*   r  r'   r'   r(   test_dataset_schema_metadata  s   

r  c                 C   s   t dt jg dddi}t|t| d  t dt  fg}tj	| d d|d}|j
|tddkd	}|d |d d
dsIJ t| d }|j
|tddk|d}|d |d d
dsoJ d S )Nr  rL  r   r   r  r   r  r   r   r.   r   r>  )r+   r4   r   r]   r^   rI   r*   r.   r   r>   r   r,   r   r]  r  rY   r7   )rO  r   r4   r*   r>   filteredr?   r'   r'   r(   test_filter_mismatching_schema  s   
"&r  c                 C   s   t d ttdd}t| d }tj||dgd tj	|dd}|
|}|j
|dgd	}|d|ds>J d S )
Nza a b br   r  r  r   r  r   r  r   )r+   r4   r  rY   r   rI   r]   r  r   r>   r   r   r   )rO  r   r4   rb   r>   Zall_colsZ	part_onlyr'   r'   r(   +test_dataset_project_only_partition_columns  s   
r  c                 C   s   t dtjg dddi}| d }|j|dd tj|dtdt	 fgd	}t
dtg dt	 i}|||sBJ d S )
Nr  r8  objectZdtypez(test_dataset_project_null_column.parquetr   r;  r   r  )r   r    r   r   r<  r   r>   r+   r*   r.   r4   r   r   )rO  r   r3   r`  r>   r!  r'   r'   r(    test_dataset_project_null_column  s   r  c                 C   s   ddl m} tg dg dg dd}||| d  tj| d dd	}|j|td
tdj	dddtddkdd}tg dtj
g dddg dd}||s\J tjtdd |j|d
d
id W d    d S 1 sxw   Y  d S )Nr   rS  rX  )r   r   r   r  r  rR  rS  r   r  r  r   Fsafer  rF   )Z	A_renamedZB_as_intZC_is_ar   r   )TFFzExpected an Expressionr   )r   rS  r+   r4   rV  r   r>   r   r,   r]  r   r   r   r   r   )rO  r   rS  r4   r>   r  r!  r'   r'   r(   test_dataset_project_columns  s$   
"r  c           	      C   sr  t | \}}t|}t|jtjsJ t| \}}t|}t|jtjs(J tj|dd}|j}|d us8J t|tjs@J |jt	dt	
 fgksOJ t|jdksXJ |jd t	g dt	
 ksiJ tjt	dt	
 fgdd}t|tjsJ t|jdksJ tdd	 |jD sJ tj||d}|j}t|tjsJ |jt	dt	
 fgksJ t|jdksJ td
d	 |jD sJ tj|dd}tjt| |j|j|jd}|jd u sJ | d }t|\}}tj|dd}|j}|d usJ t|tjsJ |jt	dt	 fgksJ t|jdks'J t|jd  ddhks7J d S )Nr   r  r   r
   r   r  r  c                 s   r(  r6   r'   r   r'   r'   r(   r)  -  r*  z6test_dataset_preserved_partitioning.<locals>.<genexpr>c                 s   r(  r6   r'   r   r'   r'   r(   r)  4  r*  r   zdata-partitioned-metadatarF   rG   )rU  r   r>   r9   r   r   r  r?  r*   r+   r   r   r7  r   rA  r   rY   r7   rV   r   r  r  r0   rx   r  )	rO  r  rb   r>   r  r   r  r  r  r'   r'   r(   #test_dataset_preserved_partitioning  sL   

" $r  c                 C   s   t t dt  t dt t  t  g}t jg dtt	dd|d}t
| d }tj||dgd t| d }|d |d ksOJ |d|ds\J d S )	Nr  r   )NNrF   rF   r   r  rR   r  r  )r+   r*   r,   r.   rE  r   r0   r4   rY   r   rI   r]   r  r  r   r  r   )rO  r*   r4   rb   Zactual_tabler'   r'   r(   +test_write_to_dataset_given_null_just_worksL  s    

r  c                 C   s2   dd l m} |j| ||dfgd}|| |S )Nr   	ascending)r   )pyarrow.computecomputeZsort_indicesZSortOptionsr   )tabsort_colr*  Zsorted_indicesr'   r'   r(   _sort_tablea  s
   r  c                 C   st   |p|}t j| |d|dd t|d}t|t|ksJ t j|d|d}t| |t|  |s8J d S )Nr>  FrV   r   r   *r>  )	r   rL  rY   r  rx   r>   r  r   r   )r>   r  expected_filesr  Zbase_dir_pathr   
file_pathsr  r'   r'   r(   _check_dataset_roundtriph  s   
r  c                 C   s   | d }|   t|}t|}| d }|d g}t|t||d| | d }|d g}t|||d| | d }|   t|}t|}| d }|d g}t|t||d| d S )Nr  zsingle-file-targetr  rF   zsingle-file-target2r  zsingle-directory-target)r  rU  r   r>   r  rI   r  )rO  ra   r  r>   targetr  r'   r'   r(   test_write_dataset{  s"   





r  c                 C   s   | d }t |}tjdd}tj||d}| d }|d |d d |d |d d g}tjtd	t fgdd}t|t||d
||d | d }|d |d d |d |d d g}ttd	t fg}t|t||d
||d d S )Npartitionedr   r  r  zpartitioned-hive-targetpart=ar  part=br   r   partitioned-dir-targetrF   rG   )	r  r   r   r>   r+   r*   r0   r  rI   )rO  ra   r  r   r>   r  expected_pathsrN  r'   r'   r(   test_write_dataset_partitioned  s4   
r  c                    s   t g dg dd}tj| ddgd tj ddgd}|j} fdd|D }|h d	ks3J | }||s>J d S )
Nr|  r~  rE   r=  rG   r>  c                    "   h | ]}t t| jqS r'   rI   r  r  r  r  r_  r  r'   r(   rl         z6test_write_dataset_with_field_names.<locals>.<setcomp>>   r}  r   r  r+   r4   r   rL  r>   r   r   r   rO  r4   rP  r   Zpartitioning_dirsrQ  r'   r  r(   #test_write_dataset_with_field_names  s   

r  c                    s   t g dg dd}tj| ddgdd tj ddd}|j} fd	d
|D }|h dks3J | }||s>J d S )Nr|  r~  rE   r=  rG   r   )rV   r   partitioning_flavorr>  c                    r  r'   r  r_  r  r'   r(   rl     r  z;test_write_dataset_with_field_names_hive.<locals>.<setcomp>>   zb=xzb=yzb=zr  r  r'   r  r(   (test_write_dataset_with_field_names_hive  s   

r  c                 C   s   t g dg dg dd}tj|| ddgd tj| ddgd}t 5}tj|jddgd	|ddgd tj|ddgd}| }t	|
 |d

 ksSJ W d    d S 1 s^w   Y  d S )Nr|  r~  rX  r  r=  rG   r>  rd  r   rF   )r+   r4   r   rL  r>   rJ  rK  r   r   r  r   Zdrop_columnsrO  r4   r>   Ztempdir2rP  rQ  r'   r'   r(   test_write_dataset_with_scanner  s"   



"r  c           	         sH  t  G fdddt}t|t ttdt	 g}tj
tttdg|d dd}dd	 fd
d}tjj| |d	dt jfddd}|  z;t fdd}d}d}| dk r|kr~|kr|d	}n}td | dk sq|sJ W d  |  d S d  |  w )Nc                       s   e Zd Z fddZdS )z6test_write_dataset_with_backpressure.<locals>.GatingFsc                    s       | jj||dS )Nr  )waitrq   rX   )rs   rb   r  consumer_gater'   r(   rX     s   zItest_write_dataset_with_backpressure.<locals>.GatingFs.open_output_streamN)r   r   r   rX   r'   r  r'   r(   GatingFs  s    r  r%   r  rR   r          Tc                   3   s:    k rs	d S t d d7  V  k sd S d S )Ng{Gz?r
   )r7  sleepr'   )rd   batches_readend
keep_goingr'   r(   counting_generator  s   
z@test_write_dataset_with_backpressure.<locals>.counting_generatorr  c                      s   t jtd dS )Nr   r  )r   rL  rI   r'   )	gating_fsr   rO  r'   r(   r[    s    z6test_write_dataset_with_backpressure.<locals>.<lambda>)r  c                      s   t     S r6   )r7  r'   )startr'   r(   duration   s   z6test_write_dataset_with_backpressure.<locals>.durationFr   r\  )	threadingEventr   rS   rf   re   r+   r*   r,   r   r[   r   rY   r   r   r  r\   Threadr  r7  r  rx   r  )	rO  r  r*   Zmin_backpressurer  Zwrite_threadr  
last_valueZbackpressure_probably_hitr'   )	rd   r  r  r  r  r  r   r  rO  r(   $test_write_dataset_with_backpressure  sJ   	




r  c                 C   s   t g dg dd}tj|| ddgd tj| ddgd}t ,}tj||ddgd tj|ddgd}| }t|	 |	 ksGJ W d    d S 1 sRw   Y  d S )Nr~  rX  rG   rd  r=  rG   r>  )
r+   r4   r   rL  r>   rJ  rK  r   r  r   r  r'   r'   r(   test_write_dataset_with_datasetA  s   

"r  c           	      C   s  | d }t g dg dd}tjt t dt  gdd}dd	 }tj|||d
d t g dg dd}t	t j
 tj|||d
d W d    n1 sTw   Y  t ddgi}|d d }tj|| tj|||d
dd t g dg dd}tj| d
|d }||| | sJ tj|||d
dd t g dg dd}tj| d
|d }||| | rJ d S )Nr   r~  rX  r  rd  r   )r*   r  c                 S   s>   |   djdd}|  djdd}||sJ d S )NrG   Tr  )r  r  r  r   )r  r  Zdf1Zdf2r'   r'   r(   compare_tables_ignoring_orderY  s   zGtest_write_dataset_existing_data.<locals>.compare_tables_ignoring_orderr=  r   r  rY  rG   ezc=2z	foo.arrowoverwrite_or_ignore)r   rV   existing_data_behavior)r  r   rF   rG   rd  )r   r
   r   rD   r   r>  Zdelete_matching)r   rF   rG   rd  rL  )r+   r4   r   r   r*   r,   r.   rL  r   r   r  r   rS  rV  r>   r   exists)	rO  ra   r4   r   r  Zextra_tableZ
extra_fileZoverwrittenZreadbackr'   r'   r(    test_write_dataset_existing_dataR  sV   



r  r   r
   r   c                 C   s   t j||| S r6   )r   rG  randintr  r  r  r'   r'   r(   _generate_random_int_array     r  c                 C   sN   g }g }t | D ]}|t|d|d |dt|  qtj||d}|S )Nr
   r  rd  r%   r<  )r   r   r  rI   r+   r[   )num_of_columnsnum_of_recordsr%   r  r&   r[   r'   r'   r(   _generate_data_and_columns  s   r  c                 C   s   t tt| d| S )Nz**/*.)r   rY   r  r  r  Zbase_directoryr   r'   r'   r(   _get_num_of_files_generated  s   r  c                    s   | d }d d}d}d}t ||}tj||d |d t|}|  d }t||ks.J g }t|D ]\}	}
|t|
 }tj|dd}|	|
 jd	  q4|t|ksXJ |t|ks`J t fd
d|D smJ d S )Nr   r   r   #   r   )rV   max_rows_per_filemax_rows_per_groupr
   r   r   c                 3   s    | ]}| kV  qd S r6   r'   )rJ   Zfile_rowcountr  r'   r(   r)    s    z7test_write_dataset_max_rows_per_file.<locals>.<genexpr>)r  r   rL  r  r  r   rU   rI   r>   r   r   shaper  rA  )rO  ra   r   r  r  r[   files_in_dirZexpected_partitionsZresult_row_combinationr  f_filef_pathr>   r'   r  r(   $test_write_dataset_max_rows_per_file  s2   

r  c                    s   | d }d}d}d g d} fdd|D }|d }t j||||d	d
 t|}t|D ]>\}}	|t|	 }
t j|
d	d}| }| }t|D ] \}}|j	}|t
|d k re||krb||ksdJ qK||kskJ qKq.d S )Nr   r[  r  r   )
r   r   r   r   r   r   r   r   r   r   c                    s   g | ]}t  |qS r'   )r  )rJ   r  r  r'   r(   rL     s
    z9test_write_dataset_min_rows_per_group.<locals>.<listcomp>Zmin_rows_groupr   )r%   r  min_rows_per_groupr   rV   r   r
   )r   rL  r  r  rU   rI   r>   r   r   r   r   )rO  ra   r  r   Zrecord_sizesZrecord_batchesdata_sourcer  r  r  r  r>   r4   batchesr  rd   Zrows_per_batchr'   r  r(   %test_write_dataset_min_rows_per_group  s8   

r  c                 C   s   | d }d}d}d}t ||}|d }tj|||dd t|}g }|D ]"}	|t|	 }
tj|
dd}| }| }|D ]}|	|j
 q>q%|dd	gksPJ d S )
Nr   r6  r      Zmax_rows_groupr   )r%   r  r   rV   r   r  )r  r   rL  r  r  rI   r>   r   r   r   r   )rO  ra   r   r  r  r[   r	  r  Zbatched_datar  r  r>   r4   r
  rd   r'   r'   r(   %test_write_dataset_max_rows_per_group  s.   
r  c                 C   s:  | d }d}d}ddg}t jg dg dg|d}t jg d	g d
g|d}t jg dg dg|d}t jg dg dg|d}t j||||g}	tjt || t  fgdd}
|d }tj|	||
|d dd }|||||\}}||ks{J |d }d}tj|	||
||dd |||||\}}||ksJ d S )Nr   r   r
   c1c2)r
   r   rD   r   r   r   )rF   rG   rd  rc  r  rF   r  )r   r[  r  r  r   r
   )rF   rG   rd  rc  r  rd  )r  r   r  r  r   r
   )rF   rG   rd  rc  r  rc  )r  r  r  r  r   r
   )rF   rG   rd  rc  r  rG   r   r  default)r%   r  r   rV   c                 S   s(   t | |d}ttj|| }||fS )Nr  )r  r   r+   r  unique)r	  r[   r   Zcol_idnum_of_files_generatednumber_of_partitionsr'   r'   r(   _get_compare_pair  s
   z<test_write_dataset_max_open_files.<locals>._get_compare_pairZmax_1rD   F)r%   r  r   rV   max_open_filesr   )	r+   r[   r1   r\   r   r   r*   r0   rL  )rO  ra   r   Zpartition_column_idr  Zrecord_batch_1Zrecord_batch_2Zrecord_batch_3Zrecord_batch_4r4   r   Zdata_source_1r  r  r  Zdata_source_2r  r'   r'   r(   !test_write_dataset_max_open_files  sh   




r  c                 C   s   | d }t |}tj|tjjddd}| d }|d |d d |d |d d g}tjt|jd	gd	t	ddgid
}t
|t||d||d d S )Nr  Tre  r  r  rF   r  rG   r   r6  r   )r  r   r>   r?  r_  r   r+   r*   r,   r   r  rI   )rO  ra   r  r>   r  r  r   r'   r'   r(   #test_write_dataset_partitioned_dictB  s&   

r  c                    s   | d }t |}tj|dd}tjtdt fgdd}| d }g   fdd}tj||d	|d
|d |d d |d d h}tt	t
j }||ksOJ | d }	tj||	d	|dd tj|d	|d}
tj|	d	|d}|
 | sxJ d S )Nr  r   r  r   r  Zpartitioned1c                    s     | j d S r6   )r   rb   Zwritten_filepaths_writtenr'   r(   file_visitori  r  z4test_write_dataset_use_threads.<locals>.file_visitorrS  TrV   r   r   r  r  part-0.featherr  Zpartitioned2Fr  r>  )r  r   r>   r   r+   r*   r0   rL  rx   rZ   r  r  r   r   )rO  ra   r  r>   r   Ztarget1r  r  paths_written_setZtarget2Zresult1Zresult2r'   r  r(   test_write_dataset_use_threads\  s4   

r  c           
   	      s  t jt tdt tjdt tddgdgg dd}| d }tj	||dd	d
 t
|d}|d g}t|t|ksDJ tj|dd }||sTJ | d }|d |d d |d |d d g}g  g  fdd}tjt dt  fgdd}tj	||d	d||d t
|d}t|t|ksJ dd  D }|ksJ tj|d|d}| |sJ t dksJ  D ]}	t|	|v sJ qd S )Nr8  rF   rG   r   r9  r;  singledat_{i}.arrowrS  basename_templaterV   r  zdat_0.arrowr=  r   r  r  r  c                    s     | j  | j d S r6   )r   rb   r  r  Zvisited_pathsZvisited_sizesr'   r(   r    s   z&test_write_table.<locals>.file_visitorr   r   r  )rV   r#  r   r  c                 S   s   g | ]}t j|qS r'   )r  rb   getsizer  r'   r'   r(   rL     s    z$test_write_table.<locals>.<listcomp>r>  r   )r+   r4   r   r   r   rG  rH  rI  r   rL  rY   r  rx   r>   r   r   r   r*   r0   r   r  r  )
rO  r4   r  r  r  r  r  r   Zactual_sizesZvisited_pathr'   r$  r(   test_write_table  sN   

r&  c              	   C   s  t jt tdt tjdt tddgdgg dd}t |gd }| d }t	j
||d	d
 t|dt|d gksEJ t	j|dd
 |sSJ | d }t	j
|g|d	d
 t|dt|d gkspJ t	j|dd
 |s~J | d }t	j
| |d	d
 t|dt|d gksJ t	j|dd
 |sJ | d }t	j
||g|d	d
 t|dt|d gksJ t	j|dd
 t |gd sJ d S )Nr   rF   rG   r   r9  r;  r   r   rS  r   r  r  r=  zsingle-listZmultiplezmultiple-table)r+   r4   r   r   r   rG  rH  rI  r  r   rL  rx   r  r>   r   r   r   )rO  r4   r  r'   r'   r(   #test_write_table_multiple_fragments  s:     

r'  c              	   C   s   t jt tdt tjdt tddgdgg dd}| d }tj	dd	 |
 D ||jd
dd tj|dd }||sGJ | d }t j|j|
 }tj	||d
dd tj|dd }||snJ d S )Nr8  rF   rG   r   r9  r;  Zinmemory_iterablec                 s   s    | ]}|V  qd S r6   r'   )rJ   rd   r'   r'   r(   r)    r  z&test_write_iterable.<locals>.<genexpr>r!  rS  )r*   r#  rV   r=  r   Zinmemory_readerr"  )r+   r4   r   r   r   rG  rH  rI  r   rL  r   r*   r>   r   r   r  r\   )rO  r4   r  r  r   r'   r'   r(   test_write_iterable  s,   
r(  c              	   C   s(  t jt tdt tjdt tddgdgg dd}t	|}| d }tj
|||dd	 |tj	|d
d	}||sFJ | d }tj
|j|dgd|dd	 |tj	|d
d	}||dgsmJ tjtdd tj
||||jdd W d    d S 1 sw   Y  d S )Nr8  rF   rG   r   r9  r;  Zdataset_from_scannerrS  r   r=  Zdataset_from_scanner2r   r   zCannot specify a schemar   )r*   rV   )r+   r4   r   r   r   rG  rH  rI  r   r>   rL  r   r   r   r  r   r   r  r*   )rO  r   r4   r>   r  r  r'   r'   r(   test_write_scanner  s4   
"r)  c                 C   s   t jt tdt tddgd gddgd}t|	dgj
}| d }tj||d	|d
 tjjdgdd}tj|d|d
 }||sLJ d S )Nr8  rF   rG   r   r  r   r;  r>   rS  r>  Tre  r=  )r+   r4   r   r   r   rI  r  r   r   r  r*   rL  r   r_  r>   r   r   )rO  r4   r   r  Zpartitioning_readr  r'   r'   r(   !test_write_table_partitioned_dict  s(   r*  c              	   C   s  t jt jtdddt tjdddddt tdd	gd
gg dd}| d }tj	||dd t
|d}|d g}t|t|ksJJ tj|dd }||sZJ dD ]w}t }|j|d}dt|v spJ | d| }tj	||||d t|d }	|dkrdnd}
|	j|
ksJ tj|dd }|j}|dkr|d|dt  }|dv r|d|dt d}||}||sJ q\d S )Nr8  r  r   r!  zdatetime64[D]r  zdatetime64[ns]rF   rG   r   r9  r;  r  r   r   r  part-0.parquet)1.02.42.6r  z(<pyarrow.dataset.ParquetFileWriteOptionszparquet_dataset_version{0}r  r,  r.  r   )r,  r-  r
   r  )r+   r4   r   r   r   r  r  rI  r   rL  rY   r  rx   r>   r   r   r   rK  rT  rV   r]   read_metadataformat_versionr*   r,   Z	with_typer.   r*  r]  )rO  r4   r  r  r  r  r  rV   optsmetaZexpected_versionr*   r!  r'   r'   r(   test_write_dataset_parquet  sD   	

r3  c              	   C   s
  t jt tdt tjdt tddgdgg dd}| d }tj	||dd	 t
|d
}|d g}t|t|ksCJ tj|dd	 }||sSJ tjtjj|jjdd}|jdd}| d }tj	||||d tj||d	 }||sJ d S )Nr8  rF   rG   r   )r   r:  Zchr1r;  Zcsv_datasetr  r   r  z
part-0.csvrb  r  F)Zinclude_headerZcsv_dataset_noheaderr  )r+   r4   r   r   r   rG  rH  rI  r   rL  rY   r  rx   r>   r   r   r  r   r  r  r*   r<  rK  )rO  r4   r  r  r  r  rV   r1  r'   r'   r(   test_write_dataset_csvK  s*   


r4  c              	      sx   t jt tdt tjdt tddgdgg dd}d  fdd	}| d
 }tj	||d|d  s:J d S )Nr8  rF   rG   r   r9  r;  Fc                    s&   | j d ur| j jdkrd d S d S d S )NrD   T)r  r  r  Zvisitor_calledr'   r(   r  n  s
   
z=test_write_dataset_parquet_file_visitor.<locals>.file_visitorr  r   )rV   r  )
r+   r4   r   r   r   rG  rH  rI  r   rL  )rO  r4   r  r  r'   r5  r(   'test_write_dataset_parquet_file_visitore  s   
r6  c           	         s   dd t dD }dd t dD }t||tddgdd}| d	 }tjtd
t fgdd}g  d  fdd}tj	||d|d|d |d d |d d h}t
ttj }||ksdJ d usjJ jdksqJ d S )Nc                 S   s    g | ]}|gd  D ]}|q	qS r   r'   rJ   r   itemr'   r'   r(   rL   }  s     z?test_partition_dataset_parquet_file_visitor.<locals>.<listcomp>r   c                 S   s$   g | ]}|gd  D ]}|d  q	qS r7  r'   r8  r'   r'   r(   rL   ~  s   $ rF   rG   r8  r9  r  r   r   r  c                    s   | j r| j  | j d S r6   )r  r   rb   r  r  Zsample_metadatar'   r(   r    s   zAtest_partition_dataset_parquet_file_visitor.<locals>.file_visitorr   Tr  r  r+  r  r   )r   r+   r4   r   rI  r   r   r*   r0   rL  rx   rZ   r  r  r  )	rO  Zf1_valsZf2_valsr4   r  r   r  r  r  r'   r:  r(   +test_partition_dataset_parquet_file_visitor{  s.   

r;  c                 C   sd   t dtjdddgi}|d jjdksJ tj|| dd t	| d }|d jjdks0J d S )NrF   r!  zEurope/Brussels)tzr   r   r+  )
r+   r4   r   r#  r   r<  r   rL  r]   r  )rO  r4   r  r'   r'   r(   (test_write_dataset_arrow_schema_metadata  s
   r=  c                 C   sb   ddl m} tdg di}|ddi}tj|| dd || d	 j}|j	ddiks/J d S )
Nr   r  rF   rX  r     valuerS  r   r  )
r   rS  r+   r4   r2   r   rL  r  r*   r  )rO  rS  r4   r*   r'   r'   r(   "test_write_dataset_schema_metadata  s   r?  c                 C   sV   t dg di}|ddi}tj|| dd t| d j}|jddiks)J d S )NrF   rX  r  r>  r   r   r+  )	r+   r4   r2   r   rL  r]   r  r*   r  )rO  r4   r*   r'   r'   r(   *test_write_dataset_schema_metadata_parquet  s
   r@  c              	   C   sB  | \}}}}}}}}d ||||}tjttdttjdttddgdgg dd}t	j
tdt fgd	d
}	t	j|d|d|	d t	jd|dd	d }
|
|s^J | d}t	j||d|	d t	jd|dd	d }
|
|s~J | d}t	j|d|d|	d t	jd|dd	d }
|
|sJ d S )Nr  r8  rF   rG   r   r9  r;  r   r   r  zmybucket/datasetrS  r/  r=  zmybucket/dataset2r>  r  r  zmybucket/dataset3)rV   r+   r4   r   r   r   rG  rH  rI  r   r   r*   r0   rL  r>   r   r   )r  r  rS   r  r  r  r  Zuri_templater4   r   r  r  r'   r'   r(   test_write_dataset_s3  sP   


rA  aC  {
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Action": [
                "s3:PutObject",
                "s3:ListBucket",
                "s3:GetObjectVersion"
            ],
            "Resource": [
                "arn:aws:s3:::*"
            ]
        }
    ]
}c           	   	   C   s  ddl m} | d \}}}}|ddd||dd}t| t tjttd	tt	j
d	tt	d
dgdgg dd}tjtdt fgdd}tj|d|dd|dd tjd|ddd }||smJ tj|d|dd|dd tjd|ddd }||sJ tjtdd tj|d|dddd W d    n1 sw   Y  |ddd||ddd}tjtdd tj|d|dddd W d    d S 1 sw   Y  d S ) Nr   )r  r  ZlimitedZ
limited123z{}:{}http)r  r  endpoint_overrideschemer8  rF   rG   r   r9  r;  r   r   r  zexisting-bucketrS  Fr  )r   rV   rW   r   r  r=  r/  Tz&Bucket 'non-existing-bucket' not foundr   znon-existing-bucket)r   rV   rW   r  )r  r  rC  rD  Zallow_bucket_creationzAccess Denied)rw   r  rV   r   _minio_put_only_policyr+   r4   r   r   r   rG  rH  rI  r   r   r*   r0   rL  r>   r   r   r   r   r  )	r  r  r  r  r  rS   r4   r   r  r'   r'   r(   test_write_dataset_s3_put_only  sz   

	
"rF  c              
   C   s   t dd d gi}t|| d  t t dt t  t  g}t	j
j| d g|t	 t d}||}|j|ks@J d S )NrF   r  )rn   r*   rV   r   )r+   r4   r]   r^   r*   r,   rE  r   r0   r   r   r   r   rS   re   r   )rO  r   r4   r*   Zfsdsr'   r'   r(   $test_dataset_null_to_dictionary_castR  s   
rG  c                 C   s   t g dg dd}tj|| d dd tj| d dd}t g dg dd	}tj|| d
 dd tj| d
 dd}||dd}| t g dg dg ddksZJ |j|dddd}| dt g dg dg ddks{J d S )Nr
   r   r[  rF   rG   r`  colAr	  r  r=  r   c   r   r
   Zr  r  )colBcol3r  rK  rP  r  r  NrK  r	  rQ  
full outer)	join_typer
   r   r[  rM  rF   rG   r`  Nr  r  NrO  r+   r4   r   rL  r>   r  r   r   rO  r  ds1r  ds2r  r'   r'   r(   test_dataset_joine  s0   
r]  c                 C   s   t g dg dd}tj|| d dd tj| d dd}t g dg dd	}tj|| d
 dd tj| d
 dd}||d}| t g dg dg ddksYJ |j|dddd}| dt g dg dg ddkszJ d S )NrH  rI  rJ  r  r=  r   rL  rN  )rK  rQ  r  rK  rR  rS  rT  _rrU  Zright_suffixrV  rW  rX  rY  rZ  r'   r'   r(   test_dataset_join_unique_key  s0   
r`  c                 C   s   t g dg dg dd}tj|| d dd tj| d dd}t g dg d	g d
d}tj|| d dd tj| d dd}|j|dddd}| dt jg dg dg dg dg dgg ddksnJ d S )NrH  )r   r8  <   rI  )rK  rP  colValsr  r=  r   rL  )rM  r8  r   rN  r  rK  rT  r^  r_  rV  )r   r8  ra  NrW  )r   r8  NrM  rX  )rK  rP  rb  ZcolB_rZ	colVals_rr;  rY  rZ  r'   r'   r(   test_dataset_join_collisions  s0   rc  dstyperS   memc                 C   s  t g dg dd}|dkr$tj|| d dd tj| d dd}n|dkr.t|}nt|td	d
k tddk}|dkrItj	ntj
}t||sSJ | t dgdgdkscJ |dt dgdgdkstJ |td	dk td	dkjtd	dkd}| t dgdgdksJ tj|| d dd tj| d dd}| t dgdgdksJ |jtt ddgddgdddd}| dt dd gddgddgdksJ tt |d  W d    n	1 sw   Y  tt |  W d    n	1 sw   Y  |jd}	|td	d
k |	}
|
 t d	ddgiksGJ tt j ||	  W d    d S 1 saw   Y  d S )Nr
   r   r[  r  rF   rG   r`  grJ  rS   r  r=  r   re  rK  rD   r	  rF   r
   r   r  r[  r   r   rG   r  r   r8  rP  r	  zright outerkeysrU  rP  )rK  rP  r	  )r+   r4   r   rL  r>   rP  r   r*  r,   r   r  r9   r   r   r   r  r   r   r   r   r  r7   r*   r  Zreplace_schemar  )rO  rd  r  r[  r  r!  r2r  joinedZschema_without_col2Z	newschemar'   r'   r(   test_dataset_filter  s   $




$rn  c           
      C   s  t g dg dd}t g dg dd}|dkrCtj|| d dd	 tj| d dd	}tj|| d
 dd	 tj| d
 dd	}n|dkrRt|}t|}ntt||ftddk tddkB }|	 t g dg ddks|J |j
tt ddgddgdddd}|	 dt g dg dg ddksJ |tddk }|tddk }	tjtdd t||	f W d    d S 1 sw   Y  d S )Nrf  rg  rJ  )r  r   r  )hr&   lrS   r  r=  r   r  re  rK  rD   r  )r
   r   r  )rF   rG   ro  r   r8  rF   rG   ri  r	  z
left outerrj  )r   r8  N)rK  r	  rP  zcurrently not supportedr   )r+   r4   r   rL  r>   rP  r   r*  r,   r   r  r   r   r   r  )
rO  rd  r  r  r[  r\  Zfiltered_union_dsrm  Zfiltered_ds1Zfiltered_ds2r'   r'   r(   test_union_dataset_filter  sP   

"rq  c                 C   s   | d }t |\}}t|}| }|jdksJ |tddk }| jdks-J t	t
 |  W d    d S 1 sBw   Y  d S )Ntest_parquet_dataset_filterr  r   r   r8  )r  r   r  r   r   r   r*  r,   r   r   r  r7   )rO  r  r  r  r>   r  Zfiltered_dsr'   r'   r(   rr  @  s   

"rr  c                 C   s   t jt tdgdgd}t|}dtdi}|j|d}tj|| dgdd t	j
tdd	 tj|| dgdd W d
   d
S 1 sGw   Y  d
S )z
    Ensure the projected schema is used to validate partitions for scanner

    https://issues.apache.org/jira/browse/ARROW-17228
    r8  Zoriginal_columnr;  Zrenamed_columnr   r=  r   z0'Column original_column does not exist in schemar   N)r+   r4   r   r   r   r>   r,   r   rL  r   r   KeyError)rO  r4   Ztable_datasetr   r   r'   r'   r(   4test_write_dataset_with_scanner_use_projected_schemaO  s    



"rt  rV   )r=  r   c              
   C   s   |dkr	t d tddgddgd dddgdd	id gd
ddg dddigd
gd}tj|| d |d tj| d |d}|jg dd}| dd ddgd d	dd gddddg ddd dgddgkskJ d S )Nr   zpyarrow.parquetZabc123Zqrs456r
   r   Zbuttonr  ra  )r   elementvaluesstructsscrollZwindow)NrD   r   fizzZbuzz)user_ida.dotted.fieldZinteractionr4   r   )rz  zinteraction.typezinteraction.valueszinteraction.structsr{  r   )ry  r  )rz  r   rv  rw  r{  )	r   r  r+   r4   r   rL  r>   r   r  )rO  rV   r4   r[  r'   r'   r(   test_read_table_nested_columnsh  s2   



r|  c                 C   s   ddl m} | d }tjtg dt tg dt gddg}|j||ddgd	d
 |j|dd	t	t
dt t
dt gd  }||dksWJ |d }tt|}dd |D }tt|}||ksxJ d S )Nr   r  zslash-writer-xr
   r   rD   r   r   )experiment/A/f.csvzexperiment/B/f.csvr~  zexperiment/C/k.csvzexperiment/M/i.csvZexp_idexp_metar=  r   )r%   r  rV   r   r  )r  rV   r   r*   r
   c                 S   s   g | ]
}d t |dd qS )z	exp_meta=r  r  r   r  r'   r'   r(   rL     r)  z5test_dataset_partition_with_slash.<locals>.<listcomp>)r   r>   r+   r1   r  r   r   r"  rL  r*   r,   r   rf  r   r   r  r   rx   r  r  )Ztmpdirr   rb   Zdt_tabler  r  Zencoded_pathsr  r'   r'   r(   !test_dataset_partition_with_slash  sB   
r  c                 C   s   t t jdt  ddt jdt  ddg}g dg dg}t jj||d}t|| d	  tj	| d	 d
d}|
 j|sBJ tj|| d d
d tj	| d d
d}|
 j|s_J tj||g| d d
d tj	| d d
d}|
 j|s~J d S )Nr   F)Znullabler}  TrX  Nr   NrR   Z	nulltest1r   r   Z	nulltest2Z	nulltest3)r+   r*   r,   r.   r1   r  r]   r  r   r>   r   r   rL  )rO  Zschema_nullabler  r4   r>   r'   r'   r(   'test_write_dataset_preserve_nullability  s   r  c                 C   sP  t t jdt  ddidt dt  g}t t dt  t dt  g}g dg dg}t jj||d}t jj||d}tj||g| d	 d
d tj| d	 d
d}|	 jj
|ddscJ tj||g| d d
d tj| d d
d}|	 jj
|ddsJ tj||g| d d
|d tj| d d
d}|	 jj
|ddsJ d S )Nr   s   foos   barr  r}  rX  r  rR   Ztest1r   r   Tr  Ztest2Ztest3r  )r+   r*   r,   r.   r1   r  r   rL  r>   r   r   )rO  Zschema_metadataZschema_no_metar  r4   Ztable_no_metar>   r'   r'   r(   *test_write_dataset_preserve_field_metadata  s,   r  c              
   C   s   dD ]n}dD ]i}t t dt  t dt  g}g dg dg}t jj||d}t }| d|  }tj||d|j	||d	d
d tj
|dd}|jD ]}	t|	}
|
dd}|j|u seJ |j||@ u snJ qOqqd S )N)TFr   r}  rX  r  rR   Zwrite_page_index_r   )write_statisticswrite_page_indexr  )rV   r  r  r   r   )r+   r*   r,   r.   r1   r  r   r   rL  rK  r>   r   r]   r/  r  r   Zhas_offset_indexZhas_column_index)rO  r  r  r*   r  r4   r   r  r[  r  r  ccr'   r'   r(   #test_write_dataset_write_page_index  s:   


r  c                 C   s  t jt g dt g dgddgd}|dkr-tj|| d dd	 tj| d dd	}n|d
kr7t|}nt|d 	 g dg ddksMJ |dg 	 g dg ddksbJ |
tddk d 	 g dg ddks~J t jjt jg dt  dt g dgddgd}t|}|dg}| 	 }|d g dksJ |d g dksJ |dg}| 	 }|d g dksJ |d g dksJ d S )N)rD   r
   r   r   r   )rG   rF   rG   rF   rd  rv  rk  r;  rS   r  r=  r   re  )rF   rF   rG   rG   rd  r}  )rk  rv  )rv  
descending)rd  rG   rG   rF   rF   )r   r   rD   r   r
   r   )rF   rF   rG   rX  )r   r  r  r  r   )r  carra  foobarrF   rG   )rF   r  )r  r  r  r   )r  r  ra  r  )rF   r  )r+   r4   r   r   rL  r>   rP  r   r   r   r   r*  r,   r1   r  r.   )rO  rd  r4   r   Z
sorted_tabZsorted_tab_dictr'   r'   r(   test_dataset_sort_by  sV   
r  c                 C   s  t dg di}t j }|jdd}| d }tj||||d tjdd}t jj|d}tj||d	 }||ks=J | d
 }t	|| t
| }	t|	dksTJ |	d }
t|
 }|d |d kshJ |d |d |d< |d< |
| tjdd}t jj|d}tj||d	 }||ksJ |t dg diksJ tjtdd tj||d	 }W d   dS 1 sw   Y  dS )zwCheck that checksum verification works for datasets created with
    ds.write_dataset and read with ds.dataset.to_tablerF   rL  T)Zwrite_page_checksumZcorrect_dir)r%   r  rV   r  r|  )Zdefault_fragment_scan_optionsr   Zcorrupted_dirr
   r      $   F)r
   rD   r   r   zCRC checksum verificationr   N)r+   r4   r>   r   rK  r   rL  r  r   r   rY   iterdirr   	bytearray
read_byteswrite_bytesr   r   r  )rO  Z
table_origZpq_write_formatr  Zoriginal_dir_pathZpq_scan_opts_crcZpq_read_format_crcZtable_checkZcorrupted_dir_pathZcorrupted_file_path_listZcorrupted_file_pathZbin_dataZpq_scan_opts_no_crcZpq_read_format_no_crcZtable_corruptr  r'   r'   r(   1test_checksum_write_dataset_read_dataset_to_table1  sn   


"r  r  r6   )r   r
   r   )rz   r   r  r  r  sysrJ  r  r  r7  shutilr   urllib.parser   numpyr   r   r   r+   r  r  r*  Zpyarrow.csvr  rw   rS   Zpyarrow.jsonZpyarrow.tests.utilr   r   r   r   r	   r   r   r  r  r>   r   Zpyarrow.parquetr   r]   markZ
pytestmarkr)   r5   rA   Zfixturer_   r|   r   r   r   r   r  r	  r  r  r  r"  r#  r'  rR  rW  r^  rf  rq  rt  r  r  r  Zparametrizer   r  r  Zs3r  r  r  r  r  r  r  r  r  r  r  r  r  r+  r8  r9  r=  r?  r@  rE  rF  rG  rQ  rW  rc  ri  ry  r{  r  r  r  r  r  r  r  r  r  r  r  r  rU  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r	  r  r  r  r   r$  r&  r'  r,  r-  r2  r4  r5  r:  r=  r@  rB  rG  rH  rJ  rL  rP  r]  rc  rd  rg  rp  rs  rt  r~  r  r  r  r  r  Zskipifplatformr  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r&  r'  r(  r)  r*  r3  r4  r6  r;  r=  r?  r@  rA  rE  rF  rG  r]  r`  rc  rn  rq  rr  rt  r|  r  r  r  r  r  r  r'   r'   r'   r(   <module>   s`  
"

 
1

.
G
;

0



y


4'H
8
%
 
<


(
8*




)



## 
d
U%












	B

$



	9

B






B 9& /
'=#K0$#D&/$+#
/J

N-%
 0