o
    ީZhR[                     @   s   d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
mZmZmZ ddlmZ ddlmZmZmZmZmZmZ dZdZdZdZeeeeef f Zed	eeef f Zer^dd
lmZ eefdedededefddZ efdedededefddZ!dedededededefddZ"efdede#defddZ$efdede#defdd Z%	d6dedededefd!d"Z&d#ede	e fd$d%Z'd&e	e de	e	e  fd'd(Z(G d)d* d*e)Z*G d+d, d,e*Z+G d-d. d.e)Z,g d/Z-g d0Z.G d1d2 d2e/Z0e0d Z1eG d3d	 d	Z2G d4d5 d5e)Z3dS )7    N)	dataclass)
itemgetter)TYPE_CHECKINGAnyDictListOptionalSetTupleUnion   )utils)T_bboxT_numT_obj
T_obj_iter
T_obj_listT_point   TableSettings)Pageedgesx_tolerancey_tolerancereturnc                 C   sR   g g d}| D ]}||d   | qt|d d|}t|d d|}|| S )zs
    Given a list of edges, snap any within `tolerance` pixels of one another
    to their positional average.
    vhorientationr   x0r   top)appendr   Zsnap_objects)r   r   r   Zby_orientationeZ	snapped_vZ	snapped_h r#   G/var/www/html/lang_env/lib/python3.10/site-packages/pdfplumber/table.py
snap_edges   s   
	r%   r   	tolerancec           	      C   s   |dkr	d\}}n|dkrd\}}nt dtt| t|d}|d g}|dd	 D ])}|d
 }|| || | krO|| || krNt|||| |d
< q+|| q+|S )z
    Given a list of edges along the same infinite line, join those that
    are within `tolerance` pixels of one another.
    r   )r   x1r   )r    bottomzOrientation must be 'v' or 'h'keyr   r   N)
ValueErrorlistsortedr   r   Zresize_objectr!   )	r   r   r&   Zmin_propZmax_propZsorted_edgesZjoinedr"   lastr#   r#   r$   join_edge_group'   s   


r0   snap_x_tolerancesnap_y_tolerancejoin_x_tolerancejoin_y_tolerancec           	         sv   dt dtttf fdd}|dks|dkrt| ||} t| |d}tj||d} fdd|D }ttj	| } | S )	z|
    Using the `snap_edges` and `join_edge_group` methods above,
    merge a list of edges into a more "seamless" list.
    edger   c                 S   s$   | d dkrd| d fS d| d fS )Nr   r   r    r   r   r#   )r5   r#   r#   r$   	get_groupP   s   zmerge_edges.<locals>.get_groupr   r)   c                 3   s4    | ]\}}t ||d  |d  dkr nV  qdS )r   r   N)r0   ).0kitemsr3   r4   r#   r$   	<genexpr>[   s    
zmerge_edges.<locals>.<genexpr>)
r   r
   strr   r%   r.   	itertoolsgroupbyr-   chain)	r   r1   r2   r3   r4   r6   _sortedZedge_groupsZedge_genr#   r:   r$   merge_edgesD   s   rA   wordsword_thresholdc           	   
      s   t | tdd}t fdd|}ttt j|}t|dkr"g S tttd|}t	ttd|}g }|D ]"}||||d |d || dd	|||d
 |d
 || dd	g7 }q8|S )zi
    Find (imaginary) horizontal lines that connect the tops
    of at least `word_threshold` words.
    r    r   c                       t |  kS NlenxrC   r#   r$   <lambda>m       z"words_to_edges_h.<locals>.<lambda>r   r   r'   r   )r   r'   r    r(   widthr   r(   )
r   cluster_objectsr   filterr-   mapZobjects_to_rectrG   minmax)	rB   rC   Zby_toplarge_clustersZrectsZmin_x0max_x1r   rr#   rJ   r$   words_to_edges_he   s4   rV   c                    sF  t | tdd}t | tdd}dtdtfdd}t | |d}|| | }t|dd	 d
}tfdd	|}ttt j	|}	g }
|	D ] t
 fdd|
D }|sY|
  qEt|
dkrbg S tt j|
}tt|tdd
}tttd|}tttd|tttd|fdd|D || ddg S )zy
    Find (imaginary) vertical lines that connect the left, right, or
    center of at least `word_threshold` words.
    r   r   r'   wordr   c                 S   s   t | d | d  d S )Nr   r'      )float)rW   r#   r#   r$   
get_center   s   z$words_to_edges_v.<locals>.get_centerc                 S   s
   t |  S rE   rF   rH   r#   r#   r$   rK      s   
 z"words_to_edges_v.<locals>.<lambda>r)   c                    rD   rE   rF   rH   rJ   r#   r$   rK      rL   c                 3   s    | ]	}t  |V  qd S rE   )r   Zget_bbox_overlapr7   cbboxr#   r$   r;      s    z#words_to_edges_v.<locals>.<genexpr>r   r    r(   c              	      s*   g | ]}|d  |d     ddqS )r   r   r   r'   r    r(   heightr   r#   )r7   b)
max_bottommin_topr#   r$   
<listcomp>   s    	z$words_to_edges_v.<locals>.<listcomp>r   r_   )r   rN   r   r   r   r.   rO   r-   rP   Zobjects_to_bboxanyr!   rG   Zbbox_to_rectrR   rQ   )rB   rC   Zby_x0Zby_x1rZ   Z	by_centerZclustersZsorted_clustersrS   ZbboxesZcondensed_bboxesoverlapZcondensed_rectsZsorted_rectsrT   r#   )r^   rb   rc   rC   r$   words_to_edges_v   sB   
	rg   c           	         s   i } fdddD \}}t |tdddD ][}t |tdddD ]O}|d |d | krp|d |d | krp|d |d | krp|d |d | krp|d |d f}||vr^g g d||< || d	 | || d
 | q!q|S )zi
    Given a list of edges, return the points at which they intersect
    within `tolerance` pixels.
    c                    s"   g | ] t t fd dqS )c                    s   | d  kS )Nr   r#   rH   or#   r$   rK      rL   z3edges_to_intersections.<locals>.<listcomp>.<lambda>)r-   rO   )r7   r   rh   r$   rd      s    z*edges_to_intersections.<locals>.<listcomp>r   r   r    r)   r(   r'   r   r   )r.   r   r!   )	r   r   r   intersectionsZv_edgesZh_edgesr   r   Zvertexr#   rj   r$   edges_to_intersections   s$   

rl   rk   c                    s   dt dt dtffdd tt tdtt  dtdtt	 f fdd	fd
dt
tD }ttd|S )a8  
    Given a list of points (`intersections`), return all rectangular "cells"
    that those points describe.

    `intersections` should be a dictionary with (x0, top) tuples as keys,
    and a list of edge objects as values. The edge objects should correspond
    to the edges that touch the intersection.
    p1p2r   c                    s   dt dtt fdd}| d |d kr*| |  d | | d }t|r*dS | d |d krI| |  d	 | | d	 }t|rIdS d
S )Nr   r   c                 S   s   t ttj| S rE   )setrP   r   Zobj_to_bboxrj   r#   r#   r$   edges_to_set   s   zCintersections_to_cells.<locals>.edge_connects.<locals>.edges_to_setr   r   Tr   r   F)r   r	   r   intersectionrG   )rm   rn   rp   common)rk   r#   r$   edge_connects   s   z-intersections_to_cells.<locals>.edge_connectspointsic                    s   |d krd S | |  | |d d  } fdd|D } fdd|D }|D ];} |s0q(|D ]0} |s:q2|d |d f}|v rb||rb||rb d  d |d |d f    S q2q(d S )Nr   c                        g | ]}|d   d  kr|qS r   r#   r7   rI   ptr#   r$   rd          zFintersections_to_cells.<locals>.find_smallest_cell.<locals>.<listcomp>c                    rv   r   r#   rx   ry   r#   r$   rd     r{   r   r#   )rt   ru   restZbelowrightZbelow_ptZright_ptbottom_right)rs   rk   n_pointsry   r$   find_smallest_cell
  s,   

$z2intersections_to_cells.<locals>.find_smallest_cellc                 3   s    | ]} |V  qd S rE   r#   )r7   ru   )r   rt   r#   r$   r;   %  s    z)intersections_to_cells.<locals>.<genexpr>N)r   boolr-   r.   keysrG   r   intr   r   rangerO   )rk   Zcell_genr#   )rs   r   rk   r   rt   r$   intersections_to_cells   s   
&r   cellsc                    s0  dt dtttttf fdd}t| }t  g }g }t|r|t|}t|D ]<}||}t|dkrC t|O  || || q&t fdd|D }|dkrb t|O  || || q&t||krx|t|  	  |	  t|st|r|t| t
|dd	 d
}	dd |	D }
|
S )z
    Given a list of bounding boxes (`cells`), return a list of tables that
    hold those cells most simply (and contiguously).
    r^   r   c                 S   s(   | \}}}}||f||f||f||ffS rE   r#   )r^   r   r    r'   r(   r#   r#   r$   bbox_to_corners/  s   z(cells_to_tables.<locals>.bbox_to_cornersr   c                 3   s    | ]}| v V  qd S rE   r#   r[   Zcurrent_cornersr#   r$   r;   H  s    z"cells_to_tables.<locals>.<genexpr>c                 S   s   t dd | D S )Nc                 s   s     | ]}|d  |d fV  qdS )r   r   Nr#   r[   r#   r#   r$   r;   a  s    z4cells_to_tables.<locals>.<lambda>.<locals>.<genexpr>)rQ   )tr#   r#   r$   rK   a  s    z!cells_to_tables.<locals>.<lambda>r)   c                 S   s   g | ]
}t |d kr|qS r|   rF   )r7   r   r#   r#   r$   rd   b      z#cells_to_tables.<locals>.<listcomp>)r   r
   r   r-   ro   rG   r!   removesumclearr.   )r   r   Zremaining_cellsZcurrent_cellstablesZinitial_cell_countcellZcell_cornersZcorner_countr@   filteredr#   r   r$   cells_to_tables)  s:   


r   c                   @   s"   e Zd Zdeee  fddZdS )	CellGroupr   c              	   C   sh   || _ tttdtd |tttdtd |tttdtd |tttdtd |f| _d S Nr   r   rX   r   )r   rQ   rP   r   rO   rR   r^   )selfr   r#   r#   r$   __init__g  s   
zCellGroup.__init__N)__name__
__module____qualname__r   r   r   r   r#   r#   r#   r$   r   f  s    r   c                   @      e Zd ZdS )RowNr   r   r   r#   r#   r#   r$   r   q      r   c                   @   sh   e Zd Zdddee fddZedefddZedee fd	d
Z	de
deeee   fddZdS )Tablepager   r   c                 C   s   || _ || _d S rE   )r   r   )r   r   r   r#   r#   r$   r   v  s   
zTable.__init__r   c                 C   sJ   | j }tttd|tttd|tttd|tttd|fS r   )r   rQ   rP   r   rR   )r   r\   r#   r#   r$   r^   z  s   z
Table.bboxc                    s   t | jtddd}tt tttd| j}g }t|tdD ]\}}dd |D  t fdd|D }|	| q"|S )Nr   r   r)   c                 S   s   i | ]}|d  |qS rw   r#   )r7   r   r#   r#   r$   
<dictcomp>      zTable.rows.<locals>.<dictcomp>c                    s   g | ]}  |qS r#   )getrx   Zxdictr#   r$   rd     r   zTable.rows.<locals>.<listcomp>)
r.   r   r   r-   ro   rP   r=   r>   r   r!   )r   r@   Zxsrowsy	row_cellsrowr#   r   r$   r     s   z
Table.rowskwargsc                    s   | j j}g }dtdtdtfdd| jD ][g }fdd|D }jD ]D  d u r.d }n6 fdd|D }t|rbd	|v rX d
  d  |d<  d  d  |d<  |d< tj	|fi |}nd}|
| q%|
| q|S )Ncharr^   r   c                 S   sX   | d | d  d }| d | d  d }|\}}}}t ||ko*||k o*||ko*||k S )Nr    r(   rX   r   r'   )r   )r   r^   Zv_midZh_midr   r    r'   r(   r#   r#   r$   char_in_bbox  s   z#Table.extract.<locals>.char_in_bboxc                    s   g | ]
} |j r|qS r#   r]   r7   r   )r   r   r#   r$   rd     r   z!Table.extract.<locals>.<listcomp>c                    s   g | ]	}| r|qS r#   r#   r   )r   r   r#   r$   rd     s
    
ZlayoutrX   r   Zlayout_widthr   r   Zlayout_heightZlayout_bbox )r   charsr   r   r   r   r   rG   r   Zextract_textr!   )r   r   r   Z	table_arrZarrZ	row_charsZ	cell_textZ
cell_charsr#   )r   r   r   r$   extract  s,   

zTable.extractN)r   r   r   r   r   r   propertyr^   r   r   r   r   r<   r   r#   r#   r#   r$   r   u  s    	"
r   )lineslines_stricttextexplicit)snap_tolerancer1   r2   join_tolerancer3   r4   edge_min_lengthmin_words_verticalmin_words_horizontalintersection_toleranceintersection_x_toleranceintersection_y_tolerancec                   @   r   )
UnsetFloatNr   r#   r#   r#   r$   r     r   r   c                   @   s*  e Zd ZU dZeed< dZeed< dZee	e
eef   ed< dZee	e
eef   ed< eZeed< eZeed< eZeed	< eZeed
< eZeed< eZeed< dZeed< eZeed< eZeed< dZeed< eZeed< eZeed< dZ ee!ee"f  ed< dddZ#e$dee% dd fddZ&dS )r   r   vertical_strategyhorizontal_strategyNexplicit_vertical_linesexplicit_horizontal_linesr   r1   r2   r   r3   r4   r   r   r   r   r   r   r   text_settingsr   c                 C   s   t D ]}t| |p
ddk rtd| dqdD ]}t| |d }|tvr2t| ddt dq| jd	u r;i | _d
D ]}|| jvrN| jdd| j|< q=d| jv rX| jd= dD ]\}}t| |tu rnt| |t| | qZ| S )a  Clean up user-provided table settings.

        Validates that the table settings provided consists of acceptable values and
        returns a cleaned up version. The cleaned up version fills out the missing
        values with the default values in the provided settings.

        TODO: Can be further used to validate that the values are of the correct
            type. For example, raising a value error when a non-boolean input is
            provided for the key ``keep_blank_chars``.

        :param table_settings: User-provided table settings.
        :returns: A cleaned up version of the user-provided table settings.
        :raises ValueError: When an unrecognised key is provided.
        r   zTable setting 'z' cannot be negative)
horizontalvertical	_strategyz_strategy must be one of{,}N)r   r   r&   r   ))r1   r   )r2   r   )r3   r   )r4   r   )r   r   )r   r   )	NON_NEGATIVE_SETTINGSgetattrr,   TABLE_STRATEGIESjoinr   r   UNSETsetattr)r   Zsettingr   strategyattrfallbackr#   r#   r$   __post_init__  s4   


zTableSettings.__post_init__settingsc                 C   s   |d u r|  S t || r|S t |tr@i }i }| D ]\}}|d d dkr0|||dd  < q|||< q||d< | di |S td| )N   Ztext_r   zCannot resolve settings: r#   )
isinstancedictr9   r,   )clsr   Zcore_settingsr   r8   r   r#   r#   r$   resolve  s   


zTableSettings.resolve)r   r   )'r   r   r   r   r<   __annotations__r   r   r   r   r   r   r   r   DEFAULT_SNAP_TOLERANCEr   r   r1   r2   DEFAULT_JOIN_TOLERANCEr   r3   r4   r   DEFAULT_MIN_WORDS_VERTICALr   r   DEFAULT_MIN_WORDS_HORIZONTALr   r   r   r   r   r   r   r   classmethodT_table_settingsr   r#   r#   r#   r$   r     s*   
 
5c                   @   s6   e Zd ZdZddddee fddZdefd	d
ZdS )TableFindera0  
    Given a PDF page, find plausible table structures.

    Largely borrowed from Anssi Nurminen's master's thesis:
    http://dspace.cc.tut.fi/dpub/bitstream/handle/123456789/21520/Nurminen.pdf?sequence=3

    ... and inspired by Tabula:
    https://github.com/tabulapdf/tabula-extractor/issues/16
    Nr   r   r   c                    s^   | _ t| _   _t j jj jj _	t
 j	 _ fddt jD  _d S )Nc                    s   g | ]}t  j|qS r#   )r   r   )r7   Z
cell_groupr   r#   r$   rd   @  s    z(TableFinder.__init__.<locals>.<listcomp>)r   r   r   r   	get_edgesr   rl   r   r   rk   r   r   r   r   )r   r   r   r#   r   r$   r   6  s   

zTableFinder.__init__r   c              
   C   s  | j }dD ]'}t||d }|dkr,t|d| d }t|dk r,td| d| d	q|j}|j}|d
ks;|d
krG| jjdi |jpDi }g }|j	pMg D ]9}	t
|	trit|	D ]}
|
d dkrg||
 qZqN||	|	| jjd | jjd | jjd | jjd  dd qN|dkrt| jjd}n!|dkrtj| jjddd}n|d
krt||jd}n|dkrg }|| }g }|jpg D ]9}	t
|	trt|	D ]}
|
d dkr||
 qq|| jjd | jjd | jjd | jjd  |	|	dd q|dkr	t| jjd}n$|dkrtj| jjddd}n|d
kr&t||jd}n|dkr-g }|| }t|t| }t||j|j|j|jd}tj||jdS )N)r   r   r   r   Z	explicit__linesrX   zIf z"_strategy == 'explicit', explicit_zD_lines must be specified as a list/tuple of two or more floats/ints.r   r   r   r   r   r_   r   r   line)Z	edge_typerJ   r   r   )r   r'   rM   r    r(   r   )r1   r2   r3   r4   )
min_lengthr#   )r   r   rG   r,   r   r   r   Zextract_wordsr   r   r   r   r   Zobj_to_edgesr!   r^   Zfilter_edgesr   rg   r   r   rV   r   r-   rA   r1   r2   r3   r4   r   )r   r   r   r   r   Zv_stratZh_stratrB   Z
v_explicitZdescr"   Zv_baser   Z
h_explicitZh_baser   r   r#   r#   r$   r   D  s   











zTableFinder.get_edgesrE   )	r   r   r   __doc__r   r   r   r   r   r#   r#   r#   r$   r   +  s    
r   )r   r   )4r=   dataclassesr   operatorr   typingr   r   r   r   r   r	   r
   r   r   r   Z_typingr   r   r   r   r   r   r   r   r   r   r<   ZT_intersectionsr   r   r   r%   r0   rA   r   rV   rg   rl   r   r   objectr   r   r   r   r   rY   r   r   r   r   r#   r#   r#   r$   <module>   s    ( 


"
,
@
?=A\