o
    ީZh^                  	   @   s  d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
mZmZmZ d dlmZ d dlmZmZmZmZmZmZmZ d dlmZmZ d dlmZ d dlmZ d	d
lm Z  d	dl!m"Z"m#Z#m$Z$m%Z% d	dl&m'Z' d	dl(m)Z)m*Z* d	dl+m,Z,m-Z-m.Z.m/Z/ d	dl m0Z0m1Z1m2Z2 d	dl3m4Z4 e 5dZ6e7g dZ8erd	dl9m:Z: d	dl;m<Z< ddddddZ=de>de?fddZ@deed f dee
eeeAeBf d f  e
e? f fd!d"ZCdedee
eeeAeBf d f  e
e? f fd#d$ZDd%ee?ef dee?ef fd&d'ZEG d(d) d)eZFd=d*e"d+e#de"fd,d-ZGd*e"d.e#de"fd/d0ZHG d1d2 d2e'ZIG d3d4 d4eIZJd5e"d6e"ddfd7d8ZKG d9d: d:eJZLG d;d< d<eJZMdS )>    N)	lru_cache)
TYPE_CHECKINGAnyCallableDict	GeneratorListOptionalPatternTupleUnion)PDFPageAggregator)LTCharLTComponentLTContainerLTCurveLTItemLTPageLTTextContainer)PDFPageInterpreter	PDFStackT)PDFPage)	PSLiteral   )utils)T_bboxT_numT_obj
T_obj_list)	Container)PDFStructTreeStructTreeMissing)T_table_settingsTableTableFinderTableSettings)decode_textresolve_allresolve_and_decode)TextMapz^LT)advheightZ	linewidthptssizeZsrcsizewidthx0x1y0y1bitsmatrixZuprightfontnametextZ	imagemaskZ
colorspaceZevenoddfillnon_stroking_colorstrokestroking_colorstreammcidtag)	PageImage)PDFzSimSun,RegularzSimHei,RegularzSimKai,RegularzSimFang,RegularzSimLi,Regular)s   s   s   _GB2312s   _GB2312s   r5   returnc                 C   sh   d| v r|  dd }| d | | |d  }}nd| }}t|t|dd }t|dd | S )N   +r          )indexCP936_FONTNAMESgetstr)r5   Zsplit_atprefixsuffixZ
suffix_new rK   F/var/www/html/lang_env/lib/python3.10/site-packages/pdfplumber/page.pyfix_fontname_bytesW   s   
rM   color.c                 C   s4   t | d tr| d d pd t| d jfS | d fS )NrD   )
isinstancer   r&   name)rN   rK   rK   rL   separate_patternb   s   rQ   c                 C   sJ   | d u rdS t | tr| }t|S t | trt| }t|S | f}t|S )N)NN)rO   tuplelistrQ   )rN   Z	tuplefiedrK   rK   rL   normalize_colork   s   

rT   kwargsc                 C   s   dd |   D S )Nc                 S   s(   i | ]\}}|t |trt|n|qS rK   )rO   rS   rR   ).0keyvaluerK   rK   rL   
<dictcomp>z   s    z'tuplify_list_kwargs.<locals>.<dictcomp>)items)rU   rK   rK   rL   tuplify_list_kwargsy   s   r[   c                       s   e Zd ZU dZdZee ed< dZee	 ed< dde
dee ddfdd	Zdd
dZdddZdef fddZd fddZd fddZ  ZS )"PDFPageAggregatorWithMarkedContentzZExtract layout from a specific page, adding marked-content IDs to
    objects where found.Ncur_mcidcur_tagr=   propsr@   c                 C   s6   t |j| _t|trd|v r|d | _dS d| _dS )z5Handle beginning of tag, setting current MCID if any.ZMCIDN)r&   rP   r^   rO   dictr]   )selfr=   r_   rK   rK   rL   	begin_tag   s   
z,PDFPageAggregatorWithMarkedContent.begin_tagc                 C   s   d| _ d| _dS )z/Handle beginning of tag, clearing current MCID.N)r^   r]   ra   rK   rK   rL   end_tag   s   
z*PDFPageAggregatorWithMarkedContent.end_tagc                 C   s,   | j jr| j jd }| j|_| j|_dS dS )z^Add current MCID to what we hope to be the most recent object created
        by pdfminer.six.rD   N)Zcur_item_objsr]   r<   r^   r=   )ra   Zcur_objrK   rK   rL   tag_cur_item   s
   	z/PDFPageAggregatorWithMarkedContent.tag_cur_itemc                    s   t  j|i |}|   |S )z;Hook for rendering characters, adding the `mcid` attribute.)superrender_charrf   )ra   argsrU   r*   	__class__rK   rL   rh      s   z.PDFPageAggregatorWithMarkedContent.render_charc                       t  j|i | |   dS )z7Hook for rendering images, adding the `mcid` attribute.N)rg   render_imagerf   ra   ri   rU   rj   rK   rL   rm         z/PDFPageAggregatorWithMarkedContent.render_imagec                    rl   )zAHook for rendering lines and curves, adding the `mcid` attribute.N)rg   
paint_pathrf   rn   rj   rK   rL   rp      ro   z-PDFPageAggregatorWithMarkedContent.paint_pathNr@   N)__name__
__module____qualname____doc__r]   r	   int__annotations__r^   rH   r   r   rb   rd   rf   floatrh   rm   rp   __classcell__rK   rK   rj   rL   r\      s   
 

r\   box_rawrotationc                 C   sP   t | d | d f\}}t | d | d f\}}|dv r"||||fS ||||fS )Nr   rC   r      )Z   i  )sorted)r{   r|   r/   r0   r1   r2   rK   rK   rL   _normalize_box   s
   r   	mb_heightc                 C   s    | \}}}}||| ||| fS rq   rK   )r{   r   r/   r1   r0   r2   rK   rK   rL   _invert_box      r   c                   @   s  e Zd ZU ejdg Zee ed< dZe	ed< dZ
	dgddd	ed
edefddZdhddZedefddZedefddZedeeeef  fddZedefddZedefddZedefddZedeeef fddZdeeef deeef fd d!Zd"edefd#d$Z d%ee! de"eddf fd&d'Z#deeef fd(d)Z$	did*e%e& de'fd+d,Z(	did*e%e& dee) fd-d.Z*	did*e%e& de%e) fd/d0Z+	did*e%e& deeee%e    fd1d2Z,	did*e%e& de%eee%e    fd3d4Z-d5ede.fd6d7Z/					djd8e0ee1e f d9e	d:e	d;ed<e	d=e	d5edeeeef  fd>d?Z2d5edefd@dAZ3d5edefdBdCZ4d5edefdDdEZ5	dkdFe	d<e	d5edefdGdHZ6	dldJe7dKe	dLe	ddMfdNdOZ8	dldJe7dKe	dLe	ddMfdPdQZ9	dldJe7dKe	dLe	ddMfdRdSZ:dTe;ege	f ddUfdVdWZ<d5eddUfdXdYZ=				I	IdmdZe%e0ee>f  d[e%e0ee>f  d\e%e0ee>f  d]e	d^e	dd_fd`daZ?didbe%ee  deeef fdcddZ@defdedfZAdS )nPage_layoutcached_propertiesTis_originalNr   pdfr?   page_objpage_numberinitial_doctopc           	         s   || _ | | _ | _|| _|| _ddtdtdtf fdd}|dd}|d | _t|d	| j}|d
 |d  }t	||| _
d jv rOt	t|d| j|| _n| j
| _| j
| _t | j| _d S )NrW   defaultr@   c                    s     j | }|d u r|S t|S rq   )attrsrG   r'   )rW   r   refr   rK   rL   get_attr   r   zPage.__init__.<locals>.get_attrZRotater   ih  ZMediaBoxr}   r   ZCropBoxrq   )r   	root_pager   r   r   rH   r   r|   r   r   mediaboxr   cropboxbboxr   _get_textmapget_textmap)	ra   r   r   r   r   r   Z	_rotationZmb_rawr   rK   r   rL   __init__   s$   


zPage.__init__r@   c                 C   s   |    | j  d S rq   )flush_cacher   cache_clearrc   rK   rK   rL   close   s   z
Page.closec                 C      | j d | j d  S )NrC   r   r   rc   rK   rK   rL   r.         z
Page.widthc                 C   r   )Nr}   r   r   rc   rK   rK   rL   r+      r   zPage.heightc                 C   s0   zdd t | j| D W S  ty   g  Y S w )z-Return the structure tree for a page, if any.c                 S   s   g | ]}|  qS rK   )to_dict)rV   elemrK   rK   rL   
<listcomp>  s    z'Page.structure_tree.<locals>.<listcomp>)r    r   r!   rc   rK   rK   rL   structure_tree  s
   zPage.structure_treec                 C   sR   t | dr| jS t| jj| j| jjd}t| jj|}|| j	 |
 | _| jS )Nr   )Zpagenolaparams)hasattrr   r\   r   Zrsrcmgrr   r   r   Zprocess_pager   Z
get_result)ra   ZdeviceinterpreterrK   rK   rL   layout
  s   

zPage.layoutc                    s`   dt ttf dtdt ttf ffdd dtdtf fdd}tjjp(g }tt||S )	Nptrr@   c                    sF   |d }t |D ]}| \}}||d kr jn j}||| f} q| S )Nr~   rC   )ranger.   r+   )r   r   Zturnsixycomprc   rK   rL   rotate_point  s   z!Page.annots.<locals>.rotate_pointannotc                    s(  | d \}}}} ||fj } ||fj }ttg ||R j\}}}	}
| di }|d| d| dd}| D ]"\}}|d urdz	|d||< W qB tyc   |d||< Y qBw qBjd	|j|
 |	j| j	| ||
|	| |
| d
}|
| d| v r| d< | |d< |S )NZRectAURITZContents)urititlecontentszutf-8zutf-16r   )r   object_typer/   r1   r0   r2   doctoptopbottomr.   r+   Pdata)r|   r   r   r+   rG   rZ   decodeUnicodeDecodeErrorr   r   update)r   _a_bZ_c_dZpt0Zpt1r/   r   r0   r   aextraskvparsedr   ra   rK   rL   parse"  sD   "
zPage.annots.<locals>.parse)	r   ry   rw   r   r'   r   annotsrS   map)ra   r   rawrK   r   rL   r     s   *(zPage.annotsc                 C   s   dd | j D S )Nc                 S   s   g | ]
}|d  dur|qS )r   NrK   )rV   r   rK   rK   rL   r   O  s    z#Page.hyperlinks.<locals>.<listcomp>)r   rc   rK   rK   rL   
hyperlinksM  s   zPage.hyperlinksc                 C   s    t | dr| jS |  | _| jS )N_objects)r   r   parse_objectsrc   rK   rK   rL   objectsQ  s   

zPage.objectsr   c                 C   s   |d | j |d  fS )Nr   r   )r+   )ra   r   rK   rK   rL   point2coordX     zPage.point2coordobjc           	         s  t td|jj }dtttf dt	tttf  fdd}t
td t||j }||d<  j|d< dD ]}t||rGtt||j||< q6d	D ]\}}||v r^t|| \||< ||< qJt|ttfrl| |d
< t|tr|j}t|j\|d< |d< t|j\|d< |d< t|d trt|d |d< n#t|tfrtt j |d |d<  fdd|j!D |d< |j"|d< d|v r܈ j#|d  |d<  j#|d  |d<  j$|d  |d< |S )N itemr@   c                 S   s$   | \}}|t v rt|}||fS d S rq   )	ALL_ATTRSr'   )r   r   r   resrK   rK   rL   process_attr^  s
   z)Page.process_object.<locals>.process_attrr   r   )ncsZscs))r:   stroking_pattern)r8   non_stroking_patternr6   r:   r   r8   r   r5   r,   c                    s$   g | ]^}}|gt  j|R qS rK   )r   r   )rV   cmdr,   rc   rK   rL   r     s   $ z'Page.process_object.<locals>.<listcomp>pathdashr1   r2   r   r   r   )%resublt_patrk   rs   lowerr   rH   r   r	   r`   filterr   __dict__rZ   r   r   r(   getattrrP   rT   rO   r   r   Zget_textZgraphicstateZscolorZncolorbytesrM   r   rS   r   Zoriginal_pathZdashing_styler+   r   )	ra   r   kindr   attrcsZ
color_attrZpattern_attrgsrK   rc   rL   process_object[  sF   &



zPage.process_objectlayout_objectsc                 c   sR    |D ]#}t |tr | jjd ur| |V  | |jE d H  q| |V  qd S rq   )rO   r   r   r   r   iter_layout_objectsre   )ra   r   r   rK   rK   rL   r     s   
zPage.iter_layout_objectsc                 C   sR   i }|  | jjD ]}|d }|dv rq	||d u rg ||< || | q	|S )Nr   )anno)r   r   re   rG   append)ra   r   r   r   rK   rK   rL   r     s   zPage.parse_objectstable_settingsc                 C   s   t |}t| |S rq   )r%   resolver$   ra   r   tsetrK   rK   rL   debug_tablefinder  s   

zPage.debug_tablefinderc                 C   s   t |}t| |jS rq   )r%   r   r$   tablesr   rK   rK   rL   find_tables  s   
zPage.find_tablesc                 C   sX   t |}| |}t|dkrd S dtdttttf fdd}tt	||dd }|S )Nr   r   r@   c                 S   s   t | j | jd | jd fS )Nr   r   )lencellsr   r   rK   rK   rL   sorter  s   zPage.find_table.<locals>.sorter)rW   )
r%   r   r   r   r#   r   rw   r   rS   r   )ra   r   r   r   r   ZlargestrK   rK   rL   
find_table  s   

zPage.find_tablec                    s&   t | |  } fdd|D S )Nc                    s"   g | ]}|j d i  jpi qS )rK   )extracttext_settings)rV   tabler   rK   rL   r     s   " z'Page.extract_tables.<locals>.<listcomp>)r%   r   r   )ra   r   r   rK   r   rL   extract_tables  s   

zPage.extract_tablesc                 C   s6   t |}| |}|d u rd S |jdi |jpi S NrK   )r%   r   r   r   r   )ra   r   r   r   rK   rK   rL   extract_table  s
   

zPage.extract_tablerU   c                 K   s\   t | jd}d|vr|d| ji d|vr|d| ji i ||}tj| jfi |S )N)Zlayout_bboxZlayout_width_charsZlayout_widthZlayout_height_charsZlayout_height)r`   r   r   r.   r+   r   Zchars_to_textmapchars)ra   rU   defaultsZfull_kwargsrK   rK   rL   r     s   zPage._get_textmappatternregexcase
main_groupreturn_charsreturn_groupsc           	      K   s*   | j di t|}|j||||||dS )N)r  r  r  r  r  rK   )r   r[   search)	ra   r  r  r  r  r  r  rU   ZtextmaprK   rK   rL   r	    s   
zPage.searchc                 K   s   | j di t|jS r   )r   r[   	as_stringra   rU   rK   rK   rL   extract_text  r   zPage.extract_textc                 K      t j| jfi |S rq   )r   extract_text_simpler  r  rK   rK   rL   r       zPage.extract_text_simplec                 K   r  rq   )r   extract_wordsr  r  rK   rK   rL   r  
  r  zPage.extract_wordsstripc                 K   s   | j di t|j||dS )N)r  r  rK   )r   r[   extract_text_lines)ra   r  r  rU   rK   rK   rL   r    s   zPage.extract_text_linesFr   relativestrictCroppedPagec                 C   s   t | |||dS )N)r  r  )r  ra   r   r  r  rK   rK   rL   crop  s   z	Page.cropc                 C      t | |||tjdS zS
        Same as .crop, except only includes objects fully within the bbox
        )r  r  crop_fn)r  r   within_bboxr  rK   rK   rL   r       zPage.within_bboxc                 C   r  r  )r  r   outside_bboxr  rK   rK   rL   r  #  r  zPage.outside_bboxtest_functionFilteredPagec                 C   s
   t | |S rq   )r  )ra   r  rK   rK   rL   r   -     
zPage.filterc                 K   sB   t | dd }dd | j D |_tj| jfi ||jd< |S )u   
        Removes duplicate chars — those sharing the same text, fontname, size,
        and positioning (within `tolerance`) as other characters on the page.
        c                 S   s   dS )NTrK   r   rK   rK   rL   <lambda>5  s    z#Page.dedupe_chars.<locals>.<lambda>c                 S   s   i | ]\}}||qS rK   rK   )rV   r   objsrK   rK   rL   rY   6  s    z%Page.dedupe_chars.<locals>.<dictcomp>char)r  r   rZ   r   r   dedupe_charsr  )ra   rU   prK   rK   rL   r$  0  s   zPage.dedupe_chars
resolutionr.   r+   	antialiasforce_mediaboxr>   c           	      C   s   ddl m}m} tdd |||fD }|dkrtd| |dur+d| | j }n|dur6d| | j }|| |p;|||dS )	z
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to 72.
        - width: The desired image width in pixels.
        - height: The desired image width in pixels.
        r   )DEFAULT_RESOLUTIONr>   c                 s   s    | ]}|d uV  qd S rq   rK   )rV   r   rK   rK   rL   	<genexpr>J  s    z Page.to_image.<locals>.<genexpr>zUOnly one of these arguments can be provided: resolution, width, height. You provided NH   )r&  r'  r(  )displayr)  r>   sum
ValueErrorr.   r+   )	ra   r&  r.   r+   r'  r(  r)  r>   Z	num_specsrK   rK   rL   to_image:  s    zPage.to_imageobject_typesc              	   C   sl   |d u rt | j dg }n|}| j| j| j| j| j| j| j	| j
d}|D ]}t| |d ||d < q&|S )Nr   )r   r   r|   r   r   r   r.   r+   s)rS   r   keysr   r   r|   r   r   r   r.   r+   r   )ra   r0  Z_object_typesdtrK   rK   rL   r   [  s   
zPage.to_dictc                 C   s   d| j  dS )Nz<Page:>)r   rc   rK   rK   rL   __repr__n  s   zPage.__repr__r   rr   rq   )TTr   TT)TT)FT)NNNFF)Brs   rt   ru   r   r   r   rH   rx   r   boolZpagesr   rw   r   r   r   propertyr.   r+   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r	   r"   r$   r   r#   r   r   r   r   r)   r   r   r
   r	  r  r  r  r  r   r  r  r  r   r   r$  ry   r/  r   r6  rK   rK   rK   rL   r      s6  
 

)4"C






	






$!r   c                   @   s(   e Zd ZU dZeed< defddZdS )DerivedPageFr   parent_pagec                 C   sT   || _ |j| _|j| _|j| _|j| _|j| _|j| _| tj	 t
 | j| _d S rq   )r;  r   r   r   r   r   r   r   r   r   r   r   r   )ra   r;  rK   rK   rL   r   u  s   zDerivedPage.__init__N)rs   rt   ru   r   r8  rx   r   r   rK   rK   rK   rL   r:  r  s   
 r:  r   parent_bboxc                 C   st   t | }|dkrtd|  dt | |}|d u r%td|  d| t |}||k r8td|  d| d S )Nr   zBounding box z has an area of zero.z. is entirely outside parent page bounding box z. is not fully within parent page bounding box )r   Zcalculate_arear.  Zget_bbox_overlap)r   r<  Z	bbox_areaoverlapZoverlap_arearK   rK   rL   test_proposed_bbox  s$   

r>  c                       sb   e Zd Zejddfdededeeegef de	de	f
 fdd	Z
ed
eeef fddZ  ZS )r  FTr;  	crop_bboxr  r  r  c                    s   |r|j \}}}} \}	}
}}|	| |
| || || f |r%t |j  dtdtf fdd}t | || _tju rE|j | _ d S  | _ d S )Nr"  r@   c                    s
   |  S rq   rK   )r"  r?  r  rK   rL   _crop_fn  r   z&CroppedPage.__init__.<locals>._crop_fn)r   r>  r   rg   r   rA  r   r  )ra   r;  r?  r  r  r  Zo_x0Zo_top_r/   r   r0   r   rA  rj   r@  rL   r     s   

zCroppedPage.__init__r@   c                    2   t  dr jS  fdd jj D  _ jS )Nr   c                    s   i | ]
\}}|  |qS rK   )rA  rV   r   r   rc   rK   rL   rY     s    z'CroppedPage.objects.<locals>.<dictcomp>r   r   r;  r   rZ   rc   rK   rc   rL   r     s   


zCroppedPage.objects)rs   rt   ru   r   Zcrop_to_bboxr   r   r   r   r8  r   r9  r   rH   r   rz   rK   rK   rj   rL   r    s"     r  c                       sJ   e Zd Zdedeegef f fddZede	e
ef fddZ  ZS )r  r;  	filter_fnc                    s   |j | _ || _t | d S rq   )r   rF  rg   r   )ra   r;  rF  rj   rK   rL   r     s   zFilteredPage.__init__r@   c                    rC  )Nr   c                    s"   i | ]\}}|t t j|qS rK   )rS   r   rF  rD  rc   rK   rL   rY     s    z(FilteredPage.objects.<locals>.<dictcomp>rE  rc   rK   rc   rL   r     s   


zFilteredPage.objects)rs   rt   ru   r   r   r   r8  r   r9  r   rH   r   r   rz   rK   rK   rj   rL   r    s      r  r7  )Nr   	functoolsr   typingr   r   r   r   r   r   r	   r
   r   r   Zpdfminer.converterr   Zpdfminer.layoutr   r   r   r   r   r   r   Zpdfminer.pdfinterpr   r   Zpdfminer.pdfpager   Zpdfminer.psparserr   r   r   Z_typingr   r   r   r   	containerr   Z	structurer    r!   r   r"   r#   r$   r%   r&   r'   r(   Z
utils.textr)   compiler   setr   r,  r>   r   r?   rF   r   rH   rM   ry   rw   rQ   rT   r[   r\   r   r   r   r:  r>  r  r  rK   rK   rK   rL   <module>   sf    0$	
 	
"
	"
"3   -(