o
    ީZhL                     @   sf  d dl Z d dlZd dlZd dlmZ d dlmZmZmZ d dl	m
Z
mZmZmZmZmZmZmZmZmZmZ d dlmZ d dlmZ d dlmZmZ d dlmZ d	d
lmZm Z  d	dl!m"Z"m#Z# e$e%Z&e
rsd	dl'm(Z( d	dl)m*Z* edge+f Z,ded dee-ee- e,f ded fddZ.G dd dZ/eG dd de/Z0G dd de1Z2G dd de/Z3dS )    N)deque)asdict	dataclassfield)TYPE_CHECKINGAnyCallableDictIterableIteratorListOptionalPatternTupleUnion)
NumberTree)	PDFParser)	PDFObjRefresolve1)	PSLiteral   )T_bboxT_obj)decode_textgeometry)Page)PDFPDFStructElementelementsmatcherreturnc                 #   s    dddt f fdd}dddt f fdd}t tr|}nt tjr(|}n }t| }|rG| }||r;|V  |t|j	 |s0dS dS )	z=
    Common code for `find_all()` in trees and elements.
    xr   r    c                    s
   | j  kS )zMatch an element name.)typer!   r    K/var/www/html/lang_env/lib/python3.10/site-packages/pdfplumber/structure.py	match_tag/   s   
z_find_all.<locals>.match_tagc                    s     | jS )z,Match an element name by regular expression.)matchr"   r#   r$   r%   r&   match_regex3   s   z_find_all.<locals>.match_regexN)
bool
isinstancestrrer   r   popleft
extendleftreversedchildren)r   r   r'   r)   
match_funcdelr%   r$   r&   	_find_all'   s   
r5   c                   @   sf   e Zd ZU dZed ed< deeee e	f de
d fddZdeeee e	f ded fdd	Zd
S )FindablezRfind() and find_all() methods that can be inherited to avoid
    repeating oneselfr   r1   r   r    c                 C   s   t | j|S )zIterate depth-first over matching elements in subtree.

        The `matcher` argument is either an element name, a regular
        expression, or a function taking a `PDFStructElement` and
        returning `True` if the element matches.
        )r5   r1   selfr   r%   r%   r&   find_allK   s   	zFindable.find_allc                 C   s(   z	t t| j|W S  ty   Y dS w )zFind the first matching element in subtree.

        The `matcher` argument is either an element name, a regular
        expression, or a function taking a `PDFStructElement` and
        returning `True` if the element matches.
        N)nextr5   r1   StopIterationr7   r%   r%   r&   findV   s
   	zFindable.findN)__name__
__module____qualname____doc__r   __annotations__r   r,   r   	MatchFuncr   r9   r   r<   r%   r%   r%   r&   r6   E   s   
 
r6   c                   @   s   e Zd ZU eed< ee ed< ee ed< ee ed< ee ed< ee ed< ee ed< ee ed< eed	Z	e
eef ed
< eed	Zee ed< eed	Zed  ed< ded  fddZdeeee ef  fddZde
eef fddZdS )r   r"   revisionidlangalt_textactual_texttitlepage_number)default_factory
attributesmcidsr1   r    c                 C   
   t | jS Niterr1   r8   r%   r%   r&   __iter__s      
zPDFStructElement.__iter__c                 c   sd    | j D ]}| j|fV  qt| j}|r0| }|j D ]}|j|fV  q|t|j |sdS dS )zCollect all MCIDs (with their page numbers, if there are
        multiple pages in the tree) inside a structure element.
        N)rL   rI   r   r1   r.   r/   r0   )r8   mcidr3   r4   r%   r%   r&   	all_mcidsv   s   


zPDFStructElement.all_mcidsc                 C   sx   t | }t|g}|r:| }t| D ]}|| du s)|| g ks)|| i kr,||= qd|v r8||d  |s|S )z'Return a compacted dict representation.Nr1   )r   r   r.   listkeysextend)r8   rr3   r4   kr%   r%   r&   to_dict   s   
$zPDFStructElement.to_dictN)r=   r>   r?   r,   rA   r   intr   dictrK   r	   r   rV   rL   r   r1   r   rR   r   rU   r[   r%   r%   r%   r&   r   e   s   
 c                   @   s   e Zd ZdS )StructTreeMissingN)r=   r>   r?   r%   r%   r%   r&   r^      s    r^   c                   @   s   e Zd ZU dZed ed< dddded fddZd	eee	f d
ee
 deee	f fddZd	e	deee ee	 f fddZdee	 ddfddZd	eee	f defddZd ddZdeee	f ddfddZdee fddZdedefddZdS )!PDFStructTreeaz  Parse the structure tree of a PDF.

    The constructor takes a `pdfplumber.PDF` and optionally a
    `pdfplumber.Page`.  To avoid creating the entire tree for a large
    document it is recommended to provide a page.

    This class creates a representation of the portion of the
    structure tree that reaches marked content sections, either for a
    single page, or for the whole document.  Note that this is slightly
    different from the behaviour of other PDF libraries which will
    also include structure elements with no content.

    If the PDF has no structure, the constructor will raise
    `StructTreeMissing`.

    r   pageNdocr   c                    s(  |j | _ d| j jvrtdt| j jd | _t| jdi | _t| jdi | _g | _|d urw|| _	|j
|i| _d | _| jd}|d u rN|   d S t|}d| j	jjvr[d S | j	jjd  tt fdd|jD }| | d S d | _	d	d
 |jD | _dd
 | j D | _|   d S )NStructTreeRootzPDF has no structureZRoleMapZClassMapZ
ParentTreeZStructParentsc                 3   s     | ]\}}| kr|V  qd S rN   r%   ).0numarrayZ	parent_idr%   r&   	<genexpr>   s    z)PDFStructTree.__init__.<locals>.<genexpr>c                 S   s   i | ]}|j |qS r%   )rI   rc   r`   r%   r%   r&   
<dictcomp>   s    z*PDFStructTree.__init__.<locals>.<dictcomp>c                 S   s   i | ]}|j j|jqS r%   )page_objpageidrI   rh   r%   r%   r&   ri      s    )ra   catalogr^   r   rootgetrole_map	class_mapr1   r`   rI   pages	page_dict_parse_struct_treer   rj   attrsr:   values_parse_parent_tree)r8   ra   r`   Zparent_tree_objZparent_treeparent_arrayr%   rf   r&   __init__   s8   zPDFStructTree.__init__objrC   r    c                 C   s,  g }dD ]}||vrqt || }t|tr|| q|| qg }d }|D ]$}t|tr?||kr<|d ur<|| d }q(|d urH|| t |}q(|d urV|| i }	|D ]9}t|trwt|j}|| j	vrrt
d| qZ| j	| }| D ]\}
}t|trt|j|	|
< q{||
 |	|
< q{qZ|	S )N)CAzUnknown attribute class %s)r   r+   rV   rX   appendr\   r   r   namerp   loggerwarningitems)r8   ry   rC   Zattr_obj_listkeyZattr_objZ	attr_objsZprev_objZarefattrrZ   vr%   r%   r&   _make_attributes   sD   










zPDFStructTree._make_attributesc                 C   s  d|vs
J d| d|vsJ d| d }| j d ur4d|v r4|d j}|| j v s/J d| | j | }d}d|v rNt|d j}|| jv rNt| j| j}d	|v rXt|d	 ng }t|trc|g}n
t|trm|d	 g}|	d
}| 
||}d|v rt|d nd }d|v rt|d nd }	d|v rt|d nd }
d|v rt|d nd }d|v rt|d nd }t|||||
|	|||d	}||fS )NMCIDzUncaught MCR: %sObjzUncaught OBJR: %sPgzObject on unparsed page: %s SKRIDTZLangZAltZ
ActualText)	r"   rD   rI   rC   rE   rH   rF   rG   rK   )rr   objidr   r}   ro   r   r+   r\   r]   rn   r   r   )r8   ry   rI   
page_objidZobj_tagr1   rC   rK   Z
element_idrH   rE   rF   rG   elementr%   r%   r&   _make_element
  sH   






zPDFStructTree._make_elementrw   c           	      C   s   t |}i }d}|rM| }|tjkrqt||v rqt|}d|v r/t|d jdkr/d}n| |\}}|dus<J ||f|t|< |	|d  |s
|sQJ | 
| dS )zYPopulate the structure tree using the leaves of the parent tree for
        a given page.FTyperb   TNP)r   r.   r   ZKEYWORD_NULLreprr   r   r}   r   r|   _resolve_children)	r8   rw   r3   sZ
found_rootrefry   r   r1   r%   r%   r&   rv   2  s&   
z PDFStructTree._parse_parent_treec                 C   sJ   d|vrdS |d j }| jd ur|| jv S | jd ur#|| jjjkr#dS dS )Nr   TF)r   rr   r`   rj   rk   )r8   ry   r   r%   r%   r&   on_parsed_pageQ  s   



zPDFStructTree.on_parsed_pagec                    s2  t jd }t|trjd g}t|}i |r}| }t|v r%qt |}t|tr@d|v r@|s8q|d }t |}|\}}||ft|< |D ])}t |}t|trp|sbqQd|v rk|d }nd|v rpqQt|t	rz|
| qQ|sdtt dtt f fdd  |  dS )	zgPopulate the structure tree starting from the root, skipping
        unparsed pages and empty elements.r   r   r   r   r    c                    s   g }| D ]U}t |}t|tr|| qt|tr4|s qd|v r,||d  qd|v r4|d }t| \}} |}|d u sF|sLt|= q||ft|< || q|S )Nr   r   )r   r+   r\   r|   r]   r   r   )r   Znext_elementsr   ry   r   r1   pruner   r8   r%   r&   r     s*   



z/PDFStructTree._parse_struct_tree.<locals>.pruneN)r   rm   r+   r]   r   r.   r   r   r   r   r|   r   r   r   )r8   rm   r3   r   ry   r   r1   childr%   r   r&   rs   ]  sB   






"z PDFStructTree._parse_struct_treeseenc                    sr  t | jd }t|tr| jd g}g | _g }|D ]$}t |}t|tr2d|v r2| |s.q|d }t| v r=|| qt|}|r|	 } t| \}}|dusXJ d|D ]P}	t |	}t|t
rl|j| n t|tr| |swqZd|v r|j|d  nd|v r|d }	t|	tr t|	d\}
}|
dur|j|
 ||	 qZ|sD fdd|D | _dS )	z|Resolve children starting from the tree root based on references we
        saw when traversing the structure tree.
        r   r   NzUnparsed elementr   )NNc                    s   g | ]
} t | d  qS )r   )r   )rc   r   r   r%   r&   
<listcomp>  s    z3PDFStructTree._resolve_children.<locals>.<listcomp>)r   rm   r+   r]   r1   r   r   r|   r   r.   r\   rL   r   rn   )r8   r   rm   Zparsed_rootr   ry   r3   r   r1   r   Zchild_element_r%   r   r&   r     sL   







zPDFStructTree._resolve_childrenc                 C   rM   rN   rO   rQ   r%   r%   r&   rR     rS   zPDFStructTree.__iter__r4   c                 C   sH  d}| j dur| j }n|jdur| j|j }|jdd}|dur]|dur]ddl m}m}m} ||||jd |jd  }t	||r[t
|}||g}|sTtdt
|d S |S g }	| D ]5\}
}|
du r||durytj|j }ng }ntj| j|
 j }|D ]}|d |kr|	| qqc|	std	t
|	S )
z9Get the bounding box for an element for visual debugging.NZBBoxr   )CroppedPage_invert_box_normalize_box   zElement no longer on pager   rT   zNo objects found)r`   rI   rq   rK   rn   r   r   r   Zmediaboxr+   r   Zbbox_to_rectZ_crop_fn
IndexErrorZobj_to_bboxrU   	itertoolschainfrom_iterableobjectsru   r|   Zobjects_to_bbox)r8   r4   r`   Zbboxr   r   r   rectZrectsZ	mcid_objsrI   rT   r   cr%   r%   r&   element_bbox  sF   





zPDFStructTree.element_bboxrN   )r    N)r=   r>   r?   r@   r   rA   rx   r	   r,   r   r\   r   r   r   r   r   rv   r*   r   rs   r   r   rR   r   r   r%   r%   r%   r&   r_      s$   
 .


"1(
D,r_   )4r   loggingr-   collectionsr   dataclassesr   r   r   typingr   r   r   r	   r
   r   r   r   r   r   r   Zpdfminer.data_structuresr   Zpdfminer.pdfparserr   Zpdfminer.pdftypesr   r   Zpdfminer.psparserr   Z_typingr   r   utilsr   r   	getLoggerr=   r~   r`   r   Zpdfr   r*   rB   r,   r5   r6   r   
ValueErrorr^   r_   r%   r%   r%   r&   <module>   s:    4

 -