o
    Zh 1                     @   s   d Z ddlZddlZddlZddlmZ ddlmZ	 ddl
mZ ddlmZ ejZeeZG dd de	jZG dd de	jZdS )	)PdfTextPagePdfTextSearcher    N)PdfiumError)PDFIUM_INFOc                       s   e Zd ZdZ fddZedd ZdddZdddZdddZ	dd Z
d ddZdd Zd!ddZdd Zd"ddZ  ZS )#r   z
    Text page helper class.
    
    Attributes:
        raw (FPDF_TEXTPAGE): The underlying PDFium textpage handle.
        page (PdfPage): Reference to the page this textpage belongs to.
    c                       || _ || _t tj d S N)rawpagesuper__init__pdfium_cZFPDFText_ClosePage)selfr   r	   	__class__ R/var/www/html/lang_env/lib/python3.10/site-packages/pypdfium2/_helpers/textpage.pyr         zPdfTextPage.__init__c                 C      | j S r   )r	   r   r   r   r   parent!      zPdfTextPage.parentr   c                 C   sp   ||krdS t | |}|dkr| |d ||d |S t | |}|dkr2| ||d ||d S ||||fS )Nr      )r   Z"FPDFText_GetTextIndexFromCharIndex_get_active_text_range)r   Zc_startZc_end	l_passive	r_passivet_startt_endr   r   r   r   &   s   z"PdfTextPage._get_active_text_ranger   ignoreFc                 C   s  ||fdkr|st d | j|dS |dkr|  | }| ||| d }|dkr-dS |\}}}}	||7 }|||	 8 }|d | }
dtj  k rNd	k rTn n|
d
9 }
|
d7 }
t|
d
 }t	|t
tj}t| |||}|
|ksJ d|
 d| |jd|d d
  jd|dS )a  
        Warning:
            .. versionchanged:: 4.28
               For various reasons, calling this method with default params now implicitly translates to :meth:`.get_text_bounded` (pass ``force_this=True`` to circumvent).
        
        Extract text from a given range.
        
        Parameters:
            index (int): Index of the first char to include.
            count (int): Number of chars to cover, relative to the internal char list. Defaults to -1 for all remaining chars after *index*.
            errors (str): Error handling when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text in the range in question, or an empty string if no text was found.
        
        Note:
            * The returned text's length does not have to match *count*, even if it will for most PDFs.
              This is because the underlying API may exclude/insert chars compared to the internal list, although rare in practice.
              This means, if the char at ``i`` is excluded, ``get_text_range(i, 2)[1]`` will raise an index error.
              Pdfium provides raw APIs ``FPDFText_GetTextIndexFromCharIndex()`` / ``FPDFText_GetCharIndexFromTextIndex()`` to translate between the two views and identify excluded/inserted chars.
            * In case of leading/trailing excluded characters, pypdfium2 modifies *index* and *count* accordingly to prevent pdfium from unexpectedly reading beyond ``range(index, index+count)``.
        r   r   z]get_text_range() call with default params will be implicitly redirected to get_text_bounded()errorsr   r   r    i  i     zBuffer too small: z vs N	utf-16-le)warningswarnget_text_boundedcount_charsr   r   buildctypescreate_string_buffercastPOINTERc_ushortr   ZFPDFText_GetTextr   decode)r   indexcountr!   Z
force_thisZactive_ranger   r   r   r   Zin_countbuffer
buffer_ptrZ	out_countr   r   r   get_text_range6   s(   
 zPdfTextPage.get_text_rangeNc                 C   s   | j  }|du r|d }|du r|d }|du r|d }|du r%|d }| ||||f}tjg |ddR  }|dkr>dS t|d }	t|	ttj}
tjg ||
|R   |	j	j
d|dS )	a  
        Extract text from given boundaries in PDF coordinates.
        If a boundary value is None, it defaults to the corresponding value of :meth:`.PdfPage.get_bbox`.
        
        Parameters:
            errors (str): Error treatment when decoding the data (see :meth:`bytes.decode`).
        Returns:
            str: The text on the page area in question, or an empty string if no text was found.
        Nr   r   r#      r"   r$   r    )r	   Zget_bboxr   ZFPDFText_GetBoundedTextr*   r+   r,   r-   r.   r   r/   )r   leftbottomrighttopr!   Zbboxargsn_charsr2   r3   r   r   r   r'   q   s"   
zPdfTextPage.get_text_boundedc                 C   s   t | }|dkrtd|S )zV
        Returns:
            int: The number of characters on the text page.
        r   zFailed to get character count.)r   ZFPDFText_CountCharsr   )r   r;   r   r   r   r(      s   
zPdfTextPage.count_charsc                 C   s"   t | ||}|dkrtd|S )a  
        Parameters:
            index (int): Start character index.
            count (int): Character count to consider (defaults to -1 for all remaining).
        Returns:
            int: The number of text rectangles in the given character range.
        r   zFailed to count rectangles.)r   ZFPDFText_CountRectsr   )r   r0   r1   Zn_rectsr   r   r   count_rects   s   zPdfTextPage.count_rectsc                 C   s"   t | ||||}|dk rdS |S )a  
        Get the index of a character by position.
        
        Parameters:
            x (float): Horizontal position (in PDF canvas units).
            y (float): Vertical position.
            x_tol (float): Horizontal tolerance.
            y_tol (float): Vertical tolerance.
        Returns:
            int | None: The index of the character at or nearby the point (x, y).
            May be None if there is no character or an error occurred.
        r   N)r   ZFPDFText_GetCharIndexAtPos)r   xyZx_tolZy_tolr0   r   r   r   	get_index   s   zPdfTextPage.get_indexc           	      C   s   |rt  }t | ||}|j|j|j|jf\}}}}n&t t t t f\}}}}t | |||||}|j	|j	|j	|j	f\}}}}|sHt
d||||fS )a  
        Get the bounding box of a single character.
        
        Parameters:
            index (int):
                Index of the character to work with, in the page's character array.
            loose (bool):
                Get a more comprehensive box covering the entire font bounds, as opposed to the default tight box specific to the one character.
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zFailed to get charbox.)r   ZFS_RECTFZFPDFText_GetLooseCharBoxr6   r7   r8   r9   c_doubleZFPDFText_GetCharBoxvaluer   )	r   r0   looserectoklbrtr   r   r   get_charbox   s   zPdfTextPage.get_charboxc                 C   sP   t  t  t  t  f\}}}}t| |||||}|std|j|j|j|jfS )al  
        Get the bounding box of a text rectangle at the given index.
        Note that :meth:`.count_rects` must be called once with default parameters
        before subsequent :meth:`.get_rect` calls for this function to work (due to PDFium's API).
        
        Returns:
            Float values for left, bottom, right and top in PDF canvas units.
        zzFailed to get rectangle. (Make sure count_rects() was called with default params once before subsequent get_rect() calls.))r@   r   ZFPDFText_GetRectr   rA   )r   r0   rE   rF   rG   rH   rD   r   r   r   get_rect   s
   	zPdfTextPage.get_rectc                 C   s   t |dkr
tdd}|r|tjO }|r|tjO }|r!|tjO }|d d}t|t	tj
}t| |||}	t|	| }
| |
 |
S )au  
        Locate text on the page.
        
        Parameters:
            text (str):
                The string to search for.
            index (int):
                Character index at which to start searching.
            match_case (bool):
                If True, the search will be case-specific (upper and lower letters treated as different characters).
            match_whole_word (bool):
                If True, substring occurrences will be ignored (e. g. `cat` would not match `category`).
            consecutive (bool):
                If False (the default), :meth:`.search` will skip past the current match to look for the next match.
                If True, parts of the previous match may be caught again (e. g. searching for `aa` in `aaaa` would match 3 rather than 2 times).
        Returns:
            PdfTextSearcher: A helper object to search text.
        r   z#Text length must be greater than 0. r$   )len
ValueErrorr   ZFPDF_MATCHCASEZFPDF_MATCHWHOLEWORDZFPDF_CONSECUTIVEencoder*   r,   r-   r.   ZFPDFText_FindStartr   Z_add_kid)r   textr0   
match_caseZmatch_whole_wordZconsecutiveflagsZenc_textZenc_text_ptrZraw_searcherZsearcherr   r   r   search   s   




zPdfTextPage.search)r   r   )r   r   r   F)NNNNr   r   )F)r   FFF)__name__
__module____qualname____doc__r   propertyr   r   r4   r'   r(   r<   r?   rI   rJ   rR   __classcell__r   r   r   r   r      s    



; 

r   c                       sD   e Zd ZdZ fddZedd Zdd Zdd	 Zd
d Z	  Z
S )r   z
    Text searcher helper class.
    
    Attributes:
        raw (FPDF_SCHHANDLE): The underlying PDFium searcher handle.
        textpage (PdfTextPage): Reference to the textpage this searcher belongs to.
    c                    r   r   )r   textpager
   r   r   ZFPDFText_FindClose)r   r   rY   r   r   r   r     r   zPdfTextSearcher.__init__c                 C   r   r   )rY   r   r   r   r   r     r   zPdfTextSearcher.parentc                 C   s,   || }|sd S t | }t | }||fS r   )r   ZFPDFText_GetSchResultIndexZFPDFText_GetSchCount)r   Z	find_funcrD   r0   r1   r   r   r   _get_occurrence#  s   

zPdfTextSearcher._get_occurrencec                 C      |  tjS )z
        Returns:
            (int, int): Start character index and count of the next occurrence,
            or None if the last occurrence was passed.
        )rZ   r   ZFPDFText_FindNextr   r   r   r   get_next+     zPdfTextSearcher.get_nextc                 C   r[   )z
        Returns:
            (int, int): Start character index and count of the previous occurrence (i. e. the one before the last valid occurrence),
            or None if the last occurrence was passed.
        )rZ   r   ZFPDFText_FindPrevr   r   r   r   get_prev3  r]   zPdfTextSearcher.get_prev)rS   rT   rU   rV   r   rW   r   rZ   r\   r^   rX   r   r   r   r   r     s    
r   )__all__r*   loggingr%   Zpypdfium2.rawr   r   Zpypdfium2.internalZinternalZpdfium_iZpypdfium2._helpers.miscr   Zpypdfium2.versionr   r@   	getLoggerrS   loggerZAutoCloseabler   r   r   r   r   r   <module>   s   
 ~