o
    Zh                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZ ddlmZ ddlZddlZdd	lmZ dd
lm Z  ddl!m"Z" ddl#m$Z$m%Z% erddl&Z&ddl'Z'ddl(Z(ddl)Z)ddl*m+Z+ g dZ,g dZ-d;ddZ.e/e0Z1dZ2dZ3dZ4dZ5h dZ6d<ddZ7d=d"d#Z8d=d$d%Z9d&d'gZ:d>d+d,Z;G d-d. d.e Z<G d/d0 d0e Z=G d1d2 d2e Z>G d3d4 d4e Z?G d5d6 d6e Z@G d7d8 d8e ZAG d9d: d:e ZBdS )?z(Module contains common parsers for PDFs.    )annotationsN)datetime)Path)TemporaryDirectory)TYPE_CHECKINGAnyBinaryIOIterableIteratorLiteralMappingOptionalSequenceUnioncast)urlparse)Document)BaseBlobParser)Blob)BaseImageBlobParserRapidOCRBlobParser)TextLinearizationConfig)Z	DCTDecodeZDCTZ	JPXDecode)Z	LZWDecodeZLZWZFlateDecodeZFlZASCII85DecodeZA85ZASCIIHexDecodeZAHxZRunLengthDecodeZRLZCCITTFaxDecodeZCCFZJBIG2Decodeimages,Sequence[Union[Iterable[np.ndarray], bytes]]returnstrc                 C  sl   zddl m} W n ty   tdw | }d}| D ]}||\}}|r3dd |D }|d|7 }q|S )zExtract text from images with RapidOCR.

    Args:
        images: Images to extract text from.

    Returns:
        Text extracted from images.

    Raises:
        ImportError: If `rapidocr-onnxruntime` package is not installed.
    r   )RapidOCRzc`rapidocr-onnxruntime` package not found, please install it with `pip install rapidocr-onnxruntime` c                 S  s   g | ]}|d  qS )    ).0textr   r   g/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/parsers/pdf.py
<listcomp>Z       z5extract_from_images_with_rapidocr.<locals>.<listcomp>
)Zrapidocr_onnxruntimer   ImportErrorjoin)r   r   Zocrr!   imgresult_r   r   r"   !extract_from_images_with_rapidocr@   s    r+   z

{image_text}

r%   z
>   producertotal_pagescreationdatesourcecreatorblobr   contentformatc                 C  s`   |r.| j pd}|dkr|dd}d| d| d}|S |dkr.d	tj|d
d d| d}|S )a  Format the content of the image with the source of the blob.

    blob: The blob containing the image.
    format::
      The format for the parsed output.
      - "text" = return the content as is
      - "markdown-img" = wrap the content into an image markdown link, w/ link
      pointing to (`![body)(#)`]
      - "html-img" = wrap the content as the `alt` text of an tag and link to
      (`<img alt="{body}" src="#"/>`)
    #zmarkdown-img]z\\]z![z]()zhtml-imgz
<img alt="T)quotez src="z" />)r/   replacehtmlescape)r1   r2   r3   r/   r   r   r"   _format_inner_imagei   s   
r;   metadatadict[str, Any]c                 C  s4   t |  stdt| ddtstd| S )zValidate that the metadata has all the standard keys and the page is an integer.

    The standard keys are:
    - source
    - total_page
    - creationdate
    - creator
    - producer

    Validate that page is an integer if it is present.
    z3The PDF parser must valorize the standard metadata.pager   z(The PDF metadata page must be a integer.)_STD_METADATA_KEYSissubsetkeys
ValueError
isinstancegetint)r<   r   r   r"   _validate_metadata   s
   rF   c              	   C  s   i }ddd}|   D ]f\}}t|ttfvrt|}|dr&|dd }| }|dv rMzt|dd	d
	d||< W q t
yL   |||< Y qw ||v r\|||| < |||< qt|trh| ||< qt|trq|||< q|S )zPurge metadata from unwanted keys and normalize key names.

    Args:
        metadata: The original metadata dictionary.

    Returns:
        The cleaned and normalized the key format of metadata dictionary.
    r-   r/   )Z
page_count	file_path/r   N)r.   Zmoddate'r   zD:%Y%m%d%H%M%S%zT)itemstyper   rE   
startswithlowerr   strptimer8   	isoformatrB   rC   strip)r<   Znew_metadataZmap_keykvr   r   r"   _purge_metadata   s:   	




rT   z




extras	list[str]text_from_pagec                   sR   d fd	d
  | |d}|s'd}d tdd | }|r#td | }|| }|S )a5  Insert extras such as image/table in a text between two paragraphs if possible,
    else at the end of the text.

    Args:
        extras: List of extra content (images/tables) to insert.
        text_from_page: The text content from the page.

    Returns:
        The merged text with extras inserted.
    rV   rW   rX   r   recursboolr   Optional[str]c           	        s   | rPt D ]G}||}|dkrKd }|r | |d | d}|r(|||d   }n d}dtdd | }|r:|| }|d | | ||d   } |S qd }|S |}|S )NFr   rU   c                 S     | S Nr   xr   r   r"   <lambda>       zO_merge_text_and_extras.<locals>._recurs_merge_text_and_extras.<locals>.<lambda>)_PARAGRAPH_DELIMITERrfindr'   filter)	rV   rX   rY   delimposZprevious_textall_text
all_extras
str_extras_recurs_merge_text_and_extrasr   r"   rl      s0   
z=_merge_text_and_extras.<locals>._recurs_merge_text_and_extrasTr   rU   c                 S  r]   r^   r   r_   r   r   r"   ra      rb   z(_merge_text_and_extras.<locals>.<lambda>r\   N)rV   rW   rX   r   rY   rZ   r   r[   )r'   re   rc   )rV   rX   rh   ri   rj   r   rk   r"   _merge_text_and_extras   s   rm   c                      sL   e Zd ZdZ		d#dedddddd$ fddZd%ddZd&d!d"Z  ZS )'PyPDFParsera  Parse a blob from a PDF using `pypdf` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images.
    It integrates the 'pypdf' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFParser

            parser = PyPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NFr>   r!   plain)modepages_delimiterimages_parserimages_inner_formatextraction_modeextraction_kwargspasswordOptional[Union[str, bytes]]extract_imagesrZ   rp   Literal['single', 'page']rq   r   rr   Optional[BaseImageBlobParser]rs   +Literal['text', 'markdown-img', 'html-img']rt   Literal['plain', 'layout']ru   Optional[dict[str, Any]]c          	        s`   t    |dvrtd|| _|r|st }|| _|| _|| _|| _|| _	|| _
|p,i | _dS )u  Initialize a parser based on PyPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            extract_images: Whether to extract images from the PDF.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extraction_mode: “plain” for legacy functionality, “layout” extract text
                in a fixed width format that closely adheres to the rendered layout in
                the source pdf.
            extraction_kwargs: Optional additional parameters for the extraction
                process.

        Raises:
            ValueError: If the `mode` is not "single" or "page".
        singler>   mode must be single or pageN)super__init__rB   rx   r   rr   rs   rv   rp   rq   rt   ru   )	selfrv   rx   rp   rq   rr   rs   rt   ru   	__class__r   r"   r   $  s   
%zPyPDFParser.__init__r1   r   r   Iterator[Document]c              	   #  sN   zddl  W n ty   tdw d fdd	}| ~} j|jd
}tddddtt|jp4i B |j	t
|jdB }g }t|jD ]2\}}||d}	|}
t|
g|	 }jdkrut|t|||j| dB dV  qH|| qHjdkrtj|t|dV  W d   dS W d   dS 1 sw   Y  dS )m  
        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   NzE`pypdf` package not found, please install it with `pip install pypdf`r>   pypdf.PageObjectr   r   c                   s,    j dr
|  S | jddjijS )z
            Extract text from image given the version of pypdf.

            Args:
                page: The page object to extract text from.

            Returns:
                str: The extracted text.
            3rt   Nr   )__version__rM   extract_textrt   ru   r>   pypdfr   r   r"   _extract_text_from_pagem  s   
z7PyPDFParser.lazy_parse.<locals>._extract_text_from_pagerv   ZPyPDFr   )r,   r0   r.   )r/   r-   r   )r>   Z
page_labelpage_contentr<   r   )r>   r   r   r   )r   r&   as_bytes_ioZ	PdfReaderrv   rT   r   dictr<   r/   lenpages	enumerateextract_images_from_pagerm   rQ   rp   r   rF   Zpage_labelsappendrq   r'   )r   r1   r   pdf_file_obj
pdf_readerdoc_metadataZsingle_textspage_numberr>   rX   images_from_pagerh   r   r   r"   
lazy_parseW  sb   








"zPyPDFParser.lazy_parsepypdf._page.PageObjectc              	   C  sn  | j sdS ddlm} dtt|d  vrdS |d d  }g }|D ]}d}|| d dkr|| d	 d
d tv r[|| d || d }}tj	|| 
 tjd||d}n!|| d	 d
d tv rwt|t|| 
 }ntd |durt }	||j|	dd tj|	 dd}
t| j |
j}|t|
|| j q$tj t!"t#d|dS )Extract images from a PDF page and get the text using images_to_text.

        Args:
            page: The page object from which to extract images.

        Returns:
            str: The extracted text from the images on the page.
        r   r   Imagez/XObjectz
/ResourcesNz/Subtypez/Imagez/Filterr   z/Heightz/WidthZdtyper\   Unknown PDF Filter!ZPNG)r3   z	image/pngZ	mime_type
image_text)$rr   PILr   r   r   rA   Z
get_object_PDF_FILTER_WITHOUT_LOSSnp
frombufferget_datauint8reshape_PDF_FILTER_WITH_LOSSarrayopenioBytesIOloggerwarningZ	fromarraysaver   	from_datagetvaluenextr   r   r   r;   rs   _FORMAT_IMAGE_STRr3   _JOIN_IMAGESr'   re   )r   r>   r   ZxObjectr   objnp_imageheightwidthimage_bytesr1   r   r   r   r"   r     s@   	
 
z$PyPDFParser.extract_images_from_pageNF)rv   rw   rx   rZ   rp   ry   rq   r   rr   rz   rs   r{   rt   r|   ru   r}   r1   r   r   r   )r>   r   r   r   )	__name__
__module____qualname____doc___DEFAULT_PAGES_DELIMITERr   r   r   __classcell__r   r   r   r"   rn      s    2
3Mrn   c                      sp   e Zd ZdZdZ	d-ddeddddd. fddZed/ddZed0ddZ		 	!d1d2d&d'Z
d3d+d,Z  ZS )4PDFMinerParsera  Parse a blob from a PDF using `pdfminer.six` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'pdfminer.six' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pdfminer.six pillow

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PDFMinerParser

            parser = PDFMinerParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    FNr   r!   )rv   rp   rq   rr   rs   concatenate_pagesrx   rZ   rv   r[   rp   ry   rq   r   rr   rz   rs   r{   r   Optional[bool]c                  s   t    |dvrtd|r|st }|| _|| _|| _|| _|| _|| _	|dur>t
js5dt
_td |r9dnd| _dS dS )aH  Initialize a parser based on PDFMiner.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: Extraction mode to use. Either "single" or "page" for page-wise
                extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from PDF.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            concatenate_pages: Deprecated. If True, concatenate all PDF pages
                into one a single document. Otherwise, return one document per page.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the `mode` is not "single" or "page".

        Warnings:
            `concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'
            instead.
        r~   r   NTzS`concatenate_pages` parameter is deprecated. Use `mode='single' or 'page'` instead.r   r>   )r   r   rB   r   rx   rr   rs   rv   rp   rq   r   _warn_concatenate_pagesr   r   )r   rx   rv   rp   rq   rr   rs   r   r   r   r"   r     s&   
(zPDFMinerParser.__init__sUnion[bytes, str]r   c                   sx   ddl m  t| tr| drt| dd ddS zdd	 | D }d
 fdd	|D W S  ty;   t|  Y S w )z
        Decodes a PDFDocEncoding string to Unicode.
        Adds py3 compatibility to pdfminer's version.

        Args:
            s: The string to decode.

        Returns:
            str: The decoded Unicode string.
        r   PDFDocEncodings      Nzutf-16beignorec                 s  s&    | ]}t |trt|n|V  qd S r^   )rC   r   ord)r    cr   r   r"   	<genexpr>S  s   $ z-PDFMinerParser.decode_text.<locals>.<genexpr>r   c                 3  s    | ]} | V  qd S r^   r   )r    or   r   r"   r   T  s    )Zpdfminer.utilsr   rC   bytesrM   r   r'   
IndexError)r   Zordsr   r   r"   decode_textB  s   zPDFMinerParser.decode_textr   r   c                 C  s   ddl m} t| dr|  } t| trtttj| S t| |r't	| j
S t| ttfr3t	| S t| trJ|  D ]\}}t|| |< q<| S | S )z
        Recursively resolve the metadata values.

        Args:
            obj: The object to resolve and decode. It can be of any type.

        Returns:
            The resolved and decoded object.
        r   )	PSLiteralresolve)Zpdfminer.psparserr   hasattrr   rC   listmapr   resolve_and_decoder   namer   r   r   rK   )r   r   rR   rS   r   r   r"   r   X  s   




z!PDFMinerParser.resolve_and_decoder   Tfpr   cachingr=   c                 C  s   ddl m}m}m} ||}||||d}i }	|jD ]}
|	|
 q|	 D ](\}}z	t||	|< W q& t	yN } zt
d|t| W Y d}~q&d}~ww tt|||	d< |	S )ag  
        Extract metadata from a PDF file.

        Args:
            fp: The file pointer to the PDF file.
            password: The password for the PDF file, if encrypted. Defaults to an empty
                string.
            caching: Whether to cache the PDF structure. Defaults to True.

        Returns:
            Metadata of the PDF file.
        r   )PDFDocumentPDFPage	PDFParser)rv   r   zD[WARNING] Metadata key "%s" could not be parsed due to exception: %sNr-   )pdfminer.pdfpager   r   r   infoupdaterK   r   r   	Exceptionr   r   r   r   r   Zcreate_pages)r   r   rv   r   r   r   r   parserdocr<   r   rR   rS   er   r   r"   _get_metadatat  s&   
zPDFMinerParser._get_metadatar1   r   r   c              
   #  s,   z7ddl }ddlm} ddlm}m mm}m}m	m
 ddlm}m} ddlm}	 t|jdk r7tdW n tyC   td	w | }
t |	j|
jpTd
d}| }tj|
jpbd
d}|j|d< G  fddd|}t ||||| d}g }t|D ]H\}}d d ||   }|! }j"dkrǈd d t#|t$|d|iB dV  q|%dr|dd }|&| qj"dkrj'(|}t#|t$|dV  W d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS )a  
        Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.

        Raises:
            ImportError: If the `pdfminer.six` or `pillow` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   N)PDFLayoutAnalyzer)LAParamsLTContainerLTImageLTItemLTPageLTText	LTTextBox)PDFPageInterpreterPDFResourceManager)r   i:>4zThis parser is tested with pdfminer.six version 20201018 or later. Remove pdfminer, and install pdfminer.six with `pip uninstall pdfminer && pip install pdfminer.six`.zMpdfminer package not found, please install it with `pip install pdfminer.six`r   r   r/   c                      s>   e Zd Z		dd fddZdfddZ  ZS )z*PDFMinerParser.lazy_parse.<locals>.Visitorr   Nrsrcmgrr   pagenorE   laparamsOptional[LAParams]r   Nonec                   s   t  j|||d d S )N)r   r   )r   r   )r   r   r   r   r   r   r"   r     s   z3PDFMinerParser.lazy_parse.<locals>.Visitor.__init__ltpager   c              	     s(   d fdd  | d S )Nitemr   r   r   c                   s   t |  r| D ]}| qnt | r|   t | r'd d S t | rcjraddlm} |}|| }tt	| }d|j
d< tj|j}t||j d S d S 	 d S )Nr%   r   )ImageWriterr4   r/   )rC   writeget_textrr   Zpdfminer.imager   Zexport_imager   	from_pathr   r<   r   r   r   r;   rs   )r   childr   Zimage_writerfilenamer1   r   )r   r   r   r   renderr   tempdirtext_ior   r"   r    s6   







zIPDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout.<locals>.render)r   r   r   r   r   )mer   r   r   r   r   r   r  r  )r  r"   receive_layout  s   z9PDFMinerParser.lazy_parse.<locals>.Visitor.receive_layout)r   N)r   r   r   rE   r   r   r   r   )r   r   r   r   )r   r   r   r   r  r   r   r
  r   r"   Visitor  s
    "r  )r   r>   r   r\   r   ))pdfminerZpdfminer.converterr   Zpdfminer.layoutr   r   r   r   r   r   r   Zpdfminer.pdfinterpr   r   r   r   rE   r   r&   r   r   Z	get_pagesrv   rT   r   r/   r   StringIOr   truncateseekZprocess_pager   rQ   rp   r   rF   endswithr   rq   r'   )r   r1   r  r   r   r   r   r   r   r   r   r   r   r   r  Zvisitor_for_allZall_contentir>   rh   Zdocument_contentr   r
  r"   r     sl   $	
 (








RzPDFMinerParser.lazy_parseF)rx   rZ   rv   r[   rp   ry   rq   r   rr   rz   rs   r{   r   r   )r   r   r   r   )r   r   r   r   )r   T)r   r   rv   r   r   rZ   r   r=   r   )r   r   r   r   r   r   r   staticmethodr   r   r   r   r   r   r   r   r"   r     s(    2<.r   c                	      s   e Zd ZdZe Z		d0ddedddddd1 fddZd2dd Z		d3d4d!d"Z
d5d'd(Zd6d*d+Zd7d,d-Zd8d.d/Z  ZS )9PyMuPDFParsera  Parse a blob from a PDF using `PyMuPDF` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyMuPDF' library for PDF processing and offers synchronous blob
    parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pymupdf

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyMuPDFParser

            parser = PyMuPDFParser(
                # password = None,
                mode = "single",
                pages_delimiter = "
",
                # images_parser = TesseractBlobParser(),
                # extract_tables="markdown",
                # extract_tables_settings=None,
                # text_kwargs=None,
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    NFr>   r!   )rv   rp   rq   rr   rs   extract_tablesextract_tables_settingstext_kwargsr}   rx   rZ   rv   r[   rp   ry   rq   r   rr   rz   rs   r{   r  /Union[Literal['csv', 'markdown', 'html'], None]r  r   r   c          
        sz   t    |dvrtd|r|dvrtd|| _|| _|| _|p#i | _|r,|s,t }|| _|| _	|| _
|| _|	| _dS )a  Initialize a parser based on PyMuPDF.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extract_tables: Whether to extract tables in a specific format, such as
                "csv", "markdown", or "html".
            extract_tables_settings: Optional dictionary of settings for customizing
                table extraction.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
            ValueError: If the extract_tables format is not "markdown", "html",
            or "csv".
        r~   r   )markdownr9   csvzmode must be markdownN)r   r   rB   rp   rq   rv   r  r   rx   rs   rr   r  r  )
r   r  rx   rv   rp   rq   rr   rs   r  r  r   r   r"   r   V  s    
+

zPyMuPDFParser.__init__r1   r   r   c                 C  s
   |  |S r^   )_lazy_parse)r   r1   r   r   r"   r     s   zPyMuPDFParser.lazy_parsec              	   c  s   zXddl }|p
| j}| jsXddlm}m}m}m} i ddddddddd	dd
|ddddd|ddddddd|d|dddddddddddd| _W n tyd   tdw t	j
 | d}|jdu ry||}	n|j|dd}	|	jr|	| j | |	|}
g }|	D ]$}| |	|| }| jdkrt|t|
d|jiB dV  q|| q| jdkrt| j|t|
dV  W d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS )a  Lazily parse the blob.
        Insert image, if possible, between two paragraphs.
        In this way, a paragraph can be continued on the next page.

        Args:
            blob: The blob to parse.
            text_kwargs: Optional keyword arguments to pass to the `get_text` method.
                If provided at run time, it will override the default text_kwargs.

        Raises:
            ImportError: If the `pypdf` package is not found.

        Yield:
            An iterator over the parsed documents.
        r   N)DEFAULT_JOIN_TOLERANCEDEFAULT_MIN_WORDS_HORIZONTALDEFAULT_MIN_WORDS_VERTICALDEFAULT_SNAP_TOLERANCEZclipZvertical_strategylinesZhorizontal_strategyZvertical_linesZhorizontal_linesZsnap_toleranceZsnap_x_toleranceZsnap_y_toleranceZjoin_toleranceZjoin_x_toleranceZjoin_y_toleranceZedge_min_length   Zmin_words_verticalZmin_words_horizontalZintersection_toleranceZintersection_x_toleranceZintersection_y_tolerance)Ztext_toleranceZtext_x_toleranceZtext_y_toleranceZstrategyZ	add_lineszGpymupdf package not found, please install it with `pip install pymupdf`Zpdf)streamZfiletyper>   r   r   )pymupdfr  r  Zpymupdf.tabler  r  r   r!  r&   r  _lockr   datar   is_encryptedZauthenticaterv   _extract_metadata_get_page_contentrQ   rp   r   rF   numberr   rq   r'   )r   r1   r  r%  r  r  r   r!  rG   r   r   full_contentr>   rh   r   r   r"   r    s   
	







"zPyMuPDFParser._lazy_parser   pymupdf.Documentpymupdf.Pager=   c           	      C  s^   |j di i | j|}| ||}| |}g }|r!|| |r(|| t||}|S )a:  Get the text of the page using PyMuPDF and RapidOCR and issue a warning
        if it is empty.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.
            blob: The blob being parsed.

        Returns:
            str: The text content of the page.
        Nr   )r  r  _extract_images_from_page_extract_tables_from_pager   rm   )	r   r   r>   r  rX   r   Ztables_from_pagerV   rh   r   r   r"   r*    s   



zPyMuPDFParser._get_page_contentr   c              	     s\   t i ddd|j|jt d fdd jD }dD ]}| jv r+ j| ||< q|S )zExtract metadata from the document and page.

        Args:
            doc: The PyMuPDF document object.
            blob: The blob being parsed.

        Returns:
            dict: The extracted metadata.
        ZPyMuPDFr   )r,   r0   r.   r/   rG   r-   c                   s,   i | ]}t  j| ttfr| j| qS r   )rC   r<   r   rE   r    rR   r   r   r"   
<dictcomp>&  s    
z3PyMuPDFParser._extract_metadata.<locals>.<dictcomp>)ZmodDateZcreationDate)rT   r/   r   r<   )r   r   r1   r<   rR   r   r2  r"   r)    s(   

	
zPyMuPDFParser._extract_metadatac                 C  s   | j sdS ddl}| }g }|D ]E}| j rV|d }|||}tj|jtjd|j	|j
d}	t }
t|
|	 tj|
 dd}t| j |j}|t||| j qtjttd|dS )	a	  Extract images from a PDF page and get the text using images_to_text.

        Args:
            doc: The PyMuPDF document object.
            page: The PyMuPDF page object.

        Returns:
            str: The extracted text from the images on the page.
        r   r   Nr   r\   application/x-npyr   r   )rr   r%  Z
get_imagesZPixmapr   r   Zsamplesr   r   r   r   r   r   numpyr   r   r   r   r   r   r   r   r;   rs   r   r3   r   r'   re   )r   r   r>   r%  Zimg_listr   r(   ZxrefZpiximager   r1   r   r   r   r"   r/  2  s2   
z'PyMuPDFParser._extract_images_from_pagec                 C  s   | j du rdS ddl}t|jj|fi | j}|rP| j dkr)tdd |D S | j dkr8tdd |D S | j d	krGtd
d |D S td| j  ddS )zExtract tables from a PDF page.

        Args:
            page: The PyMuPDF page object.

        Returns:
            str: The extracted tables in the specified format.
        Nr   r   r  c                 S  s   g | ]}|  qS r   )Zto_markdownr    tabler   r   r"   r#   k  r$   z;PyMuPDFParser._extract_tables_from_page.<locals>.<listcomp>r9   c                 S  s    g | ]}|  jd d d dqS )F)headerindexZ	bold_rows)	to_pandasZto_htmlr7  r   r   r"   r#   n  s    r  c                 S  s   g | ]}|  jd d dqS )F)r9  r:  )r;  Zto_csvr7  r   r   r"   r#   y  s    zextract_tables z not implemented)	r  r%  r   r8  Zfind_tablesr  _JOIN_TABLESr'   rB   )r   r>   r%  Ztables_listr   r   r"   r0  Y  s2   
	




z'PyMuPDFParser._extract_tables_from_pager   )r  r}   rx   rZ   rv   r[   rp   ry   rq   r   rr   rz   rs   r{   r  r  r  r}   r   r   r   r^   )r1   r   r  r}   r   r   )r   r-  r>   r.  r  r=   r   r   )r   r-  r1   r   r   r   )r   r-  r>   r.  r   r   )r>   r.  r   r   )r   r   r   r   	threadingLockr&  r   r   r   r  r*  r)  r/  r0  r   r   r   r   r"   r    s*    6
=

]

 'r  c                      sP   e Zd ZdZe Z	dddedddd  fddZd!ddZ	d"ddZ
  ZS )#PyPDFium2Parserao  Parse a blob from a PDF using `PyPDFium2` library.

    This class provides methods to parse a blob from a PDF document, supporting various
    configurations such as handling password-protected PDFs, extracting images, and
    defining extraction mode.
    It integrates the 'PyPDFium2' library for PDF processing and offers synchronous
    blob parsing.

    Examples:
        Setup:

        .. code-block:: bash

            pip install -U langchain-community pypdfium2

        Load a blob from a PDF file:

        .. code-block:: python

            from langchain_core.documents.base import Blob

            blob = Blob.from_path("./example_data/layout-parser-paper.pdf")

        Instantiate the parser:

        .. code-block:: python

            from langchain_community.document_loaders.parsers import PyPDFium2Parser

            parser = PyPDFium2Parser(
                # password=None,
                mode="page",
                pages_delimiter="
",
                # extract_images = True,
                # images_to_text = convert_images_to_text_with_tesseract(),
            )

        Lazily parse the blob:

        .. code-block:: python

            docs = []
            docs_lazy = parser.lazy_parse(blob)

            for doc in docs_lazy:
                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)
    FNr>   r!   )rv   rp   rq   rr   rs   rx   rZ   rv   r[   rp   ry   rq   r   rr   rz   rs   r{   r   r   c                  sP   t    |dvrtd|| _|r|st }|| _|| _|| _|| _|| _	dS )uk  Initialize a parser based on PyPDFium2.

        Args:
            password: Optional password for opening encrypted PDFs.
            mode: The extraction mode, either "single" for the entire document or "page"
                for page-wise extraction.
            pages_delimiter: A string delimiter to separate pages in single-mode
                extraction.
            extract_images: Whether to extract images from the PDF.
            images_parser: Optional image blob parser.
            images_inner_format: The format for the parsed output.
                - "text" = return the content as is
                - "markdown-img" = wrap the content into an image markdown link, w/ link
                pointing to (`![body)(#)`]
                - "html-img" = wrap the content as the `alt` text of an tag and link to
                (`<img alt="{body}" src="#"/>`)
            extraction_mode: “plain” for legacy functionality, “layout” for experimental
                layout mode functionality
            extraction_kwargs: Optional additional parameters for the extraction
                process.

        Returns:
            This method does not directly return data. Use the `parse` or `lazy_parse`
            methods to retrieve parsed documents with content and metadata.

        Raises:
            ValueError: If the mode is not "single" or "page".
        r~   r   N)
r   r   rB   rx   r   rr   rs   rv   rp   rq   )r   rx   rv   rp   rq   rr   rs   r   r   r"   r     s   
&
zPyPDFium2Parser.__init__r1   r   r   c              
   c  s   zddl }W n ty   tdw tj | }d}z|j|| jdd}g }t| }|j	|d< t
||d< t|D ]I\}}| }	d|	  }
|	  | |}t|g|
 }|  | jd	kr|dss|d7 }t|ti |d	|id
V  q?|| q?| jdkrt| j|t|d
V  W |r|  n|r|  w w W d   n1 sw   Y  W d   dS W d   dS 1 sw   Y  dS )r   r   NzKpypdfium2 package not found, please install it with `pip install pypdfium2`T)rv   Z	autocloser/   r-   r%   r>   r   r   )	pypdfium2r&   r?  r&  r   ZPdfDocumentrv   rT   Zget_metadata_dictr/   r   r   Zget_textpager'   Zget_text_range
splitlinescloser/  rm   rQ   rp   r  r   rF   r   rq   )r   r1   r@  rG   r   r,  r   r   r>   Z	text_pagerX   Zimage_from_pagerh   r   r   r"   r     s~   










"zPyPDFium2Parser.lazy_parsepypdfium2._helpers.page.PdfPagec           
      C  s   | j sdS ddlm} t|j|jfd}|sdS g }|D ]<}t }| 	 }|j
dk r/qt|| 	  tj| dd}t| j |j}	|t||	| j |  qtjt|dS )	r   r   r   N)re   r#  r4  r   r   )rr   Zpypdfium2.rawrawr   Zget_objectsZFPDF_PAGEOBJ_IMAGEr   r   Z
get_bitmapZto_numpysizer5  r   r   r   r   r   r   r   r   r;   rs   rB  r   r3   r   r'   )
r   r>   Zpdfium_cr   Z
str_imagesr6  r   r   r1   Ztext_from_imager   r   r"   r/  <  s(   	

z)PyPDFium2Parser._extract_images_from_pager  )rx   rZ   rv   r[   rp   ry   rq   r   rr   rz   rs   r{   r   r   r   )r>   rC  r   r   )r   r   r   r   r=  r>  r&  r   r   r   r/  r   r   r   r   r"   r?    s    4
2Kr?  c                   @  s@   e Zd ZdZ			ddddZdddZdddZdddZdS )PDFPlumberParserzParse `PDF` with `PDFPlumber`.NFr  Optional[Mapping[str, Any]]deduperZ   rx   r   r   c                 C  s>   zddl }W n ty   tdw |pi | _|| _|| _dS )zInitialize the parser.

        Args:
            text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
            dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
        r   NzEpillow package not found, please install it with `pip install pillow`)r   r&   r  rH  rx   )r   r  rH  rx   r   r   r   r"   r   `  s   

zPDFPlumberParser.__init__r1   r   r   c                 #  s`    ddl }  }|| fddjD E dH  W d   dS 1 s)w   Y  dS )Lazily parse the blob.r   Nc              
     sb   g | ]-}t |d  | t j j|jd tjdfi fddjD dqS )r%   r   )r/   rG   r>   r-   c                   s.   i | ]}t  j| ttfv r| j| qS r   )rL   r<   r   rE   r1  r2  r   r"   r3    s
    
z:PDFPlumberParser.lazy_parse.<locals>.<listcomp>.<dictcomp>r   )	r   _process_page_contentr/  r   r/   r   r   r   r<   )r    r>   r1   r   r   r   r"   r#   }  s*    
z/PDFPlumberParser.lazy_parse.<locals>.<listcomp>)
pdfplumberr   r   r   )r   r1   rL  rG   r   rK  r"   r   v  s   

"zPDFPlumberParser.lazy_parser>   pdfplumber.page.Pager   c                 C  s.   | j r| jdi | jS |jdi | jS )z)Process the page content based on dedupe.Nr   )rH  Zdedupe_charsr   r  )r   r>   r   r   r"   rJ    s   z&PDFPlumberParser._process_page_contentc                 C  s   ddl m} | jsdS g }|jD ]g}|d d jtv r_|d d dkrB|t|	d|d d	 |d d
 f|d 
 d q|tj|d 
 tjd|d d
 |d d	 d q|d d jtv rr||d 
  qtd qt|S )z8Extract images from page and get the text with RapidOCR.r   r   r   r$  FilterZBitsPerComponentr   1ZWidthZHeightLr   r\   r   )r   r   rx   r   r   r   r   r   r   	frombytesr   convertr   r   r   r   warningswarnr+   )r   r>   r   r   r(   r   r   r"   r/    s4   


z*PDFPlumberParser._extract_images_from_page)NFF)r  rG  rH  rZ   rx   rZ   r   r   r   )r>   rM  r   r   )r   r   r   r   r   r   rJ  r/  r   r   r   r"   rF  ]  s    

rF  c                   @  s0   e Zd ZdZ		ddddddZdddZdS )AmazonTextractPDFParsera{  Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.

    The AmazonTextractPDFLoader calls the
    [Amazon Textract Service](https://aws.amazon.com/textract/)
    to convert PDFs into a Document structure.
    Single and multi-page documents are supported with up to 3000 pages
    and 512 MB of size.

    For the call to be successful an AWS account is required,
    similar to the
    [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html)
    requirements.

    Besides the AWS configuration, it is very similar to the other PDF
    loaders, while also supporting JPEG, PNG and TIFF and non-native
    PDF formats.

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    loader=AmazonTextractPDFLoader("example_data/alejandro_rosalez_sample-small.jpeg")
    documents = loader.load()
    ```

    One feature is the linearization of the output.
    When using the features LAYOUT, FORMS or TABLES together with Textract

    ```python
    from langchain_community.document_loaders import AmazonTextractPDFLoader
    # you can mix and match each of the features
    loader=AmazonTextractPDFLoader(
        "example_data/alejandro_rosalez_sample-small.jpeg",
        textract_features=["TABLES", "LAYOUT"])
    documents = loader.load()
    ```

    it will generate output that formats the text in reading order and
    try to output the information in a tabular structure or
    output the key/value pairs with a colon (key: value).
    This helps most LLMs to achieve better accuracy when
    processing these texts.

    N)linearization_configtextract_featuresOptional[Sequence[int]]clientOptional[Any]rV  !Optional[TextLinearizationConfig]r   r   c                  s   z:ddl  ddlm  m}  | _|| _|dur# fdd|D | _ng | _|dur.|| _n| jjddddd	| _W n t	yE   t	d
w |sazddl
}|d| _W dS  t	y`   t	dw || _dS )a5  Initializes the parser.

        Args:
            textract_features: Features to be used for extraction, each feature
                               should be passed as an int that conforms to the enum
                               `Textract_Features`, see `amazon-textract-caller` pkg
            client: boto3 textract client
            linearization_config: Config to be used for linearization of the output
                                  should be an instance of TextLinearizationConfig from
                                  the `textractor` pkg
        r   Nc                   s   g | ]}  |qS r   )ZTextract_Features)r    ftcr   r"   r#     s    
z4AmazonTextractPDFParser.__init__.<locals>.<listcomp>Tz# z## *)Zhide_figure_layoutZtitle_prefixZsection_header_prefixZlist_element_prefixzCould not import amazon-textract-caller or amazon-textract-textractor python package. Please install it with `pip install amazon-textract-caller` & `pip install amazon-textract-textractor`.ZtextractzRCould not import boto3 python package. Please install it with `pip install boto3`.)ZtextractcallerZtextractor.entities.documententitiesdocumentr^  
textractorrW  rV  r   r&   boto3rY  boto3_textract_client)r   rW  rY  rV  rb  rc  r   r]  r"   r     sD   


z AmazonTextractPDFParser.__init__r1   r   r   c                 c  s    |j rtt|j nd}|r&|jdkr&|jr&| jjt|j | j| jd}n| jj|	 | j| jj
j| jd}| jj|}t|jD ]\}}t|j| jd|j|d ddV  qCdS )	zIterates over the Blob pages and returns an Iterator with a Document
        for each page, like the other parsers If multi-page document, blob.path
        has to be set to the S3 URI and for single page docs
        the blob.data is taken
        NZs3)input_documentfeaturesrd  )re  rf  Z	call_moderd  )configr   r/   r>   r   )pathr   r   schemenetlocr^  Zcall_textractrW  rd  as_bytesZTextract_Call_ModeZ
FORCE_SYNCrb  r   r   r   r   r  rV  r/   )r   r1   Zurl_parse_resultZtextract_response_jsonra  idxr>   r   r   r"   r   (  s2   

z"AmazonTextractPDFParser.lazy_parse)NN)rW  rX  rY  rZ  rV  r[  r   r   r   )r   r   r   r   r   r   r   r   r   r"   rU    s    /?rU  c                   @  s.   e Zd ZdZdddZdddZdddZdS )DocumentIntelligenceParserzjLoads a PDF with Azure Document Intelligence
    (formerly Form Recognizer) and chunks at character level.rY  r   modelr   c                 C  s   t d || _|| _d S )Na<  langchain_community.document_loaders.parsers.pdf.DocumentIntelligenceParserand langchain_community.document_loaders.pdf.DocumentIntelligenceLoader are deprecated. Please upgrade to langchain_community.document_loaders.DocumentIntelligenceLoader for any file parsing purpose using Azure Document Intelligence service.)rS  rT  rY  ro  )r   rY  ro  r   r   r"   r   P  s
   
z#DocumentIntelligenceParser.__init__r1   r   r)   r   r   c                 c  sD    |j D ]}ddd |jD }t||j|jdd}|V  qd S )N c                 S  s   g | ]}|j qS r   )r2   )r    liner   r   r"   r#   ^  s    z=DocumentIntelligenceParser._generate_docs.<locals>.<listcomp>rh  r   )r   r'   r"  r   r/   r   )r   r1   r)   pr2   dr   r   r"   _generate_docs\  s   
z)DocumentIntelligenceParser._generate_docsc                 c  s^    |   }| j| j|}| }| ||}|E dH  W d   dS 1 s(w   Y  dS )rI  N)r   rY  Zbegin_analyze_documentro  r)   rt  )r   r1   Zfile_objZpollerr)   docsr   r   r"   r   i  s   
"z%DocumentIntelligenceParser.lazy_parseN)rY  r   ro  r   )r1   r   r)   r   r   r   r   )r   r   r   r   r   rt  r   r   r   r   r"   rn  L  s
    

rn  )r   r   r   r   )r1   r   r2   r   r3   r   r   r   )r<   r=   r   r=   )rV   rW   rX   r   r   r   )Cr   
__future__r   r9   r   loggingr=  rS  r   pathlibr   tempfiler   typingr   r   r   r	   r
   r   r   r   r   r   r   urllib.parser   r5  r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z1langchain_community.document_loaders.blob_loadersr   Z3langchain_community.document_loaders.parsers.imagesr   r   rL  r%  r   r@  Z)textractor.data.text_linearization_configr   r   r   r+   	getLoggerr   r   r   r   r<  r   r?   r;   rF   rT   rc   rm   rn   r   r  r?  rF  rU  rn  r   r   r   r"   <module>   sl    4




'
5 _  N  m V^ 