o
    Zh%J                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ erVdd
lmZ ddlmZ ddlmZ eeZeG dd dZ G dd deZ!dS )zModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
    N)	dataclass)TYPE_CHECKINGAnyIteratorListOptionalSequence)BaseBlobParser)Blob)Document)batch_iterate)get_client_info)	OperationDocumentProcessorServiceClient)ProcessOptionsc                   @   s"   e Zd ZU dZeed< eed< dS )DocAIParsingResultsz1A dataclass to store Document AI parsing results.source_pathparsed_pathN)__name__
__module____qualname____doc__str__annotations__ r   r   W/var/www/html/lang_env/lib/python3.10/site-packages/langchain_google_community/docai.pyr      s   
 r   c                   @   s  e Zd ZdZddddddded dee dee dee d	ee f
d
dZdedee	 fddZ
				d5dee deee  dee dee ddf
ddZ	d6dedee dedee	 fddZ			d7dee dee d ed!ededee	 fd"d#Zd$ee dee	 fd%d&Zd'ee ded( fd)d*Zd+ed( defd,d-Zddd.dd/dee dee d	ee d0edee deded( fd1d2Zd+ed( dee fd3d4ZdS )8DocAIParserz`Google Cloud Document AI` parser.

    For a detailed explanation of Document AI, refer to the product documentation.
    https://cloud.google.com/document-ai/docs/overview
    N)client
project_idlocationgcs_output_pathprocessor_namer   r   r   r    r!   r"   c             
   C   s   t |t |krtdd}|rt||std| d|| _|| _|r+|| _d	S zddlm} ddl	m
} W n tyJ }	 ztd|	d	}	~	ww ||| d
d}
||
tddd| _| jj|dj| _| jdkrqd| _d	S d| _d	S )a  Initializes the parser.

        Args:
            client: a DocumentProcessorServiceClient to use
            location: a Google Cloud location where a Document AI processor is located
            gcs_output_path: a path on Google Cloud Storage to store parsing results
            processor_name: full resource name of a Document AI processor or processor
                version

        You should provide either a client or location (and then a client
            would be instantiated).
        zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name z has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ClientOptionsr   Could not import google-cloud-documentai python package. Please, install docai dependency group: `pip install langchain-google-community[docai]`Nz-documentai.googleapis.com)Zquota_project_idZapi_endpointzdocument-ai)module)Zclient_optionsZclient_infonameZLAYOUT_PARSER_PROCESSORTF)bool
ValueErrorre	fullmatch_gcs_output_path_processor_name_clientZgoogle.api_core.client_optionsr#   google.cloud.documentair   ImportErrorr   Zget_processortypeZ_processor_type_use_layout_parser)selfr   r   r    r!   r"   patternr#   r   excoptionsr   r   r   __init__.   sH   




zDocAIParser.__init__blobreturnc                 c   s    | j |g| jdE dH  dS )zParses a blob lazily.

        Args:
            blobs: a Blob to parse

        This is a long-running operation. A recommended way is to batch
            documents together and use the `batch_parse()` method.
        )r!   N)batch_parser,   )r3   r8   r   r   r   
lazy_parseq   s   	zDocAIParser.lazy_parseT  enable_native_pdf_parsing
page_range
chunk_sizeinclude_ancestor_headingsr   c              
   C   s   z
ddl m}m} W n ty } ztd|d}~ww | jr=|j|jj||dd}|r3|j|dnd}	|||	d}
|
S |rD||d	nd}|rN|j|dnd}	|||	d
}
|
S )aM  Prepare process options for DocAI process request

        Args:
            enable_native_pdf_parsing: enable pdf embedded text extraction
            page_range: list of page numbers to parse. If `None`,
                entire document will be parsed.
            chunk_size: maximum number of characters per chunk (supported
                only with Document AI Layout Parser processor).
            include_ancestor_headings: whether or not to include ancestor
                headings when splitting (supported only
                with Document AI Layout Parser processor).
        r   )	OcrConfigr   ddocumentai package not found, please install it with `pip install langchain-google-community[docai]`N)r?   r@   )Zchunking_config)pages)layout_configindividual_page_selector)r=   )
ocr_configrE   ) google.cloud.documentai_v1.typesrA   r   r0   r2   ZLayoutConfigZChunkingConfigZIndividualPageSelector)r3   r=   r>   r?   r@   rA   r   r5   rD   rE   process_optionsrF   r   r   r   _prepare_process_options|   sL   z$DocAIParser._prepare_process_options
field_maskprocess_options_kwargsc              
   +   s    zddl m} W n ty } ztd|d}~ww zddlm  W n ty4 } ztd|d}~ww | jdi |}| j|j| j	|j
jjpLdd|d	|d
| jrifddjjjD E dH  dS  fddjjD E dH  dS )a  Parses a blob lazily using online processing.

        Args:
            blob: a blob to parse.
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"
            process_options_kwargs: optional parameters to pass to the Document
                AI processors
        r   
documentair$   N_text_from_layoutldocumentai_toolbox package not found, please install it with `pip install langchain-google-community[docai]`application/pdfgcs_uriZ	mime_typeT)r'   Zgcs_documentrH   skip_human_reviewrJ   c                 3   s(    | ]}t |j|j jd dV  qdS )chunk_idsourceZpage_contentmetadataN)r   contentrV   path).0chunk)r8   r   r   	<genexpr>   s    
z-DocAIParser.online_process.<locals>.<genexpr>c                 3   s2    | ]}t  |jjj|jjd dV  qdS )pagerW   rX   N)r   layoutdocumenttextpage_numberr[   )r\   r`   rO   r8   responser   r   r^      s    
r   )google.cloudrM   r0   -google.cloud.documentai_toolbox.wrappers.pagerO   rI   r.   Zprocess_documentZProcessRequestr-   GcsDocumentr[   mimetyper2   rb   chunked_documentchunksrC   )r3   r8   rJ   rK   rM   r5   rH   r   re   r   online_process   sR   
zDocAIParser.online_process  <   blobstimeout_seccheck_in_interval_secc                 k   s    |p| j }|std| j|fd|i|}dd |D }td| d}	| |rJt| |	|7 }	|	|kr@td| dtd	 | |s+| j	|d
}
| 
|
E dH  dS )a  Parses a list of blobs lazily.

        Args:
            blobs: a list of blobs to parse.
            gcs_output_path: a path on Google Cloud Storage to store parsing results.
            timeout_sec: a timeout to wait for Document AI to complete, in seconds.
            check_in_interval_sec: an interval to wait until next check
                whether parsing operations have been completed, in seconds.
            process_options_kwargs: optional parameters to pass to the Document
                AI processors

        This is a long-running operation. A recommended way is to decouple
            parsing from creating LangChain Documents:
            >>> operations = parser.docai_parse(blobs, gcs_path)
            >>> parser.is_running(operations)
            You can get operations names and save them:
            >>> names = [op.operation.name for op in operations]
            And when all operations are finished, you can use their results:
            >>> operations = parser.operations_from_names(operation_names)
            >>> results = parser.get_results(operations)
            >>> docs = parser.parse_from_results(results)
        :An output path on Google Cloud Storage should be provided.r!   c                 S   s   g | ]}|j jqS r   )Z	operationr'   r\   opr   r   r   
<listcomp>&  s    z+DocAIParser.batch_parse.<locals>.<listcomp>z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!.)
operationsN)r,   r)   docai_parseloggerdebug
is_runningtimesleepTimeoutErrorget_resultsparse_from_results)r3   rp   r!   rq   rr   rK   output_pathrx   operation_namesZtime_elapsedresultsr   r   r   r:      s:   





	zDocAIParser.batch_parser   c              
   #   s    zddl m} ddlm} ddlm  W n ty& } ztd|d }~ww |D ]-|j\}}|||d }| jrIfdd|D E d H  q) fd	d|D E d H  q)d S )
Nr   )split_gcs_uri)_get_shardsrN   rP   /c                 3   s6    | ]}|j jD ]}t|j|j jd dV  qqdS rU   )rk   rl   r   rZ   rV   r   )r\   shardr]   )resultr   r   r^   K  s    	z1DocAIParser.parse_from_results.<locals>.<genexpr>c                 3   s<    | ]}|j D ]}t |j|j|jjd dV  qqdS r_   )rC   r   ra   rc   rd   r   )r\   r   r`   rO   r   r   r   r^   W  s    	)	Z7google.cloud.documentai_toolbox.utilities.gcs_utilitiesr   Z1google.cloud.documentai_toolbox.wrappers.documentr   rh   rO   r0   r   r2   )r3   r   r   r   r5   Zgcs_bucket_nameZ
gcs_prefixZshardsr   r   r   r   7  s0   
zDocAIParser.parse_from_resultsr   r   c              
      sH   zddl m  W n ty } ztd|d}~ww  fdd|D S )z5Initializes Long-Running Operations from their names.r   )GetOperationRequestzplong running operations package not found, please install it with`pip install langchain-google-community[docai]`Nc                    s    g | ]}j j |d dqS )r&   )request)r.   Zget_operation)r\   r'   r   r3   r   r   rv   o  s    z5DocAIParser.operations_from_names.<locals>.<listcomp>)Z!google.longrunning.operations_pb2r   r0   )r3   r   r5   r   r   r   operations_from_namesc  s   z!DocAIParser.operations_from_namesrx   c                 C   s   t dd |D S )Nc                 s   s    | ]}|   V  qd S N)donert   r   r   r   r^   u  s    z)DocAIParser.is_running.<locals>.<genexpr>)any)r3   rx   r   r   r   r|   t  s   zDocAIParser.is_runningi  )r!   r"   
batch_sizerJ   r   c                   s   zddl m  W n ty } ztd|d}~ww |p| j}|du r'td|p+| j}|du r4tdg }	t||dD ]8}
 j j fdd	|
D d
d} j	 j	j
||dd}| jdi |}|	| j j||||dd q<|	S )a[  Runs Google Document AI PDF Batch Processing on a list of blobs.

        Args:
            blobs: a list of blobs to be parsed
            gcs_output_path: a path (folder) on GCS to store results
            processor_name: name of a Document AI processor.
            batch_size: amount of documents per batch
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"
            process_options_kwargs: optional parameters to pass to the Document
                AI processors

        Document AI has a 1000 file limit per batch, so batches larger than that need
        to be split into multiple requests.
        Batch processing is an async long-running operation
        and results are stored in a output GCS bucket.
        r   rL   rB   Nrs   z0A Document AI processor name should be provided.)sizeiterablec                    s"   g | ]} j |j|jpd dqS )rQ   rR   )ri   r[   rj   )r\   r8   rL   r   r   rv     s    z+DocAIParser.docai_parse.<locals>.<listcomp>)Z	documents)Zgcs_documents)rS   rJ   )Zgcs_output_configT)r'   Zinput_documentsZdocument_output_configrH   rT   r   )rg   rM   r0   r,   r)   r-   r   ZBatchDocumentsInputConfigZGcsDocumentsZDocumentOutputConfigZGcsOutputConfigrI   appendr.   Zbatch_process_documentsZBatchProcessRequest)r3   rp   r!   r"   r   rJ   rK   r5   r   rx   batchZinput_configZoutput_configrH   r   rL   r   ry   w  sZ   


zDocAIParser.docai_parsec              
      sF   zddl m  W n ty } ztd|d }~ww  fdd|D S )Nr   BatchProcessMetadatarB   c                    sF   g | ]}t |j r|jjn |jjjD ]
}t|j|jd qqS ))r   r   )
isinstancerY   Zindividual_process_statusesZdeserializevaluer   Zinput_gcs_sourceZoutput_gcs_destination)r\   ru   statusr   r   r   rv     s    

z+DocAIParser.get_results.<locals>.<listcomp>)Zgoogle.cloud.documentai_v1r   r0   )r3   rx   r5   r   r   r   r     s   
zDocAIParser.get_results)TNr<   Tr   )Nrn   ro   )r   r   r   r   r   r   r7   r
   r   r   r;   r(   r   intrI   r   rm   r   r:   r   r   r   r|   ry   r   r   r   r   r   r   '   s    	
C

>
L
7
,	
Pr   )"r   loggingr*   r}   dataclassesr   typingr   r   r   r   r   r   Zlangchain_core.document_loadersr	   Z,langchain_core.document_loaders.blob_loadersr
   Zlangchain_core.documentsr   Zlangchain_core.utils.iterr   Z!langchain_google_community._utilsr   Zgoogle.api_core.operationr   r/   r   rG   r   	getLoggerr   rz   r   r   r   r   r   r   <module>   s&     
