o
    ®©Zh]<  ã                   @   sæ   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ erTddlmZ ddlmZ e e¡ZeG dd„ dƒƒZeddddG dd„ deƒƒZ dS )zÌModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
é    N)Ú	dataclass)ÚTYPE_CHECKINGÚIteratorÚListÚOptionalÚSequence)Ú
deprecated)ÚDocument)Úbatch_iterate)ÚBaseBlobParser)ÚBlob)Úget_client_info)Ú	Operation©ÚDocumentProcessorServiceClientc                   @   s"   e Zd ZU dZeed< eed< dS )ÚDocAIParsingResultsz/Dataclass to store Document AI parsing results.Úsource_pathÚparsed_pathN)Ú__name__Ú
__module__Ú__qualname__Ú__doc__ÚstrÚ__annotations__© r   r   úi/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/parsers/docai.pyr      s   
 r   z0.0.32z1.0z&langchain_google_community.DocAIParser)ZsinceZremovalZalternative_importc                   @   sp  e Zd ZdZdddddœded dee dee dee fd	d
„Zdedee	 fdd„Z
			d-dededee deee  dee	 f
dd„Z			d.dee dee dededee	 f
dd„Zdee dee	 fdd„Zdee ded  fd!d"„Zd#ed  defd$d%„Zddd&ddd'œdee dee dee d(ededee ded  fd)d*„Zd#ed  dee fd+d,„ZdS )/ÚDocAIParserz²`Google Cloud Document AI` parser.

    For a detailed explanation of Document AI, refer to the product documentation.
    https://cloud.google.com/document-ai/docs/overview
    N)ÚclientÚlocationÚgcs_output_pathÚprocessor_namer   r   r   r   r    c          
   
   C   s¾   t |ƒt |ƒkrtdƒ‚d}|rt ||¡std|› dƒ‚|| _|| _|r+|| _d	S zddlm} ddl	m
} W n tyJ } ztdƒ|‚d	}~ww ||› d
d}	||	tddd| _d	S )aõ  Initializes the parser.

        Args:
            client: a DocumentProcessorServiceClient to use
            location: a Google Cloud location where a Document AI processor is located
            gcs_output_path: a path on Google Cloud Storage to store parsing results
            processor_name: full resource name of a Document AI processor or processor
                version

        You should provide either a client or location (and then a client
            would be instantiated).
        zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name zï has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ÚClientOptionsr   úZdocumentai package not found, please install it with `pip install google-cloud-documentai`Nz-documentai.googleapis.com)Zapi_endpointzdocument-ai)Úmodule)Zclient_optionsZclient_info)ÚboolÚ
ValueErrorÚreÚ	fullmatchÚ_gcs_output_pathÚ_processor_nameÚ_clientZgoogle.api_core.client_optionsr!   Úgoogle.cloud.documentair   ÚImportErrorr   )
Úselfr   r   r   r    Úpatternr!   r   ÚexcÚoptionsr   r   r   Ú__init__2   s>   ÿ
ÿ
ÿý€ÿÿþzDocAIParser.__init__ÚblobÚreturnc                 c   s    | j |g| jdE dH  dS )zÜParses a blob lazily.

        Args:
            blobs: a Blob to parse

        This is a long-running operation. A recommended way is to batch
            documents together and use the `batch_parse()` method.
        ©r   N)Úbatch_parser(   )r-   r2   r   r   r   Ú
lazy_parsel   s   €	zDocAIParser.lazy_parseTÚenable_native_pdf_parsingÚ
field_maskÚ
page_rangec              
   #   sþ    zddl m} ddlm}m}m} W n ty$ }	 ztdƒ|	‚d}	~	ww zddlm‰  W n ty> }	 ztdƒ|	‚d}	~	ww |rF||dnd}
|rO||d	nd}| j	 
|j| j|jˆjˆjp`d
d||
|dd|d¡‰‡ ‡‡fdd„ˆjjD ƒE dH  dS )aÜ  Parses a blob lazily using online processing.

        Args:
            blob: a blob to parse.
            enable_native_pdf_parsing: enable pdf embedded text extraction
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"
            page_range: list of page numbers to parse. If `None`,
                entire document will be parsed.
        r   ©Ú
documentai)ÚIndividualPageSelectorÚ	OcrConfigÚProcessOptionsr"   N©Ú_text_from_layoutújdocumentai_toolbox package not found, please install it with `pip install google-cloud-documentai-toolbox`©r7   )Úpagesúapplication/pdf©Úgcs_uriZ	mime_type)Ú
ocr_configÚindividual_page_selectorT)ÚnameZgcs_documentÚprocess_optionsÚskip_human_reviewr8   c                 3   s2    | ]}t ˆ |jˆjjƒ|jˆjd œdV  qdS ©)ÚpageÚsource)Zpage_contentÚmetadataN)r	   ÚlayoutÚdocumentÚtextÚpage_numberÚpath)Ú.0rM   ©r@   r2   Úresponser   r   Ú	<genexpr>´   s   € ùþþ
ÿz-DocAIParser.online_process.<locals>.<genexpr>)Úgoogle.cloudr;   Ú google.cloud.documentai_v1.typesr<   r=   r>   r,   Ú-google.cloud.documentai_toolbox.wrappers.pager@   r*   Zprocess_documentZProcessRequestr)   ÚGcsDocumentrT   ÚmimetyperQ   rC   )r-   r2   r7   r8   r9   r;   r<   r=   r>   r/   rG   rH   r   rV   r   Úonline_processw   s\   €ÿý€ÿÿý€ÿÿýÿþþõÿøzDocAIParser.online_processé  é<   ÚblobsÚtimeout_secÚcheck_in_interval_secc           
      c   s¬    |p| j }|stdƒ‚| j||d}dd„ |D ƒ}t d|¡ d}|  |¡rFt |¡ ||7 }||kr<td|› dƒ‚t d	¡ |  |¡s'| j	|d
}	|  
|	¡E dH  dS )a  Parses a list of blobs lazily.

        Args:
            blobs: a list of blobs to parse.
            gcs_output_path: a path on Google Cloud Storage to store parsing results.
            timeout_sec: a timeout to wait for Document AI to complete, in seconds.
            check_in_interval_sec: an interval to wait until next check
                whether parsing operations have been completed, in seconds
        This is a long-running operation. A recommended way is to decouple
            parsing from creating LangChain Documents:
            >>> operations = parser.docai_parse(blobs, gcs_path)
            >>> parser.is_running(operations)
            You can get operations names and save them:
            >>> names = [op.operation.name for op in operations]
            And when all operations are finished, you can use their results:
            >>> operations = parser.operations_from_names(operation_names)
            >>> results = parser.get_results(operations)
            >>> docs = parser.parse_from_results(results)
        ú:An output path on Google Cloud Storage should be provided.r4   c                 S   s   g | ]}|j j‘qS r   )Z	operationrI   ©rU   Úopr   r   r   Ú
<listcomp>ß   s    z+DocAIParser.batch_parse.<locals>.<listcomp>z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!Ú.)Ú
operationsN)r(   r%   Údocai_parseÚloggerÚdebugÚ
is_runningÚtimeÚsleepÚTimeoutErrorÚget_resultsÚparse_from_results)
r-   ra   r   rb   rc   Úoutput_pathri   Úoperation_namesZtime_elapsedÚresultsr   r   r   r5   ¿   s.   €
ÿÿ


ÿ

ù	zDocAIParser.batch_parseru   c              
   #   sŽ    zddl m} ddlm} ddlm‰  W n ty& } ztdƒ|‚d }~ww |D ]‰|ˆjƒ\}}|||ƒ}‡ ‡fdd„|D ƒE d H  q)d S )Nr   )Úsplit_gcs_uri)Ú_get_shardsr?   rA   c                 3   s<    | ]}|j D ]}tˆ |j|jƒ|jˆjd œdV  qqdS rL   )rC   r	   rP   rR   rS   r   )rU   ZshardrM   ©r@   Úresultr   r   rX     s   € úûþÿz1DocAIParser.parse_from_results.<locals>.<genexpr>)Z7google.cloud.documentai_toolbox.utilities.gcs_utilitiesrv   Z1google.cloud.documentai_toolbox.wrappers.documentrw   r[   r@   r,   r   )r-   ru   rv   rw   r/   Zgcs_bucket_nameZ
gcs_prefixZshardsr   rx   r   rr   ð   s(   €ÿý€ÿ
ûýzDocAIParser.parse_from_resultsrt   r   c              
      sH   zddl m‰  W n ty } ztdƒ|‚d}~ww ‡ ‡fdd„|D ƒS )z5Initializes Long-Running Operations from their names.r   )ÚGetOperationRequestzhlong running operations package not found, please install it with `pip install gapic-google-longrunning`Nc                    s    g | ]}ˆj jˆ |d d‘qS ))rI   )Úrequest)r*   Zget_operation)rU   rI   ©rz   r-   r   r   rg     s    ÿÿz5DocAIParser.operations_from_names.<locals>.<listcomp>)Z!google.longrunning.operations_pb2rz   r,   )r-   rt   r/   r   r|   r   Úoperations_from_names
  s   ÿý€ÿþz!DocAIParser.operations_from_namesri   c                 C   s   t dd„ |D ƒƒS )Nc                 s   s    | ]}|  ¡  V  qd S )N)Údonere   r   r   r   rX     s   € z)DocAIParser.is_running.<locals>.<genexpr>)Úany)r-   ri   r   r   r   rm     s   zDocAIParser.is_runningiè  )r   r    Ú
batch_sizer7   r8   r€   c                   s  zddl m‰  ddlm}m} W n ty! }	 ztdƒ|	‚d}	~	ww |p&| j}
|
du r/tdƒ‚|p3| j}|du r<tdƒ‚g }t	||dD ]<}ˆ j
ˆ j‡ fd	d
„|D ƒdd}ˆ jˆ jj|
|dd}|rm|||ddnd}| | j ˆ j||||dd¡¡ qD|S )a3  Runs Google Document AI PDF Batch Processing on a list of blobs.

        Args:
            blobs: a list of blobs to be parsed
            gcs_output_path: a path (folder) on GCS to store results
            processor_name: name of a Document AI processor.
            batch_size: amount of documents per batch
            enable_native_pdf_parsing: a config option for the parser
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"

        Document AI has a 1000 file limit per batch, so batches larger than that need
        to be split into multiple requests.
        Batch processing is an async long-running operation
        and results are stored in a output GCS bucket.
        r   r:   )r=   r>   r"   Nrd   z0A Document AI processor name should be provided.)ÚsizeÚiterablec                    s"   g | ]}ˆ j |j|jpd d‘qS )rD   rE   )r\   rT   r]   )rU   r2   r:   r   r   rg   O  s    üþÿz+DocAIParser.docai_parse.<locals>.<listcomp>)Z	documents)Zgcs_documents)rF   r8   )Zgcs_output_configrB   )rG   T)rI   Zinput_documentsZdocument_output_configrJ   rK   )rY   r;   rZ   r=   r>   r,   r(   r%   r)   r
   ZBatchDocumentsInputConfigZGcsDocumentsZDocumentOutputConfigZGcsOutputConfigÚappendr*   Zbatch_process_documentsZBatchProcessRequest)r-   ra   r   r    r€   r7   r8   r=   r>   r/   rs   ri   ÚbatchZinput_configZoutput_configrJ   r   r:   r   rj     sj   ÿý€ÿ
ÿ

ûÿÿÿÿûÿÿù	ûÿÿzDocAIParser.docai_parsec              
      sF   zddl m‰  W n ty } ztdƒ|‚d }~ww ‡ fdd„|D ƒS )Nr   ©ÚBatchProcessMetadatar"   c                    sF   g | ]}t |jˆ ƒr|jjnˆ  |jj¡jD ]
}t|j|jd ‘qqS ))r   r   )Ú
isinstancerO   Zindividual_process_statusesZdeserializeÚvaluer   Zinput_gcs_sourceZoutput_gcs_destination)rU   rf   Ústatusr…   r   r   rg   ~  s    

ÿÿõûþÿz+DocAIParser.get_results.<locals>.<listcomp>)Zgoogle.cloud.documentai_v1r†   r,   )r-   ri   r/   r   r…   r   rq   u  s   ÿý€ÿ
ûzDocAIParser.get_results)TNN)Nr_   r`   )r   r   r   r   r   r   r1   r   r   r	   r6   r$   r   Úintr^   r   r5   r   rr   r}   rm   rj   rq   r   r   r   r   r   &   s    	úýüû
ú:ûþýü
û
úKûþýüû
ú1ÿ
þøþüûúùø	
÷Wr   )!r   Úloggingr&   rn   Údataclassesr   Útypingr   r   r   r   r   Zlangchain_core._api.deprecationr   Zlangchain_core.documentsr	   Zlangchain_core.utils.iterr
   Z)langchain_community.document_loaders.baser   Z1langchain_community.document_loaders.blob_loadersr   Z&langchain_community.utilities.vertexair   Zgoogle.api_core.operationr   r+   r   Ú	getLoggerr   rk   r   r   r   r   r   r   Ú<module>   s0    
ý