o
    Zh]<                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ erTddlmZ ddlmZ eeZeG dd dZeddddG dd deZ dS )zModule contains a PDF parser based on Document AI from Google Cloud.

You need to install two libraries to use this parser:
pip install google-cloud-documentai
pip install google-cloud-documentai-toolbox
    N)	dataclass)TYPE_CHECKINGIteratorListOptionalSequence)
deprecated)Document)batch_iterate)BaseBlobParser)Blob)get_client_info)	OperationDocumentProcessorServiceClientc                   @   s"   e Zd ZU dZeed< eed< dS )DocAIParsingResultsz/Dataclass to store Document AI parsing results.source_pathparsed_pathN)__name__
__module____qualname____doc__str__annotations__ r   r   i/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/parsers/docai.pyr      s   
 r   z0.0.32z1.0z&langchain_google_community.DocAIParser)ZsinceZremovalZalternative_importc                   @   sp  e Zd ZdZdddddded dee dee dee fd	d
Zdedee	 fddZ
			d-dededee deee  dee	 f
ddZ			d.dee dee dededee	 f
ddZdee dee	 fddZdee ded  fd!d"Zd#ed  defd$d%Zddd&ddd'dee dee dee d(ededee ded  fd)d*Zd#ed  dee fd+d,ZdS )/DocAIParserz`Google Cloud Document AI` parser.

    For a detailed explanation of Document AI, refer to the product documentation.
    https://cloud.google.com/document-ai/docs/overview
    N)clientlocationgcs_output_pathprocessor_namer   r   r   r   r    c          
   
   C   s   t |t |krtdd}|rt||std| d|| _|| _|r+|| _d	S zddlm} ddl	m
} W n tyJ } ztd|d	}~ww || d
d}	||	tddd| _d	S )a  Initializes the parser.

        Args:
            client: a DocumentProcessorServiceClient to use
            location: a Google Cloud location where a Document AI processor is located
            gcs_output_path: a path on Google Cloud Storage to store parsing results
            processor_name: full resource name of a Document AI processor or processor
                version

        You should provide either a client or location (and then a client
            would be instantiated).
        zGYou must specify either a client or a location to instantiate a client.z?projects\/[0-9]+\/locations\/[a-z\-0-9]+\/processors\/[a-z0-9]+zProcessor name z has the wrong format. If your prediction endpoint looks like https://us-documentai.googleapis.com/v1/projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID:process, use only projects/PROJECT_ID/locations/us/processors/PROCESSOR_ID part.r   )ClientOptionsr   Zdocumentai package not found, please install it with `pip install google-cloud-documentai`Nz-documentai.googleapis.com)Zapi_endpointzdocument-ai)module)Zclient_optionsZclient_info)bool
ValueErrorre	fullmatch_gcs_output_path_processor_name_clientZgoogle.api_core.client_optionsr!   google.cloud.documentair   ImportErrorr   )
selfr   r   r   r    patternr!   r   excoptionsr   r   r   __init__2   s>   

zDocAIParser.__init__blobreturnc                 c   s    | j |g| jdE dH  dS )zParses a blob lazily.

        Args:
            blobs: a Blob to parse

        This is a long-running operation. A recommended way is to batch
            documents together and use the `batch_parse()` method.
        r   N)batch_parser(   )r-   r2   r   r   r   
lazy_parsel   s   	zDocAIParser.lazy_parseTenable_native_pdf_parsing
field_mask
page_rangec              
   #   s    zddl m} ddlm}m}m} W n ty$ }	 ztd|	d}	~	ww zddlm  W n ty> }	 ztd|	d}	~	ww |rF||dnd}
|rO||d	nd}| j	
|j| j|jjjp`d
d||
|dd|d fddjjD E dH  dS )a  Parses a blob lazily using online processing.

        Args:
            blob: a blob to parse.
            enable_native_pdf_parsing: enable pdf embedded text extraction
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"
            page_range: list of page numbers to parse. If `None`,
                entire document will be parsed.
        r   
documentai)IndividualPageSelector	OcrConfigProcessOptionsr"   N_text_from_layoutjdocumentai_toolbox package not found, please install it with `pip install google-cloud-documentai-toolbox`r7   )pagesapplication/pdfgcs_uriZ	mime_type)
ocr_configindividual_page_selectorT)nameZgcs_documentprocess_optionsskip_human_reviewr8   c                 3   s2    | ]}t  |jjj|jjd dV  qdS )pagesource)Zpage_contentmetadataN)r	   layoutdocumenttextpage_numberpath).0rM   r@   r2   responser   r   	<genexpr>   s    
z-DocAIParser.online_process.<locals>.<genexpr>)google.cloudr;    google.cloud.documentai_v1.typesr<   r=   r>   r,   -google.cloud.documentai_toolbox.wrappers.pager@   r*   Zprocess_documentZProcessRequestr)   GcsDocumentrT   mimetyperQ   rC   )r-   r2   r7   r8   r9   r;   r<   r=   r>   r/   rG   rH   r   rV   r   online_processw   s\   zDocAIParser.online_process  <   blobstimeout_seccheck_in_interval_secc           
      c   s    |p| j }|std| j||d}dd |D }td| d}| |rFt| ||7 }||kr<td| dtd	 | |s'| j	|d
}	| 
|	E dH  dS )a  Parses a list of blobs lazily.

        Args:
            blobs: a list of blobs to parse.
            gcs_output_path: a path on Google Cloud Storage to store parsing results.
            timeout_sec: a timeout to wait for Document AI to complete, in seconds.
            check_in_interval_sec: an interval to wait until next check
                whether parsing operations have been completed, in seconds
        This is a long-running operation. A recommended way is to decouple
            parsing from creating LangChain Documents:
            >>> operations = parser.docai_parse(blobs, gcs_path)
            >>> parser.is_running(operations)
            You can get operations names and save them:
            >>> names = [op.operation.name for op in operations]
            And when all operations are finished, you can use their results:
            >>> operations = parser.operations_from_names(operation_names)
            >>> results = parser.get_results(operations)
            >>> docs = parser.parse_from_results(results)
        :An output path on Google Cloud Storage should be provided.r4   c                 S   s   g | ]}|j jqS r   )Z	operationrI   rU   opr   r   r   
<listcomp>   s    z+DocAIParser.batch_parse.<locals>.<listcomp>z9Started parsing with Document AI, submitted operations %sr   z#Timeout exceeded! Check operations z later!.)
operationsN)r(   r%   docai_parseloggerdebug
is_runningtimesleepTimeoutErrorget_resultsparse_from_results)
r-   ra   r   rb   rc   output_pathri   operation_namesZtime_elapsedresultsr   r   r   r5      s.   





	zDocAIParser.batch_parseru   c              
   #   s    zddl m} ddlm} ddlm  W n ty& } ztd|d }~ww |D ]|j\}}|||} fdd|D E d H  q)d S )Nr   )split_gcs_uri)_get_shardsr?   rA   c                 3   s<    | ]}|j D ]}t |j|j|jjd dV  qqdS rL   )rC   r	   rP   rR   rS   r   )rU   ZshardrM   r@   resultr   r   rX     s    z1DocAIParser.parse_from_results.<locals>.<genexpr>)Z7google.cloud.documentai_toolbox.utilities.gcs_utilitiesrv   Z1google.cloud.documentai_toolbox.wrappers.documentrw   r[   r@   r,   r   )r-   ru   rv   rw   r/   Zgcs_bucket_nameZ
gcs_prefixZshardsr   rx   r   rr      s(   
zDocAIParser.parse_from_resultsrt   r   c              
      sH   zddl m  W n ty } ztd|d}~ww  fdd|D S )z5Initializes Long-Running Operations from their names.r   )GetOperationRequestzhlong running operations package not found, please install it with `pip install gapic-google-longrunning`Nc                    s    g | ]}j j |d dqS ))rI   )request)r*   Zget_operation)rU   rI   rz   r-   r   r   rg     s    z5DocAIParser.operations_from_names.<locals>.<listcomp>)Z!google.longrunning.operations_pb2rz   r,   )r-   rt   r/   r   r|   r   operations_from_names
  s   z!DocAIParser.operations_from_namesri   c                 C   s   t dd |D S )Nc                 s   s    | ]}|   V  qd S )N)donere   r   r   r   rX     s    z)DocAIParser.is_running.<locals>.<genexpr>)any)r-   ri   r   r   r   rm     s   zDocAIParser.is_runningi  )r   r    
batch_sizer7   r8   r   c                   s  zddl m  ddlm}m} W n ty! }	 ztd|	d}	~	ww |p&| j}
|
du r/td|p3| j}|du r<tdg }t	||dD ]<} j
 j fd	d
|D dd} j jj|
|dd}|rm|||ddnd}|| j j||||dd qD|S )a3  Runs Google Document AI PDF Batch Processing on a list of blobs.

        Args:
            blobs: a list of blobs to be parsed
            gcs_output_path: a path (folder) on GCS to store results
            processor_name: name of a Document AI processor.
            batch_size: amount of documents per batch
            enable_native_pdf_parsing: a config option for the parser
            field_mask: a comma-separated list of which fields to include in the
                Document AI response.
                suggested: "text,pages.pageNumber,pages.layout"

        Document AI has a 1000 file limit per batch, so batches larger than that need
        to be split into multiple requests.
        Batch processing is an async long-running operation
        and results are stored in a output GCS bucket.
        r   r:   )r=   r>   r"   Nrd   z0A Document AI processor name should be provided.)sizeiterablec                    s"   g | ]} j |j|jpd dqS )rD   rE   )r\   rT   r]   )rU   r2   r:   r   r   rg   O  s    z+DocAIParser.docai_parse.<locals>.<listcomp>)Z	documents)Zgcs_documents)rF   r8   )Zgcs_output_configrB   )rG   T)rI   Zinput_documentsZdocument_output_configrJ   rK   )rY   r;   rZ   r=   r>   r,   r(   r%   r)   r
   ZBatchDocumentsInputConfigZGcsDocumentsZDocumentOutputConfigZGcsOutputConfigappendr*   Zbatch_process_documentsZBatchProcessRequest)r-   ra   r   r    r   r7   r8   r=   r>   r/   rs   ri   batchZinput_configZoutput_configrJ   r   r:   r   rj     sj   


	zDocAIParser.docai_parsec              
      sF   zddl m  W n ty } ztd|d }~ww  fdd|D S )Nr   BatchProcessMetadatar"   c                    sF   g | ]}t |j r|jjn |jjjD ]
}t|j|jd qqS ))r   r   )
isinstancerO   Zindividual_process_statusesZdeserializevaluer   Zinput_gcs_sourceZoutput_gcs_destination)rU   rf   statusr   r   r   rg   ~  s    

z+DocAIParser.get_results.<locals>.<listcomp>)Zgoogle.cloud.documentai_v1r   r,   )r-   ri   r/   r   r   r   rq   u  s   
zDocAIParser.get_results)TNN)Nr_   r`   )r   r   r   r   r   r   r1   r   r   r	   r6   r$   r   intr^   r   r5   r   rr   r}   rm   rj   rq   r   r   r   r   r   &   s    	
:

K
1
	
Wr   )!r   loggingr&   rn   dataclassesr   typingr   r   r   r   r   Zlangchain_core._api.deprecationr   Zlangchain_core.documentsr	   Zlangchain_core.utils.iterr
   Z)langchain_community.document_loaders.baser   Z1langchain_community.document_loaders.blob_loadersr   Z&langchain_community.utilities.vertexair   Zgoogle.api_core.operationr   r+   r   	getLoggerr   rk   r   r   r   r   r   r   <module>   s0    
