o
    Zhn                     @   sz   d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZ e eZG dd deZG dd	 d	e
ZdS )
    N)DictIteratorListUnion)Document)BaseBlobParser)Blobc                   @   s   e Zd ZdZdS )ServerUnavailableExceptionz7Exception raised when the Grobid server is unavailable.N)__name__
__module____qualname____doc__ r   r   j/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/parsers/grobid.pyr	      s    r	   c                	   @   s^   e Zd ZdZ	ddededdfddZd	ed
ededee fddZ	de
dee fddZdS )GrobidParserz)Load  article `PDF` files using `Grobid`.1http://localhost:8070/api/processFulltextDocumentsegment_sentencesgrobid_serverreturnNc                 C   s>   || _ || _zt| W d S  tjjy   td tw )NzyGROBID server does not appear up and running,                 please ensure Grobid is installed and the server is running)	r   r   requestsget
exceptionsRequestExceptionloggererrorr	   )selfr   r   r   r   r   __init__   s   zGrobidParser.__init__	file_pathxml_datac                 #   s   zddl m} W n ty   tdw ||d}|d}|d}|r,|d jndg }|D ]}	|	d}
|
d	urt|	d
D ]\}}g }g }t|dD ]l\}}||j g }|dd	ur|d	dD ]}|	d}||d |d |d |d |d d qn|| |du rt
|dkr|d d |d d }}|jt||g|
j|
d||fd}|| qS|dur|d d d |d d d }}d|t|||
j|
d||fd}|| qDq2 fdd|D E d	H  d	S )z!Process the XML file from Grobin.r   )BeautifulSoupzA`bs4` package not found, please install it with `pip install bs4`xmldivtitlezNo title foundheadNpsZcoords;,            )pagexyhwTr,   n)textparabboxessection_titlesection_numberpages c                    sj   g | ]1}t |d  tt|d  t|d t|d t|d t|d t|d tt ddqS )r3   r4   r5   r8   r6   r7   )r3   r4   r5   r8   r6   r7   Zpaper_titler   )Zpage_contentmetadata)r   dictstr).0chunkr   r"   r   r   
<listcomp>i   s"    





z,GrobidParser.process_xml.<locals>.<listcomp>)Zbs4r   ImportErrorZfind_allr3   find	enumerateappendr   splitlenr<   join)r   r   r   r   r   ZsoupsectionstitleschunkssectionsectiZ	paragraphZchunk_bboxesZparagraph_textZsentenceZsbboxesZbboxboxZfpageZlpageZsentence_dictZparagraph_dictr   r?   r   process_xml&   s~   





	

zGrobidParser.process_xmlblobc           	   	   C   s   |j }|d u rtdt|d}d||dddifi}z'i }dD ]}d||< qd	d
g|d< |p/i }tjd| jd d ||dd}|j}W n tjjyS   t	
d d }Y nw |d u r\tg S | ||| jS )Nzblob.source cannot be None.rbinputzapplication/pdfZExpires0)ZgenerateIDsZconsolidateHeaderZsegmentSentences1r#   r%   ZteiCoordinatesPOST<   )headersparamsfilesdatatimeoutz%GROBID server timed out. Return None.)source
ValueErroropenr   requestr   r3   r   ReadTimeoutr   r   iterrO   r   )	r   rP   r   ZpdfrY   rZ   paramrr   r   r   r   
lazy_parse|   s8   


	
zGrobidParser.lazy_parse)r   )r
   r   r   r   boolr<   r   r   r   rO   r   rd   r   r   r   r   r      s(    

Vr   )loggingtypingr   r   r   r   r   Zlangchain_core.documentsr   Z)langchain_community.document_loaders.baser   Z1langchain_community.document_loaders.blob_loadersr   	getLoggerr
   r   	Exceptionr	   r   r   r   r   r   <module>   s    
