o
    Zhc                     @  s  d dl mZ d dlZd dlZd dlZd dlZd dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZ d d
lmZmZ d dlmZ d dlmZ e e!Z"dZ#dZ$dZ%dZ&g dZ'g dZ(dgZ)g dZ*e'e(e)e*dZ+G dd de,eZ-G dd deZ.G dd deZ/G dd deZ0G dd deZ1G d d! d!eZ2dDd%d&Z3dEd(d)Z4dFd+d,Z5dGd.d/Z6dHd0d1Z7	dIdJd7d8Z8dKd:d;Z9dLd=d>Z:dMd@dAZ;G dBdC dCeZ<dS )N    )annotationsN)Enum)
HTTPStatus)AnyDictListOptionalTuple)Document)get_runtime_environment)get_from_dict_or_env)	BaseModel)Responserequest)RequestException)
BaseLoaderz0.1.1zhttp://localhost:8000zhttps://api.daxa.ai  )Z
JSONLoaderS3FileLoaderZUnstructuredMarkdownLoaderZUnstructuredPDFLoaderZUnstructuredFileLoaderZUnstructuredJsonLoaderZPyPDFLoaderGCSFileLoaderZAmazonTextractPDFLoaderZ	CSVLoaderZUnstructuredExcelLoaderZUnstructuredEmailLoader)ZDirectoryLoaderZS3DirLoaderZSlackDirectoryLoaderZPyPDFDirectoryLoaderZNotionDirectoryLoaderDataFrameLoader)NotionDBLoaderGoogleDriveLoaderSharePointLoader)filedir	in-memoryzcloud-folderc                   @  s   e Zd ZdZdZdZdS )Routesz2Routes available for the Pebblo API as enumerator.z/v1/loader/docz/v1/app/discoverN)__name__
__module____qualname____doc__
loader_docloader_app_discover r#   r#   [/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/utilities/pebblo.pyr   C   s    r   c                   @  s   e Zd ZU dZded< dS )IndexedDocumentzPebblo Indexed Document.strpb_idNr   r   r   r    __annotations__r#   r#   r#   r$   r%   J   s   
 r%   c                   @  s   e Zd ZU dZdZded< 	 ded< 	 ded< 	 dZded	< 	 ded
< 	 ded< 	 ded< 	 ded< 	 ded< 	 dZded< dS )RuntimezPebblo Runtime.localr&   typehostpath Optional[str]ipplatformos
os_versionlanguagelanguage_versionruntimeN)r   r   r   r    r,   r)   r1   r7   r#   r#   r#   r$   r*   Q   s,   
 r*   c                   @  s$   e Zd ZU dZded< 	 ded< dS )	FrameworkzPebblo Framework instance.r&   nameversionNr(   r#   r#   r#   r$   r8   j   s   
 r8   c                   @  s`   e Zd ZU dZded< 	 ded< 	 ded< 	 ded< 	 ded	< 	 d
ed< 	 ded< 	 d
ed< dS )AppzPebblo AI application.r&   r9   ownerr0   descriptionload_idr*   r7   r8   	frameworkplugin_versionclient_versionNr(   r#   r#   r#   r$   r;   s   s$   
 r;   c                   @  st   e Zd ZU dZded< 	 ded< 	 ded< 	 ded< 	 ded< 	 d	ed
< 	 ded< 	 ded< 	 ded< 	 ded< dS )DoczPebblo document.r&   r9   r<   listdocsr@   r>   dictloader_detailsboolloading_endsource_ownerclassifier_locationanonymize_snippetsNr(   r#   r#   r#   r$   rB      s,   
 rB   r.   r&   returnc                 C  sF   | rd| v sd| d ks| dv r| S t | }| r| }t|S )zReturn an absolute local path for a local file/directory,
    for a network related path, return as is.

    Args:
        path (str): Relative path to be resolved.

    Returns:
        str: Resolved absolute path.
    z:///r   )unknown-r   )pathlibPathexistsresolver&   )r.   	full_pathr#   r#   r$   get_full_path   s   
rU   loaderc                 C  s&   t  D ]\}}| |v r|  S qdS )zReturn loader type among, file, dir or in-memory.

    Args:
        loader (str): Name of the loader, whose type is to be resolved.

    Returns:
        str: One of the loader type among, file/dir/in-memory.
    unsupported)LOADER_TYPE_MAPPINGitems)rV   loader_typeloadersr#   r#   r$   get_loader_type   s
   	r\   r   c                 C  s  ddl m}m}m}m} d}t| tstd |S | j	}zd|v rBt| |r2d| j
 d| j }nt| |rAd| j
 d| j }nd	|v r^|d	 }|r]d
|v r]|d
 }|r]| d| }nd|v rg|d }nxd|v rp|d }nod|v r|d }|rt|trt|dkr|d }nUt| |rd}nMt| |rd| j }nA| jjdkr|dr|d}	d|	 }n+|dr|dg }
ddd |
D }n|dr|dg }ddd |D }W n	 ty   Y nw tt|S )zReturn an absolute source path of source of loader based on the
    keys present in Document.

    Args:
        loader (BaseLoader): Langchain document loader, derived from Baseloader.
    r   )r   r   r   r   rO   zGloader is not derived from BaseLoader, source location will be unknown!bucketzgc://rM   zs3://sourcechannelr.   	file_path	web_pathsr   znotiondb://r   	folder_idz+https://drive.google.com/drive/u/2/folders/file_idsz, c                 S     g | ]}d | dqS )z https://drive.google.com/file/d/z/viewr#   ).0Zfile_idr#   r#   r$   
<listcomp>       
z(get_loader_full_path.<locals>.<listcomp>document_idsc                 S  rd   )z#https://docs.google.com/document/d/z/editr#   )re   doc_idr#   r#   r$   rf     rg   )Z$langchain_community.document_loadersr   r   r   r   
isinstancer   loggererror__dict__r]   ZblobkeyrC   lenZdatabase_id	__class__r   getjoin	ExceptionrU   r&   )rV   r   r   r   r   locationZloader_dictr_   ra   rb   rc   rh   r#   r#   r$   get_loader_full_path   st   










ru   Tuple[Framework, Runtime]c                  C  s   t  } td| ddd}t }t|jtjd | dd|j	|j
t | dd| d	dd
}d|jv r;d|_d|_td|  td|  ||fS )zFetch the current Framework and Runtime details.

    Returns:
        Tuple[Framework, Runtime]: Framework and Runtime for the current app instance.
    Z	langchainZlibrary_versionN)r9   r:   ZPWDr2   rN   r7   Zruntime_version)r-   r.   r2   r3   r4   r1   r5   r6   DarwinZdesktopzMac OSXz
framework zruntime )r   r8   rq   r2   unamer*   noder3   environsystemr:   get_ipr,   r7   rk   debug)Zruntime_envr?   rx   r7   r#   r#   r$   get_runtime  s*   



r~   c                  C  s@   ddl } |  }z| |}W |S  ty   | d}Y |S w )zJFetch local runtime ip address.

    Returns:
        str: IP address
    r   N	localhost)socketgethostnamegethostbynamers   )r   r-   Z	public_ipr#   r#   r$   r|   .  s   r|   rD   List[Document]max_batch_sizeintList[List[Document]]c                 C  s~   g }g }d}| D ]-}t |jd}||kr||g q|| |kr,|| g }d}|| ||7 }q|r=|| |S )a  
    Generate batches of documents based on page_content size.
    Args:
        docs: List of documents to be batched.
        max_batch_size: Maximum size of each batch in bytes. Defaults to 100*1024(100KB)
    Returns:
        List[List[Document]]: List of batches of documents
    r   utf-8)ro   page_contentencodeappend)rD   r   ZbatchesZcurrent_batchZcurrent_batch_sizedocZdoc_sizer#   r#   r$   generate_size_based_batches>  s    



r   r`   c                 C  s@   zddl }t| j}||j}W |S  ty   d}Y |S w )zFetch owner of local file path.

    Args:
        file_path (str): Local file path.

    Returns:
        str: Name of owner.
    r   NrN   )pwdr3   statst_uidgetpwuidpw_namers   )r`   r   Zfile_owner_uidZfile_owner_namer#   r#   r$   get_file_owner_from_pathf  s   	r   source_pathc                 C  s   | sdS d}t j| rt j| }|S t j| rCd}t | D ]\}}}|D ]}t j||}t j|s?|t j|7 }q(q!|}|S )zFetch size of source path. Source can be a directory or a file.

    Args:
        source_path (str): Local path of data source.

    Returns:
        int: Source size in bytes.
    r   )r3   r.   isfilegetsizeisdirwalkrr   islink)r   size
total_sizedirpath_	filenamesffpr#   r#   r$   get_source_sizey  s"   		r   datac                 C  s   |  d}t|}|S )zCalculate the content size in bytes:
    - Encode the string to bytes using a specific encoding (e.g., UTF-8)
    - Get the length of the encoded bytes.

    Args:
        data (str): Data string.

    Returns:
        int: Size of string in bytes.
    r   )r   ro   )r   Zencoded_contentr   r#   r#   r$   calculate_content_size  s   
r   c                      s   e Zd ZU dZded< 	 dZded< 	 ded< 	 ded< 	 d	Zd
ed< 	 d; fddZd<ddZ		d=d>ddZ	d?ddZ
d=d@d!d"ZdAd(d)Ze	*	+dBdCd2d3ZedDd6d7ZedEd9d:Z  ZS )FPebbloLoaderAPIWrapperzWrapper for Pebblo Loader API.r0   api_keyr+   r&   rJ   classifier_url	cloud_urlFrG   rK   kwargsr   c                   sL   t |ddd|d< t |ddt|d< t |ddt|d< t jd	i | dS )
z%Validate that api key in environment.r   ZPEBBLO_API_KEYr/   r   ZPEBBLO_CLASSIFIER_URLr   ZPEBBLO_CLOUD_URLNr#   )r   _DEFAULT_CLASSIFIER_URL_DEFAULT_PEBBLO_CLOUD_URLsuper__init__)selfr   rp   r#   r$   r     s   zPebbloLoaderAPIWrapper.__init__appr;   rL   Nonec           	      C  s   d}|j dd}| jdkr"|  }| j tjj }| d|||}| jrW| jdd}|r=t	
|jd}|d|i |dti | j tjj }| d|||}dS dS )	z
        Send app discovery request to Pebblo server & cloud.

        Args:
            app (App): App instance to be discovered.
        NTZexclude_unsetr+   POSTcloud_requestpebblo_server_versionZpebblo_client_version)rE   rJ   _make_headersr   r   r"   valuemake_requestr   jsonloadstextrq   updatePLUGIN_VERSIONr   )	r   r   pebblo_resppayloadheadersZapp_discover_urlr   pebblo_cloud_urlr   r#   r#   r$   send_loader_discover  s$   
z+PebbloLoaderAPIWrapper.send_loader_discoverdocs_with_idList[IndexedDocument]rF   rE   rH   c              
   C  s4  | dd}t|}| |||\}}| ||||||}	i }
| jdkrm|  }| j tjj	 }z#| 
d|||	d}|rSt|j dg D ]}|
|d |i qGW n tyl } ztd| W Y d	}~nd	}~ww | jr| jdkr}| |	d |
 |	d
d	 | |	 |
S | jdkrtd td|
S )a  
        Send documents to Pebblo server for classification.
        Then send classified documents to Daxa cloud(If api_key is present).

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            app (App): App instance.
            loader_details (dict): Loader details.
            loading_end (bool): Boolean, indicating the halt of data loading by loader.
        r   r/   r+   r   i,  rD   r'   z3An Exception caught in classify_documents: local %sNrK   zpebblo-cloudz4API key is missing for sending docs to Pebblo cloud.)rq   r   prepare_docs_for_classificationbuild_classification_payloadrJ   r   r   r   r!   r   r   r   r   r   r   rs   rk   warningr   update_doc_datapopsend_docs_to_pebblo_cloud	NameError)r   r   r   rF   rH   r   rI   rD   source_aggregate_sizer   classified_docsr   Zload_doc_urlr   Zclassified_docer#   r#   r$   classify_documents  sH   






z)PebbloLoaderAPIWrapper.classify_documentsr   c              
   C  sh   | j dd}| j tjj }z| d|||}W dS  ty3 } ztd| W Y d}~dS d}~ww )z
        Send documents to Pebblo cloud.

        Args:
            payload (dict): The payload containing documents to be sent.
        Tr   r   z3An Exception caught in classify_documents: cloud %sN)	r   r   r   r!   r   r   rs   rk   r   )r   r   r   r   r   r   r#   r#   r$   r     s   z0PebbloLoaderAPIWrapper.send_docs_to_pebblo_cloudr   c                 C  s6   ddd}|r| j r|d| j i |S td |S )z
        Generate headers for the request.

        args:
            cloud_request (bool): flag indicating whether the request is for Pebblo
            cloud.
        returns:
            dict: Headers for the request.

        zapplication/json)AcceptzContent-Typez	x-api-keyz,API key is missing for Pebblo cloud request.)r   r   rk   r   )r   r   r   r#   r#   r$   r   (  s   
z$PebbloLoaderAPIWrapper._make_headersrD   
List[dict]rI   r   r   c                 C  sb   |j |j|t|j|d|| j| jd
}|du r$d|d< d|v r$||d d< td
i |jdd}|S )a  
        Build the payload for document classification.

        Args:
            app (App): App instance.
            docs (List[dict]): List of documents to be classified.
            loader_details (dict): Loader details.
            source_owner (str): Owner of the source.
            source_aggregate_size (int): Aggregate size of the source.
            loading_end (bool): Boolean indicating the halt of data loading by loader.

        Returns:
            dict: Payload for document classification.
        false)
r9   r<   rD   r@   r>   rF   rH   rI   rJ   rK   TtruerH   rF   r   r   Nr#   )r9   r<   r   r>   rJ   rK   rB   rE   )r   r   rD   rF   rI   r   rH   r   r#   r#   r$   r   ?  s$   
z3PebbloLoaderAPIWrapper.build_classification_payloadN   methodurlr   Optional[dict]timeoutOptional[Response]c              
   C  s  zYt | ||||d}td| |j jtt|j jr|j jng t|j |jtj	kr6t
d|j  |W S |jtjkrHt
d|j  |W S |jtjkrWt
d|j  |W S  tyi   t
d| Y dS  ty } zt
d| W Y d}~dS d}~ww )	a  
        Make a request to the Pebblo API

        Args:
            method (str): HTTP method (GET, POST, PUT, DELETE, etc.).
            url (str): URL for the request.
            headers (dict): Headers for the request.
            payload (Optional[dict]): Payload for the request (for POST, PUT, etc.).
            timeout (int): Timeout for the request in seconds.

        Returns:
            Optional[Response]: Response object if the request is successful.
        )r   r   r   r   r   z5Request: method %s, url %s, len %s response status %szPebblo Server: Error z$Pebblo received an invalid payload: z-Pebblo returned an unexpected response code: zUnable to reach server %sz'An Exception caught in make_request: %sN)r   rk   r}   r   r&   ro   bodystatus_coder   INTERNAL_SERVER_ERRORr   BAD_REQUESTr   OKr   rs   )r   r   r   r   r   responser   r#   r#   r$   r   k  s@   
	z#PebbloLoaderAPIWrapper.make_requestr   Tuple[List[dict], int]c              
   C  s6  g }d}dd | D }d}|D ]}| di }| dg }	|d dkr.t| d	|d
 }
nt| d| d	|}
| dt|
}| dt|
}t| d}t|}||7 }| ddp`d}|||
|| di  d|d|	rwd|	ini |durd|ini  |d dkr|s| d|d
< d}q||fS )a  
        Prepare documents for classification.

        Args:
            docs_with_id (List[IndexedDocument]): List of documents to be classified.
            source_path (str): Source path of the documents.
            loader_details (dict): Contains loader info.

        Returns:
            Tuple[List[dict], int]: Documents and the aggregate size
            of the source.
        r   c                 S  s   g | ]}|  qS r#   )rE   )re   r   r#   r#   r$   rf     s    zJPebbloLoaderAPIWrapper.prepare_docs_for_classification.<locals>.<listcomp>FmetadataZauthorized_identitiesrV   r   r^   r   rT   r<   r   r   r'   Nlast_modified)r   r   r'   r   Z
file_ownerZsource_path_sizeZsource_full_urlT)rq   rU   r   r   r&   r   r   )r   r   rF   rD   r   Zdoc_contentZsource_path_updater   Zdoc_metadataZdoc_authorized_identitiesZdoc_source_pathZdoc_source_ownerZdoc_source_sizer   Zpage_content_sizeri   r#   r#   r$   r     s`   

z6PebbloLoaderAPIWrapper.prepare_docs_for_classificationr   c              
   C  sX   | D ]'}| |d i }|| d| d| di | di d |d qdS )	z
        Update the document data with classified information.

        Args:
            docs (List[dict]): List of document data to be updated.
            classified_docs (dict): The dictionary containing classified documents.
        r'   pb_checksumloader_source_pathentitiestopics)r   r   r   r   r   N)rq   r   r   )rD   r   Zdoc_dataZclassified_datar#   r#   r$   r     s   	

	z&PebbloLoaderAPIWrapper.update_doc_data)r   r   )r   r;   rL   r   )F)
r   r   r   r;   rF   rE   rH   rG   rL   rE   )r   rE   rL   r   )r   rG   rL   rE   )r   r;   rD   r   rF   rE   rI   r&   r   r   rH   rG   rL   rE   )Nr   )r   r&   r   r&   r   rE   r   r   r   r   rL   r   )r   r   r   r&   rF   rE   rL   r   )rD   r   r   rE   rL   r   )r   r   r   r    r)   rJ   rK   r   r   r   r   r   r   staticmethodr   r   r   __classcell__r#   r#   r   r$   r     s6   
 
$
>
,1Fr   )r.   r&   rL   r&   )rV   r&   rL   r&   )rV   r   rL   r&   )rL   rv   )rL   r&   )r   )rD   r   r   r   rL   r   )r`   r&   rL   r&   )r   r&   rL   r   )r   r&   rL   r   )=
__future__r   r   loggingr3   rP   r2   enumr   httpr   typingr   r   r   r   r	   Zlangchain_core.documentsr
   Zlangchain_core.envr   Zlangchain_core.utilsr   Zpydanticr   requestsr   r   Zrequests.exceptionsr   Z)langchain_community.document_loaders.baser   	getLoggerr   rk   r   r   r   ZBATCH_SIZE_BYTESZfile_loaderZ
dir_loaderZ	in_memoryZcloud_folderrX   r&   r   r%   r*   r8   r;   rB   rU   r\   ru   r~   r|   r   r   r   r   r   r#   r#   r#   r$   <module>   s^    
	



H

(

