o
    ZhQ,                     @   s   d Z ddlZddlZddlZddlmZ ddlmZmZm	Z	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZ eeZ G dd	 d	eZ!G d
d deZ"dS )z:Pebblo's safe dataloader is a wrapper for document loaders    N)version)AnyDictIterableIteratorListOptional)Document)
BaseLoader)BATCH_SIZE_BYTESPLUGIN_VERSIONApp	FrameworkIndexedDocumentPebbloLoaderAPIWrappergenerate_size_based_batchesget_full_pathget_loader_full_pathget_loader_typeget_runtimeget_source_sizec                   @   s  e Zd ZU dZdZeed< 					d+dddded	ed
edede	e dede	e dedefddZ
dee fddZd,ddZdee fddZed,ddZdefddZdee fddZd edee fd!d"Zdee fd#d$Zd%ed&edefd'd(Zd eddfd)d*ZdS )-PebbloSafeLoaderzkPebblo Safe Loader class is a wrapper around document loaders enabling the data
    to be scrutinized.
    F_discover_sent Nlocal)classifier_locationanonymize_snippetslangchain_loadernameownerdescriptionapi_keyload_semanticclassifier_urlr   r   c                C   s   |rt |tstd|| _tt | _|| _tj	
dp|| _|| _|| _t| j| _g | _g | _tt| jdd dd }
t|
| _t| j| _t| _|
| j| jd| jdkredt| jini | _|  | _t||||	d	| _| j| j d S )
NzMust specify a valid name.ZPEBBLO_LOAD_SEMANTIC.'r   )loadersource_pathsource_typesource_path_size)r!   r   r#   r   ) 
isinstancestr	NameErrorapp_nameuuiduuid4load_idr'   osenvirongetr"   r   r    r   r(   docsdocs_with_idtypesplitr   r)   r   r*   r   
batch_sizeloader_details_get_app_detailsappr   	pb_clientZsend_loader_discover)selfr   r   r   r    r!   r"   r#   r   r   Zloader_name r?   b/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/document_loaders/pebblo.py__init__%   s>   "


zPebbloSafeLoader.__init__returnc                 C   s   | j  | _|   | jS )zxLoad Documents.

        Returns:
            list: Documents fetched from load method of the wrapped `loader`.
        )r'   loadr5   classify_in_batches)r>   r?   r?   r@   rC   V   s   zPebbloSafeLoader.loadc           	      C   s   t | j| j}g }t|}t|D ]6\}}||d k}|| _|  | _| jj| j| j	| j
|d}| | | jr>| |}n|  }|| q|| _dS )z
        Classify documents in batches.
        This is to avoid API timeouts when sending large number of documents.
        Batches are generated based on the page_content size.
           )Zloading_endN)r   r5   r9   len	enumerate_index_docsr6   r=   classify_documentsr<   r:   _add_pebblo_specific_metadatar"   _add_semantic_to_docs_unindex_docsextend)	r>   ZbatchesZprocessed_docsZtotal_batchesibatchZis_last_batchclassified_docsZbatch_processed_docsr?   r?   r@   rD   a   s*   


z$PebbloSafeLoader.classify_in_batchesc              
   c   s    z| j  }W n ty& } z| j jj d}t| t||d}~ww 	 zt|}W n ty;   g | _	Y dS w t
|f| _	|  | _| j| j| j| j}| | | jra| || _	n|  | _	| j	d V  q()zLoad documents in lazy fashion.

        Raises:
            NotImplementedError: raised when lazy_load id not implemented
            within wrapped loader.

        Yields:
            list: Documents from loader's lazy loading.
        z does not implement lazy_load()NTr   )r'   	lazy_loadNotImplementedError	__class____name__loggererrornextStopIterationr5   listrH   r6   r=   rI   r<   r:   rJ   r"   rK   rL   )r>   Zdoc_iteratorexcZerr_strdocclassified_docr?   r?   r@   rQ      s6   





zPebbloSafeLoader.lazy_loadc                 C   s
   d| _ d S )NT)r   )clsr?   r?   r@   set_discover_sent   s   
z"PebbloSafeLoader.set_discover_sentc                 C   s:   t  \}}t| j| j| j| j||ttdtddd}|S )z\Fetch app details. Internal method.

        Returns:
            App: App details.
        Zlangchain_community)r   r   )r   r   r    r1   runtime	frameworkZplugin_versionclient_version)	r   r   r.   r   r    r1   r   r   r   )r>   r`   r_   r<   r?   r?   r@   r;      s   
z!PebbloSafeLoader._get_app_detailsc                 C      dd t | jD }|S )z
        Indexes the documents and returns a list of IndexedDocument objects.

        Returns:
            List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
        c                 S   s*   g | ]\}}t dd t|i| qS )pb_idr?   )r   r,   dict.0rN   r[   r?   r?   r@   
<listcomp>   s    z0PebbloSafeLoader._index_docs.<locals>.<listcomp>)rG   r5   )r>   r6   r?   r?   r@   rH         zPebbloSafeLoader._index_docsrP   c                 C   sV   dd | j D }| D ]}|d}||v r| || | qdd | D }|S )aF  
        Adds semantic metadata to the given list of documents.

        Args:
            classified_docs (Dict): A dictionary of dictionaries containing the
                classified documents with pb_id as key.

        Returns:
            List[Document]: A list of Document objects with added semantic metadata.
        c                 S   s    i | ]}|j t|j|jd qS )page_contentmetadata)rc   r	   rj   rk   rf   r[   r?   r?   r@   
<dictcomp>   s    z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<dictcomp>rc   c                 S   s   g | ]}|qS r?   r?   rl   r?   r?   r@   rg      s    z:PebbloSafeLoader._add_semantic_to_docs.<locals>.<listcomp>)r6   valuesr4   _add_semantic_to_doc)r>   rP   Zindexed_docsr\   Zdoc_idZsemantic_metadata_docsr?   r?   r@   rK      s   
z&PebbloSafeLoader._add_semantic_to_docsc                 C   rb   )z
        Converts a list of IndexedDocument objects to a list of Document objects.

        Returns:
            List[Document]: A list of Document objects.
        c                 S   s    g | ]\}}t |j|jd qS ri   )r	   rj   rk   re   r?   r?   r@   rg      s    z2PebbloSafeLoader._unindex_docs.<locals>.<listcomp>)rG   r6   )r>   r5   r?   r?   r@   rL      rh   zPebbloSafeLoader._unindex_docsr[   r\   c                 C   s8   t |di  |jd< t |di  |jd< |S )a4  
        Adds semantic metadata to the given document in-place.

        Args:
            doc (Document): A Document object.
            classified_doc (dict): A dictionary containing the classified document.

        Returns:
            Document: The Document object with added semantic metadata.
        entitiesZpebblo_semantic_entitiesZtopicsZpebblo_semantic_topics)rY   r4   keysrk   )r>   r[   r\   r?   r?   r@   ro      s   

z%PebbloSafeLoader._add_semantic_to_docc              	   C   st   | j D ]4}|j}| jjjdkrt|d| j|d< nt|d|d| j|d< ||ji dd|d< qdS )z*Add Pebblo specific metadata to documents.ZSharePointLoadersource	full_pathZpb_checksumN)	r6   rk   r'   rS   rT   r   r4   r(   rc   )r>   rP   r[   Zdoc_metadatar?   r?   r@   rJ     s   


z.PebbloSafeLoader._add_pebblo_specific_metadata)r   r   NFN)rB   N)rT   
__module____qualname____doc__r   bool__annotations__r
   r,   r   rA   r   r	   rC   rD   r   rQ   classmethodr^   r   r;   r   rH   r   rK   rL   rd   ro   rJ   r?   r?   r?   r@   r      sR   
 


1
 "r   c                   @   s   e Zd ZdZddddddee dee deee  deeee	f  deeeee	f   d	dfd
dZ
d	ee fddZd	ee fddZdS )PebbloTextLoaderz
    Loader for text data.

    Since PebbloSafeLoader is a wrapper around document loaders, this loader is
    used to load text data directly into Documents.
    N)rr   idsrk   	metadatastextsrr   r{   rk   r|   rB   c                C   s"   || _ || _|| _|| _|| _dS )a  
        Args:
            texts: Iterable of text data.
            source: Source of the text data.
                Optional. Defaults to None.
            ids: List of unique identifiers for each text.
                Optional. Defaults to None.
            metadata: Metadata for all texts.
                Optional. Defaults to None.
            metadatas: List of metadata for each text.
                Optional. Defaults to None.
        N)r}   rr   r{   rk   r|   )r>   r}   rr   r{   rk   r|   r?   r?   r@   rA     s
   
zPebbloTextLoader.__init__c                 c   s    t | jD ]9\}}d}| jpi }| jr(|t| jk r(| j| r(|| j|  | jr7|t| jk r7| j| }t|||dV  qdS )zi
        Lazy load text data into Documents.

        Returns:
            Iterator of Documents
        N)idrj   rk   )rG   r}   rk   r|   rF   updater{   r	   )r>   rN   textZ_idrk   r?   r?   r@   rQ   9  s   

zPebbloTextLoader.lazy_loadc                 C   s    g }|   D ]}|| q|S )z`
        Load text data into Documents.

        Returns:
            List of Documents
        )rQ   append)r>   Z	documentsr[   r?   r?   r@   rC   I  s   zPebbloTextLoader.load)rT   rt   ru   rv   r   r,   r   r   r   r   rA   r   r	   rQ   rC   r?   r?   r?   r@   rz     s*    

rz   )#rv   loggingr2   r/   importlib.metadatar   typingr   r   r   r   r   r   Zlangchain_core.documentsr	   Z)langchain_community.document_loaders.baser
   Z$langchain_community.utilities.pebblor   r   r   r   r   r   r   r   r   r   r   r   	getLoggerrT   rU   r   rz   r?   r?   r?   r@   <module>   s     8
 y