o
    Zht                     @  s  d Z ddlmZ ddlZddlZddlZddlmZmZm	Z	m
Z
mZmZmZ ddlZddlmZ ddlmZ ddlmZ ddlmZ dd	lmZ ed
gZd
ZdZdZee dj!Z"e#e dj!Z$ej%j!Z&d ddZ'd!ddZ(d!ddZ)d"ddZ*d"ddZ+G dd deZ,dS )#z&Wrapper around TileDB vector database.    )annotationsN)AnyDictIterableListMappingOptionalTuple)Document)
Embeddingsguard_import)VectorStore)maximal_marginal_relevanceZ	euclideanZ	documentsvectorsuint64float32returnr   c                   C  s   t dt dfS )z@Import tiledb-vector-search if available, otherwise raise error.tiledb.vector_searchtiledbr    r   r   ^/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/vectorstores/tiledb.pydependable_tiledb_import   s   r   groupstrc                 C  
   | t  jS ) Get the URI of the vector index.)VECTOR_INDEX_NAMEurir   r   r   r   get_vector_index_uri_from_group#   s   
r    c                 C  r   )zGet the URI of the documents array from group.

    Args:
        group: TileDB group object.

    Returns:
        URI of the documents array.
    )DOCUMENTS_ARRAY_NAMEr   r   r   r   r   "get_documents_array_uri_from_group(   s   
	r"   r   c                 C     |  dt  S )r   /)r   r   r   r   r   get_vector_index_uri4      r&   c                 C  r#   )z#Get the URI of the documents array.r$   )r!   r%   r   r   r   get_documents_array_uri9   r'   r(   c                   @  s  e Zd ZdZdddddddgddZedhddZddeddid'd(Zddd)d*djd,d-Z	ddd)d*dkd/d0Z
			)dldmd2d3Z			)dldnd4d5Zdd)d6dd7dod9d:Z		)	6	dpdqd;d<Z		)	6	dpdrd=d>Zed?dd@dsdGdHZeddedIddJdKdtdSdTZ	JdudvdVdWZ			JdwdxdYdZZedded[dIddJfdyd\d]ZeddedIddJdKdzd`daZeedddbd{dcddZd|dedfZdS )}TileDBa2  TileDB vector store.

    To use, you should have the ``tiledb-vector-search`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community import TileDB
            embeddings = OpenAIEmbeddings()
            db = TileDB(embeddings, index_uri, metric)

     NF)vector_index_uridocs_array_uriconfig	timestampallow_dangerous_deserialization	embeddingr   	index_urir   metricr+   r,   r-   Optional[Mapping[str, Any]]r.   r   r/   boolkwargsc                K  sV  |st d|| _|j| _|| _|| _|| _tdtd}
}|j|d| |	| jd}|dkr3|nt
|| _|dkr>|nt|| _|  |	| jd}|jd| _|  || _| jdkrs|
jjd| j| j| jd	|	| _n&| jd
kr|
jjd| j| j| jd	|	| _W d   dS W d   dS W d   dS 1 sw   Y  dS )a  Initialize with necessary components.

        Args:
            allow_dangerous_deserialization: whether to allow deserialization
                of the data which involves loading data using pickle.
                data can be modified by malicious actors to deliver a
                malicious payload that results in execution of
                arbitrary code on your machine.
        a  TileDB relies on pickle for serialization and deserialization. This can be dangerous if the data is intercepted and/or modified by malicious actors prior to being de-serialized. If you are sure that the data is safe from modification, you can  set allow_dangerous_deserialization=True to proceed. Loading of compromised data using pickle can result in execution of arbitrary code on your machine.r   r   Zctx_or_configrr*   
index_typeFLAT)r   r-   r.   IVF_FLATNr   )
ValueErrorr0   Zembed_queryembedding_functionr1   r2   r-   r   	scope_ctxGroupr    r+   r"   r,   closemetagetr8   r.   
flat_indexZ	FlatIndexvector_indexivf_flat_indexZIVFFlatIndex)selfr0   r1   r2   r+   r,   r-   r.   r/   r5   	tiledb_vsr   index_groupr   r   r   r   __init__L   s`   	



"zTileDB.__init__r   Optional[Embeddings]c                 C  s   | j S N)r0   rE   r   r   r   
embeddings   s   zTileDB.embeddings   )kfilterscore_thresholdids	List[int]scoresList[float]rN   intrO   Optional[Dict[str, Any]]rP   floatList[Tuple[Document, float]]c                  sV  t d}g }|j| jd| j| jd}t||D ]\}	}
|	dkr$|
dkr$q|	tkr-|
tkr-q||	 }|du s=t|d dkrGt	d|	 d| |
d	}tt|d d d
 |durntt| tj }| _|durdd | D }t fdd| D r| |
f q| |
f q|  fdd|D }|d| S )a  Turns TileDB results into a list of documents and scores.

        Args:
            ids: List of indices of the documents in the index.
            scores: List of distances of the documents in the index.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
            score_threshold: Optional, a floating point value to filter the
                resulting set of retrieved docs
        Returns:
            List of Documents and scores.
        r   r7   r.   r-   r   NtextzCould not find document for id z, got metadata)page_contentc                 S  s&   i | ]\}}|t |ts|gn|qS r   )
isinstancelist.0keyvaluer   r   r   
<dictcomp>   s    z0TileDB.process_index_results.<locals>.<dictcomp>c                 3  s$    | ]\}} j ||v V  qd S rJ   )r[   rA   r_   )
result_docr   r   	<genexpr>   s
    
z/TileDB.process_index_results.<locals>.<genexpr>c                   s    g | ]\}}| kr||fqS r   r   )r`   docscore)rP   r   r   
<listcomp>        z0TileDB.process_index_results.<locals>.<listcomp>)r   openr,   r.   r-   zip
MAX_UINT64MAX_FLOAT_32lenr;   rA   r
   r   pickleloadsnparraytolistastypeuint8tobytesr[   itemsallappendr?   )rE   rQ   rS   rN   rO   rP   r   docs
docs_arrayidxrg   rf   Zpickled_metadatar[   r   )rd   rP   r   process_index_results   sB   
zTileDB.process_index_results   rN   rO   fetch_kr   c          	      K  sz   d|v r
| d}nt}| jjtt|tjgtjfd|du r'|n|i|\}}| j|d |d |||dS )a[  Return docs most similar to query.

        Args:
            embedding: Embedding vector to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.
            **kwargs: kwargs to be passed to similarity search. Can include:
                nprobe: Optional, number of partitions to check if using IVF_FLAT index
                score_threshold: Optional, a floating point value to filter the
                    resulting set of retrieved docs

        Returns:
            List of documents most similar to the query text and distance
            in float for each. Lower score represents more similarity.
        rP   rN   Nr   rQ   rS   rO   rN   rP   )	pop	MAX_FLOATrC   queryrq   rr   rt   r   r}   )	rE   r0   rN   rO   r   r5   rP   dir   r   r   &similarity_search_with_score_by_vector   s    
z-TileDB.similarity_search_with_score_by_vectorr   c                K  s(   |  |}| j|f|||d|}|S )a  Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.

        Returns:
            List of documents most similar to the query text with
            Distance as float. Lower score represents more similarity.
        r   )r<   r   )rE   r   rN   rO   r   r5   r0   rz   r   r   r   similarity_search_with_score   s   
z#TileDB.similarity_search_with_scoreList[Document]c                 K  (   | j |f|||d|}dd |D S )a  Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.

        Returns:
            List of Documents most similar to the embedding.
        r   c                 S     g | ]\}}|qS r   r   r`   rf   _r   r   r   rh   5      z6TileDB.similarity_search_by_vector.<locals>.<listcomp>)r   )rE   r0   rN   rO   r   r5   docs_and_scoresr   r   r   similarity_search_by_vector  s   z"TileDB.similarity_search_by_vectorc                 K  r   )a  Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.

        Returns:
            List of Documents most similar to the query.
        r   c                 S  r   r   r   r   r   r   r   rh   N  r   z,TileDB.similarity_search.<locals>.<listcomp>)r   )rE   r   rN   rO   r   r5   r   r   r   r   similarity_search7  s   zTileDB.similarity_search      ?rN   r   lambda_multrO   r   c                  s   d|v r
| d}nt} jjtt|tjgtjfd|du r'|n|d i|\}}	 j|	d |d ||du r@|n|d |d}
 fdd|
D }t	tj|gtjd	|||d
}g }|D ]	}|
|
|  qc|S )az  Return docs and their similarity scores selected using the maximal marginal
            relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch before filtering to
                     pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
        Returns:
            List of Documents and similarity scores selected by maximal marginal
                relevance and score for each.
        rP   rN   N   r   r   c                   s$   g | ]\}} j |jgd  qS r   )r0   embed_documentsr\   r   rK   r   r   rh   }  s    zMTileDB.max_marginal_relevance_search_with_score_by_vector.<locals>.<listcomp>dtype)rN   r   )r   r   rC   r   rq   rr   rt   r   r}   r   ry   )rE   r0   rN   r   r   rO   r5   rP   rS   indicesresultsrL   Zmmr_selectedr   r   r   rK   r   2max_marginal_relevance_search_with_score_by_vectorP  s<    

z9TileDB.max_marginal_relevance_search_with_score_by_vectorc                 K  s*   | j |f||||d|}dd |D S )a  Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch before filtering to
                     pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        r   c                 S  r   r   r   r   r   r   r   rh     r   zBTileDB.max_marginal_relevance_search_by_vector.<locals>.<listcomp>)r   )rE   r0   rN   r   r   rO   r5   r   r   r   r   'max_marginal_relevance_search_by_vector  s   z.TileDB.max_marginal_relevance_search_by_vectorc           	      K  s*   |  |}| j|f||||d|}|S )a  Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch before filtering (if needed) to
                     pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        r   )r<   r   )	rE   r   rN   r   r   rO   r5   r0   rz   r   r   r   max_marginal_relevance_search  s   
z$TileDB.max_marginal_relevance_searchT)	metadatasr-   r8   
dimensionsvector_typenp.dtyper   Nonec                C  sv  t dt d}}|j|d z|| W n |jy& }	 z|	d }	~	ww ||d}
t|
j}t|
j}|dkrF|jj	||||d n|dkrT|j
j	||||d |
j|td |jd	d
td fttjd}||}|jdtddd}|g}|r|jdtjdd}|| |j|dd|d}|j	|| |
j|td |
  W d    d S 1 sw   Y  d S )Nr   r   r6   wr9   )r   r   r   r-   r:   )nameidr      )r   domainr   rZ   ZU1T)r   r   varr[   F)r   sparseZallows_duplicatesattrs)r   r=   Zgroup_createZTileDBErrorr>   r&   r   r(   rB   createrD   addr   ZDimrl   rq   r   r   DomainZAttrru   ry   ZArraySchemaZArrayr!   r?   )clsr1   r8   r   r   r   r-   rF   r   errr   r+   docs_uridimdomZ	text_attrr   metadata_attrZschemar   r   r   r     sb   






"zTileDB.creater9   r   )r   rQ   r2   r8   r-   index_timestamptexts	List[str]rL   List[List[float]]Optional[List[dict]]Optional[List[str]]r   c             
   K  s  |t vrtd| dtt  tdtd}}t|tj}| j|||j	d |j
|d u|	d |j|	d |s@tdt|}t|}|d u rSd	d
 |D }t|tj}|jjd|||||
dkrj|
nd |	d| ||dZ}|d u rtjt|tjd}tt|D ]}|||< qi }t||d< |d urtjt|gtd}d}|D ]}tjt|tjd||< |d7 }q||d< |||< W d    n1 sw   Y  W d    n1 sw   Y  | d||||	d|S )NzUnsupported distance metric: z. Expected one of r   r   r   )r1   r8   r   r   r   r-   r6   z3embeddings must be provided to build a TileDB indexc                 S      g | ]}t td td qS r   r   r   randomrandintrl   r`   r   r   r   r   rh   ?  ri   z!TileDB.__from.<locals>.<listcomp>r   )r8   r1   input_vectorsexternal_idsr   r-   r   r   rZ   r[   )r0   r1   r2   r-   r   )INDEX_METRICSr;   r^   r   rq   rr   rt   r   r   shaper   r=   r&   r(   r   Z	ingestionZingestrj   Zzerosrn   rangeemptyobject
frombufferro   dumpsru   )r   r   rL   r0   r1   r   rQ   r2   r8   r-   r   r5   rF   r   r   r+   r   r   Ar   datar   r[   r   r   r   Z__from  s   	



%zTileDB.__fromOptional[bool]c                 K  s2   t |t j}| jj||dkr|ndd dS )am  Delete by vector ID or other criteria.

        Args:
            ids: List of ids to delete.
            timestamp: Optional timestamp to delete with.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            Optional[bool]: True if deletion is successful,
            False otherwise, None if not implemented.
        r   N)r   r.   T)rq   rr   rt   r   rC   Zdelete_batch)rE   rQ   r.   r5   r   r   r   r   deletee  s
   zTileDB.deleteIterable[str]c                 K  s6  t d}| jt|}|du rdd |D }t|tj}tjt	|dd}	t
t	|D ]}
tj||
 tjd|	|
< q/| jj|	||dkrI|ndd i }t||d	< |durtjt	|gtd}d}
|D ]}tjt|tjd||
< |
d
7 }
qh||d< |j| jd|dkr|nd| jd}|||< |  |S )a  Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional ids of each text object.
            timestamp: Optional timestamp to write new texts with.
            kwargs: vectorstore specific parameters

        Returns:
            List of ids from adding the texts into the vectorstore.
        r   Nc                 S  r   r   r   r   r   r   r   rh     ri   z$TileDB.add_texts.<locals>.<listcomp>Or   r   )r   r   r.   rZ   r   r[   r   rY   )r   r0   r   r^   rq   rr   rt   r   r   rn   r   r   rC   Zupdate_batchr   r   ro   r   ru   rj   r,   r-   r?   )rE   r   r   rQ   r.   r5   r   rL   r   r   r   rz   r   r[   r{   r   r   r   	add_textsz  s>   
zTileDB.add_textsz/tmp/tiledb_arrayc
                 K  s4   g }| |}| jd||||||||||	d
|
S )a  Construct a TileDB index from raw documents.

        Args:
            texts: List of documents to index.
            embedding: Embedding function to use.
            metadatas: List of metadata dictionaries to associate with documents.
            ids: Optional ids of each text object.
            metric: Metric to use for indexing. Defaults to "euclidean".
            index_uri: The URI to write the TileDB arrays
            index_type: Optional,  Vector index type ("FLAT", IVF_FLAT")
            config: Optional, TileDB config
            index_timestamp: Optional, timestamp to write new texts with.

        Example:
            .. code-block:: python

                from langchain_community import TileDB
                from langchain_community.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                index = TileDB.from_texts(texts, embeddings)
        
r   rL   r0   r   rQ   r2   r1   r8   r-   r   Nr   )r   _TileDB__from)r   r   r0   r   rQ   r2   r1   r8   r-   r   r5   rL   r   r   r   
from_texts  s    #
zTileDB.from_textstext_embeddingsList[Tuple[str, List[float]]]c                K  sB   dd |D }dd |D }| j d||||||||||	d
|
S )a  Construct TileDB index from embeddings.

        Args:
            text_embeddings: List of tuples of (text, embedding)
            embedding: Embedding function to use.
            index_uri: The URI to write the TileDB arrays
            metadatas: List of metadata dictionaries to associate with documents.
            metric: Optional, Metric to use for indexing. Defaults to "euclidean".
            index_type: Optional, Vector index type ("FLAT", IVF_FLAT")
            config: Optional, TileDB config
            index_timestamp: Optional, timestamp to write new texts with.

        Example:
            .. code-block:: python

                from langchain_community import TileDB
                from langchain_community.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                text_embeddings = embeddings.embed_documents(texts)
                text_embedding_pairs = list(zip(texts, text_embeddings))
                db = TileDB.from_embeddings(text_embedding_pairs, embeddings)
        c                 S     g | ]}|d  qS r   r   r`   tr   r   r   rh   	  r   z*TileDB.from_embeddings.<locals>.<listcomp>c                 S  r   )r   r   r   r   r   r   rh   
  r   r   Nr   )r   )r   r   r0   r1   r   rQ   r2   r8   r-   r   r5   r   rL   r   r   r   from_embeddings  s    %zTileDB.from_embeddings)r2   r-   r.   c                K  s   | d|||||d|S )a}  Load a TileDB index from a URI.

        Args:
            index_uri: The URI of the TileDB vector index.
            embedding: Embeddings to use when generating queries.
            metric: Optional, Metric to use for indexing. Defaults to "euclidean".
            config: Optional, TileDB config
            timestamp: Optional, timestamp to use for opening the arrays.
        )r0   r1   r2   r-   r.   Nr   r   )r   r1   r0   r2   r-   r.   r5   r   r   r   load  s   zTileDB.loadc                 K  s   | j jdi || _ d S )Nr   )rC   consolidate_updates)rE   r5   r   r   r   r   7  s   zTileDB.consolidate_updates)r0   r   r1   r   r2   r   r+   r   r,   r   r-   r3   r.   r   r/   r4   r5   r   )r   rI   )rQ   rR   rS   rT   rN   rU   rO   rV   rP   rW   r   rX   )r0   rT   rN   rU   rO   rV   r   rU   r5   r   r   rX   )r   r   rN   rU   rO   rV   r   rU   r5   r   r   rX   )rM   Nr~   )r0   rT   rN   rU   rO   rV   r   rU   r5   r   r   r   )r   r   rN   rU   rO   rV   r   rU   r5   r   r   r   )r0   rT   rN   rU   r   rU   r   rW   rO   rV   r5   r   r   rX   )rM   r~   r   N)r0   rT   rN   rU   r   rU   r   rW   rO   rV   r5   r   r   r   )r   r   rN   rU   r   rU   r   rW   rO   rV   r5   r   r   r   )r1   r   r8   r   r   rU   r   r   r   r4   r-   r3   r   r   )r   r   rL   r   r0   r   r1   r   r   r   rQ   r   r2   r   r8   r   r-   r3   r   rU   r5   r   r   r)   )Nr   )rQ   r   r.   rU   r5   r   r   r   )NNr   )r   r   r   r   rQ   r   r.   rU   r5   r   r   r   )r   r   r0   r   r   r   rQ   r   r2   r   r1   r   r8   r   r-   r3   r   rU   r5   r   r   r)   )r   r   r0   r   r1   r   r   r   rQ   r   r2   r   r8   r   r-   r3   r   rU   r5   r   r   r)   )r1   r   r0   r   r2   r   r-   r3   r.   r   r5   r   r   r)   )r5   r   r   r   )__name__
__module____qualname____doc__rH   propertyrL   r   r}   r   r   r   r   r   r   r   classmethodr   DEFAULT_METRICr   r   r   r   r   r   r   r   r   r   r   r)   >   s    J=+# >'%?Q725r)   )r   r   )r   r   r   r   )r   r   r   r   )-r   
__future__r   ro   r   systypingr   r   r   r   r   r   r	   numpyrq   Zlangchain_core.documentsr
   Zlangchain_core.embeddingsr   Zlangchain_core.utilsr   Zlangchain_core.vectorstoresr   Z&langchain_community.vectorstores.utilsr   	frozensetr   r   r!   r   Ziinfor   maxrl   Zfinform   
float_infor   r   r    r"   r&   r(   r)   r   r   r   r   <module>   s2    $





