o
    ZhQ                     @  s   d dl mZ d dlZd dlZd dlZd dlmZ d dlmZm	Z	m
Z
mZmZmZmZ d dlZd dlmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZ dddZ dddZ!G dd deZ"dS )    )annotationsN)Path)AnyCallableDictIterableListOptionalTuple)Document)
Embeddingsguard_import)VectorStore)AddableMixinDocstore)InMemoryDocstore)DistanceStrategyx
np.ndarrayreturnc                 C  s$   | t t jj| ddddd } | S )z!Normalize vectors to unit length.T)ZaxisZkeepdimsg-q=N)npZclipZlinalgZnorm)r    r   ]/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/vectorstores/scann.py	normalize   s    r   r   c                   C  s   t dS )z=
    Import `scann` if available, otherwise raise error.
    scannr   r   r   r   r   dependable_scann_import   s   r   c                   @  s(  e Zd ZdZddejdfdVddZ		dWdXd!d"Z		dWdYd#d$Z		dWdZd'd(Z	d[d\d*d+Z
	,		-d]d^d5d6Z	,		-d]d_d9d:Z	,		-d]d`d<d=Z	,		-d]dad>d?Ze			dbdcdAdBZe		dWdddCdDZe		dWdedFdGZdfdgdKdLZe	dfddMdhdOdPZdidRdSZ	,		-d]d_dTdUZdS )jScaNNa  `ScaNN` vector store.

    To use, you should have the ``scann`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import ScaNN

            model_name = "sentence-transformers/all-mpnet-base-v2"
            db = ScaNN.from_texts(
                ['foo', 'bar', 'barz', 'qux'],
                HuggingFaceEmbeddings(model_name=model_name))
            db.similarity_search('foo?', k=1)
    NF	embeddingr   indexr   docstorer   index_to_docstore_idDict[int, str]relevance_score_fn"Optional[Callable[[float], float]]normalize_L2booldistance_strategyr   scann_configOptional[str]c	           	      C  s4   || _ || _|| _|| _|| _|| _|| _|| _dS )z%Initialize with necessary components.N)r   r    r!   r"   r(   override_relevance_score_fn_normalize_L2Z_scann_config)	selfr   r    r!   r"   r$   r&   r(   r)   r   r   r   __init__3   s   
zScaNN.__init__textsIterable[str]
embeddingsIterable[List[float]]	metadatasOptional[List[dict]]idsOptional[List[str]]kwargsr   	List[str]c                 K  s&   t | jtstd| j dtd)NSIf trying to add texts, the underlying docstore should support adding items, which 	 does notz(Updates are not available in ScaNN, yet.)
isinstancer!   r   
ValueErrorNotImplementedError)r-   r/   r1   r3   r5   r7   r   r   r   Z__addH   s   zScaNN.__addc                 K  s*   | j t|}| j||f||d|S )al  Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs.

        Returns:
            List of ids from adding the texts into the vectorstore.
        r3   r5   )r   embed_documentslist_ScaNN__add)r-   r/   r3   r5   r7   r1   r   r   r   	add_textsW   s   zScaNN.add_textstext_embeddings!Iterable[Tuple[str, List[float]]]c                 K  sD   t | jtstd| j dt| \}}| j||f||d|S )a  Run more texts through the embeddings and add to the vectorstore.

        Args:
            text_embeddings: Iterable pairs of string and embedding to
                add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs.

        Returns:
            List of ids from adding the texts into the vectorstore.
        r9   r:   r>   )r;   r!   r   r<   ziprA   )r-   rC   r3   r5   r7   r/   r1   r   r   r   add_embeddingsl   s   zScaNN.add_embeddingsOptional[bool]c                 K  s   t d)a3  Delete by vector ID or other criteria.

        Args:
            ids: List of ids to delete.
            **kwargs: Other keyword arguments that subclasses might use.

        Returns:
            Optional[bool]: True if deletion is successful,
            False otherwise, None if not implemented.
        z*Deletions are not available in ScaNN, yet.)r=   )r-   r5   r7   r   r   r   delete   s   zScaNN.delete      List[float]kintfilterOptional[Dict[str, Any]]fetch_kList[Tuple[Document, float]]c                   sH  t j|gt jd}| jrt|}| j||du r|n|\}}g }	t|d D ]T\}
}|dkr0q'| j| }| j	
|ttsJtd| d |durpdd | D }tfd	d
| D ro|	|d |
 f q'|	|d |
 f q'|ddur| jtjtjfv rtjntj  fdd|	D }	|	d| S )a  Return docs most similar to query.

        Args:
            embedding: Embedding vector to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.
            **kwargs: kwargs to be passed to similarity search. Can include:
                score_threshold: Optional, a floating point value between 0 to 1 to
                    filter the resulting set of retrieved docs

        Returns:
            List of documents most similar to the query text and L2 distance
            in float for each. Lower score represents more similarity.
        ZdtypeNr   r   zCould not find document for id z, got c                 S  s&   i | ]\}}|t |ts|gn|qS r   )r;   r@   .0keyvaluer   r   r   
<dictcomp>   s    z@ScaNN.similarity_search_with_score_by_vector.<locals>.<dictcomp>c                 3  s$    | ]\}} j ||v V  qd S N)metadatagetrS   )docr   r   	<genexpr>   s   " z?ScaNN.similarity_search_with_score_by_vector.<locals>.<genexpr>score_thresholdc                   s"   g | ]\}} |r||fqS r   r   rT   r[   Z
similarity)cmpr]   r   r   
<listcomp>   s    z@ScaNN.similarity_search_with_score_by_vector.<locals>.<listcomp>)r   arrayfloat32r,   r   r    Zsearch_batched	enumerater"   r!   searchr;   r   r<   itemsallappendrZ   r(   r   MAX_INNER_PRODUCTZJACCARDoperatorgele)r-   r   rL   rN   rP   r7   vectorindicesZscoresdocsjiZ_idr   )r_   r[   r]   r   &similarity_search_with_score_by_vector   sD   



z,ScaNN.similarity_search_with_score_by_vectorquerystrc                 K  s*   | j |}| j||f||d|}|S )a  Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.

        Returns:
            List of documents most similar to the query text with
            L2 distance in float. Lower score represents more similarity.
        rN   rP   )r   Zembed_queryrq   )r-   rr   rL   rN   rP   r7   r   rn   r   r   r   similarity_search_with_score   s   z"ScaNN.similarity_search_with_scoreList[Document]c                 K  (   | j ||f||d|}dd |D S )a  Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.

        Returns:
            List of Documents most similar to the embedding.
        rt   c                 S     g | ]\}}|qS r   r   rT   r[   _r   r   r   r`         z5ScaNN.similarity_search_by_vector.<locals>.<listcomp>)rq   )r-   r   rL   rN   rP   r7   docs_and_scoresr   r   r   similarity_search_by_vector   s   z!ScaNN.similarity_search_by_vectorc                 K  rw   )a  Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            filter: (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
            fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
                      Defaults to 20.

        Returns:
            List of Documents most similar to the query.
        rt   c                 S  rx   r   r   ry   r   r   r   r`   )  r{   z+ScaNN.similarity_search.<locals>.<listcomp>)ru   )r-   rr   rL   rN   rP   r7   r|   r   r   r   similarity_search  s   zScaNN.similarity_searchList[List[float]]c                 K  sH  t d}|dtj}	|dd }
tj|tjd}|rt|}|
d ur+|j	||
}n|	tj
kr=|j|dd  }n|j|dd  }g }|d u rVdd	 |D }t|D ]\}}|rd|| ni }|t||d
 qZtt|}t|t|krtt| dt| dttt| |}| ||||fd|i|S )Nr   r(   r)   rR      Zdot_productZ
squared_l2c                 S  s   g | ]}t t qS r   )rs   uuiduuid4)rT   rz   r   r   r   r`   Q  s    z ScaNN.__from.<locals>.<listcomp>)Zpage_contentrY   z ids provided for z, documents. Each document should have an id.r&   )r   rZ   r   EUCLIDEAN_DISTANCEr   ra   rb   r   scann_ops_pybindZcreate_searcherrh   builderZscore_brute_forcebuildrc   rg   r   dictlen	Exceptionr   rE   values)clsr/   r1   r   r3   r5   r&   r7   r   r(   r)   rl   r    Z	documentsrp   textrY   Zindex_to_idr!   r   r   r   Z__from+  sR   
zScaNN.__fromc                 K  s&   | |}| j|||f||d|S )aN  Construct ScaNN wrapper from raw documents.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates an in memory docstore
            3. Initializes the ScaNN database

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import ScaNN
                from langchain_community.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                scann = ScaNN.from_texts(texts, embeddings)
        r>   )r?   _ScaNN__from)r   r/   r   r3   r5   r7   r1   r   r   r   
from_textsg  s   
zScaNN.from_textsList[Tuple[str, List[float]]]c                 K  s8   dd |D }dd |D }| j |||f||d|S )a  Construct ScaNN wrapper from raw documents.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates an in memory docstore
            3. Initializes the ScaNN database

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import ScaNN
                from langchain_community.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                text_embeddings = embeddings.embed_documents(texts)
                text_embedding_pairs = list(zip(texts, text_embeddings))
                scann = ScaNN.from_embeddings(text_embedding_pairs, embeddings)
        c                 S     g | ]}|d  qS )r   r   rT   tr   r   r   r`     r{   z)ScaNN.from_embeddings.<locals>.<listcomp>c                 S  r   )r   r   r   r   r   r   r`     r{   r>   )r   )r   rC   r   r3   r5   r7   r/   r1   r   r   r   from_embeddings  s   zScaNN.from_embeddingsfolder_path
index_nameNonec                 C  s   t |}|dj|d }|jddd | jt| t|dj|d d}t| j	| j
f| W d   dS 1 s<w   Y  dS )zSave ScaNN index, docstore, and index_to_docstore_id to disk.

        Args:
            folder_path: folder path to save index, docstore,
                and index_to_docstore_id to.
        {index_name}.scannr   Texist_okparents{index_name}.pklwbN)r   formatmkdirr    	serializers   openpickledumpr!   r"   )r-   r   r   path
scann_pathfr   r   r   
save_local  s   "zScaNN.save_local)allow_dangerous_deserializationr   c                K  s   |st dt|}|dj|d }|jddd td}|jt|}	t|dj|d d}
t	
|
\}}W d	   n1 sBw   Y  | ||	||fi |S )
a  Load ScaNN index, docstore, and index_to_docstore_id from disk.

        Args:
            folder_path: folder path to load index, docstore,
                and index_to_docstore_id from.
            embedding: Embeddings to use when generating queries
            index_name: for saving with a specific index file name
            allow_dangerous_deserialization: whether to allow deserialization
                of the data which involves loading a pickle file.
                Pickle files can be modified by malicious actors to deliver a
                malicious payload that results in execution of
                arbitrary code on your machine.
        aB  The de-serialization relies loading a pickle file. Pickle files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.You will need to set `allow_dangerous_deserialization` to `True` to enable deserialization. If you do this, make sure that you trust the source of the data. For example, if you are loading a file that you created, and know that no one else has modified the file, then this is safe to do. Do not set this to `True` if you are loading a file from an untrusted source (e.g., some random site on the internet.).r   r   Tr   r   r   rbN)r<   r   r   r   r   r   Zload_searcherrs   r   r   load)r   r   r   r   r   r7   r   r   r   r    r   r!   r"   r   r   r   
load_local  s$   zScaNN.load_localCallable[[float], float]c                 C  s<   | j dur| j S | jtjkr| jS | jtjkr| jS td)a8  
        The 'correct' relevance function
        may differ depending on a few things, including:
        - the distance / similarity metric used by the VectorStore
        - the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
        - embedding dimensionality
        - etc.
        NzJUnknown distance strategy, must be cosine, max_inner_product, or euclidean)r+   r(   r   rh   Z%_max_inner_product_relevance_score_fnr   Z_euclidean_relevance_score_fnr<   )r-   r   r   r   _select_relevance_score_fn  s   
	z ScaNN._select_relevance_score_fnc                   sn   | dd|    du rtd| j|f|||d|} fdd|D }dur5fdd|D }|S )z?Return docs and their similarity scores on a scale from 0 to 1.r]   NzLnormalize_score_fn must be provided to ScaNN constructor to normalize scores)rL   rN   rP   c                   s   g | ]
\}}| |fqS r   r   )rT   r[   Zscore)r$   r   r   r`   ,  s    zBScaNN._similarity_search_with_relevance_scores.<locals>.<listcomp>c                   s    g | ]\}}| kr||fqS r   r   r^   )r]   r   r   r`   0  s
    )popr   r<   ru   )r-   rr   rL   rN   rP   r7   r|   Zdocs_and_rel_scoresr   )r$   r]   r   (_similarity_search_with_relevance_scores  s.   

z.ScaNN._similarity_search_with_relevance_scores)r   r   r    r   r!   r   r"   r#   r$   r%   r&   r'   r(   r   r)   r*   )NN)r/   r0   r1   r2   r3   r4   r5   r6   r7   r   r   r8   )
r/   r0   r3   r4   r5   r6   r7   r   r   r8   )
rC   rD   r3   r4   r5   r6   r7   r   r   r8   rX   )r5   r6   r7   r   r   rG   )rI   NrJ   )r   rK   rL   rM   rN   rO   rP   rM   r7   r   r   rQ   )rr   rs   rL   rM   rN   rO   rP   rM   r7   r   r   rQ   )r   rK   rL   rM   rN   rO   rP   rM   r7   r   r   rv   )rr   rs   rL   rM   rN   rO   rP   rM   r7   r   r   rv   )NNF)r/   r8   r1   r   r   r   r3   r4   r5   r6   r&   r'   r7   r   r   r   )r/   r8   r   r   r3   r4   r5   r6   r7   r   r   r   )rC   r   r   r   r3   r4   r5   r6   r7   r   r   r   )r    )r   rs   r   rs   r   r   )r   rs   r   r   r   rs   r   r'   r7   r   r   r   )r   r   )__name__
__module____qualname____doc__r   r   r.   rA   rB   rF   rH   rq   ru   r}   r~   classmethodr   r   r   r   r   r   r   r   r   r   r   r   !   sr    C" ;#&
5r   )r   r   r   r   )r   r   )#
__future__r   ri   r   r   pathlibr   typingr   r   r   r   r   r	   r
   numpyr   Zlangchain_core.documentsr   Zlangchain_core.embeddingsr   Zlangchain_core.utilsr   Zlangchain_core.vectorstoresr   Z!langchain_community.docstore.baser   r   Z&langchain_community.docstore.in_memoryr   Z&langchain_community.vectorstores.utilsr   r   r   r   r   r   r   r   <module>   s"    $

