o
    Zhf                     @  s   d dl mZ d dlZd dlmZ d dlmZmZmZm	Z	m
Z
 d dlmZ d dlmZ d dlmZ d dlmZ G d	d
 d
eZdS )    )annotationsN)Path)AnyDictIterableListOptional)CallbackManagerForRetrieverRunDocument)BaseRetriever)
ConfigDictc                   @  s   e Zd ZU dZdZded< 	 ded< 	 dZded< 	 dZd	ed
< 	 eddZ	e
		d.d/ddZe
ddd0ddZd1d d!Z	"d2d3d&d'Ze
d(d"d)d4d,d-ZdS )5TFIDFRetrieverz`TF-IDF` retriever.

    Largely based on
    https://github.com/asvskartheek/Text-Retrieval/blob/master/TF-IDF%20Search%20Engine%20(SKLEARN).ipynb
    Nr   
vectorizerList[Document]docstfidf_array   intkT)Zarbitrary_types_allowedtextsIterable[str]	metadatasOptional[Iterable[dict]]tfidf_paramsOptional[Dict[str, Any]]kwargsreturnc           	      K  s   zddl m} W n ty   tdw |pi }|d	i |}||}|p,dd |D }dd t||D }| d	|||d|S )
Nr   )TfidfVectorizerzNCould not import scikit-learn, please install with `pip install scikit-learn`.c                 s  s    | ]}i V  qd S N ).0_r    r    [/var/www/html/lang_env/lib/python3.10/site-packages/langchain_community/retrievers/tfidf.py	<genexpr>4   s    z,TFIDFRetriever.from_texts.<locals>.<genexpr>c                 S  s   g | ]
\}}t ||d qS )Zpage_contentmetadatar
   )r!   tmr    r    r#   
<listcomp>5   s    z-TFIDFRetriever.from_texts.<locals>.<listcomp>r   r   r   r    )Zsklearn.feature_extraction.textr   ImportErrorZfit_transformzip)	clsr   r   r   r   r   r   r   r   r    r    r#   
from_texts!   s   
zTFIDFRetriever.from_texts)r   	documentsIterable[Document]c                K  s.   t dd |D  \}}| jd|||d|S )Nc                 s  s    | ]	}|j |jfV  qd S r   r%   )r!   dr    r    r#   r$   @   s    z0TFIDFRetriever.from_documents.<locals>.<genexpr>)r   r   r   r    )r,   r.   )r-   r/   r   r   r   r   r    r    r#   from_documents8   s   zTFIDFRetriever.from_documentsquerystrrun_managerr	   c                  s\   ddl m}  j|g}| j|d} fdd|  j d  d d d D }|S )Nr   )cosine_similarity)c                   s   g | ]} j | qS r    )r   )r!   iselfr    r#   r)   P   s    z:TFIDFRetriever._get_relevant_documents.<locals>.<listcomp>r7   )Zsklearn.metrics.pairwiser6   r   Z	transformr   ZreshapeZargsortr   )r:   r3   r5   r6   Z	query_vecresultsZreturn_docsr    r9   r#   _get_relevant_documentsE   s   ,z&TFIDFRetriever._get_relevant_documentstfidf_vectorizerfolder_path	file_nameNonec                 C  s   zdd l }W n ty   tdw t|}|jddd || j|| d  t|| d d}t| j| j	f| W d    d S 1 sIw   Y  d S )Nr   BCould not import joblib, please install with `pip install joblib`.T)exist_okparents.joblib.pklwb)
joblibr+   r   mkdirdumpr   openpickler   r   )r:   r>   r?   rG   pathfr    r    r#   
save_localS   s   "zTFIDFRetriever.save_localF)allow_dangerous_deserializationr?   rO   boolc          
      C  s   zddl }W n ty   tdw |stdt|}||| d }t|| d d}t|\}}	W d   n1 sBw   Y  | |||	dS )	a  Load the retriever from local storage.

        Args:
            folder_path: Folder path to load from.
            allow_dangerous_deserialization: Whether to allow dangerous deserialization.
                Defaults to False.
                The deserialization relies on .joblib and .pkl files, which can be
                modified to deliver a malicious payload that results in execution of
                arbitrary code on your machine. You will need to set this to `True` to
                use deserialization. If you do this, make sure you trust the source of
                the file.
            file_name: File name to load from. Defaults to "tfidf_vectorizer".

        Returns:
            TFIDFRetriever: Loaded retriever.
        r   NrA   a  The de-serialization of this retriever is based on .joblib and .pkl files.Such files can be modified to deliver a malicious payload that results in execution of arbitrary code on your machine.You will need to set `allow_dangerous_deserialization` to `True` to load this retriever. If you do this, make sure you trust the source of the file, and you are responsible for validating the file came from a trusted source.rD   rE   rbr*   )rG   r+   
ValueErrorr   loadrJ   rK   )
r-   r>   rO   r?   rG   rL   r   rM   r   r   r    r    r#   
load_locali   s"   zTFIDFRetriever.load_local)NN)
r   r   r   r   r   r   r   r   r   r   )r/   r0   r   r   r   r   r   r   )r3   r4   r5   r	   r   r   )r=   )r>   r4   r?   r4   r   r@   )r>   r4   rO   rP   r?   r4   r   r   )__name__
__module____qualname____doc__r   __annotations__r   r   r   Zmodel_configclassmethodr.   r2   r<   rN   rT   r    r    r    r#   r      s6   
 
r   )
__future__r   rK   pathlibr   typingr   r   r   r   r   Zlangchain_core.callbacksr	   Zlangchain_core.documentsr   Zlangchain_core.retrieversr   Zpydanticr   r   r    r    r    r#   <module>   s    