from __future__ import annotations

import logging
import warnings
from typing import (
    Any,
    Dict,
    Iterable,
    List,
    Literal,
    Mapping,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
    cast,
)

import openai
import tiktoken
from langchain_core.embeddings import Embeddings
from langchain_core.utils import from_env, get_pydantic_field_names, secret_from_env
from pydantic import BaseModel, ConfigDict, Field, SecretStr, model_validator
from typing_extensions import Self

logger = logging.getLogger(__name__)


def _process_batched_chunked_embeddings(
    num_texts: int,
    tokens: List[Union[List[int], str]],
    batched_embeddings: List[List[float]],
    indices: List[int],
    skip_empty: bool,
) -> List[Optional[List[float]]]:
    # For each original text, collect the embeddings of its chunks.
    results: List[List[List[float]]] = [[] for _ in range(num_texts)]

    # For each original text, collect the length of each chunk; this is used as
    # the weight when averaging the chunk embeddings.
    num_tokens_in_batch: List[List[int]] = [[] for _ in range(num_texts)]

    for i in range(len(indices)):
        if skip_empty and len(batched_embeddings[i]) == 1:
            continue
        results[indices[i]].append(batched_embeddings[i])
        num_tokens_in_batch[indices[i]].append(len(tokens[i]))

    # For each text, compute the final embedding.
    embeddings: List[Optional[List[float]]] = []
    for i in range(num_texts):
        _result = results[i]
        if len(_result) == 0:
            # Left as None; the caller fills this in with the embedding of an
            # empty string.
            embeddings.append(None)
        elif len(_result) == 1:
            # Only one chunk was embedded: use its embedding directly.
            embeddings.append(_result[0])
        else:
            # Multiple chunks: average the chunk embeddings weighted by chunk
            # length, then re-normalize the result to unit length.
            total_weight = sum(num_tokens_in_batch[i])
            average = [
                sum(
                    val * weight
                    for val, weight in zip(embedding, num_tokens_in_batch[i])
                )
                / total_weight
                for embedding in zip(*_result)
            ]
            magnitude = sum(val**2 for val in average) ** 0.5
            embeddings.append([val / magnitude for val in average])
    return embeddings


class OpenAIEmbeddings(BaseModel, Embeddings):
    """OpenAI embedding model integration.

    Setup:
        Install ``langchain_openai`` and set environment variable ``OPENAI_API_KEY``.

        .. code-block:: bash

            pip install -U langchain_openai
            export OPENAI_API_KEY="your-api-key"

    Key init args — embedding params:
        model: str
            Name of OpenAI model to use.
        dimensions: Optional[int] = None
            The number of dimensions the resulting output embeddings should have.
            Only supported in `text-embedding-3` and later models.

    Key init args — client params:
        api_key: Optional[SecretStr] = None
            OpenAI API key.
        organization: Optional[str] = None
            OpenAI organization ID. If not passed in will be read
            from env var OPENAI_ORG_ID.
        max_retries: int = 2
            Maximum number of retries to make when generating.
        request_timeout: Optional[Union[float, Tuple[float, float], Any]] = None
            Timeout for requests to OpenAI completion API

    See full list of supported init args and their descriptions in the params section.

    Instantiate:
        .. code-block:: python

            from langchain_openai import OpenAIEmbeddings

            embed = OpenAIEmbeddings(
                model="text-embedding-3-large"
                # With the `text-embedding-3` class
                # of models, you can specify the size
                # of the embeddings you want returned.
                # dimensions=1024
            )

    Embed single text:
        .. code-block:: python

            input_text = "The meaning of life is 42"
            vector = embed.embed_query(input_text)
            print(vector[:3])

        .. code-block:: python

            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Embed multiple texts:
        .. code-block:: python

            vectors = embed.embed_documents(["hello", "goodbye"])
            # Showing only the first 3 coordinates
            print(len(vectors))
            print(vectors[0][:3])

        .. code-block:: python

            2
            [-0.024603435769677162, -0.007543657906353474, 0.0039630369283258915]

    Async:
        .. code-block:: python

            vector = await embed.aembed_query(input_text)
            print(vector[:3])

            # multiple:
            # await embed.aembed_documents(input_texts)

        .. code-block:: python

            [-0.009100092574954033, 0.005071679595857859, -0.0029193938244134188]
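
    Specify embedding dimensions (illustrative sketch; the ``dimensions``
    parameter is only supported by ``text-embedding-3`` and later models):
        .. code-block:: python

            # Sketch only: assumes a text-embedding-3 family model.
            embed_256 = OpenAIEmbeddings(
                model="text-embedding-3-large",
                dimensions=256,
            )
            len(embed_256.embed_query("hello"))  # expected length: 256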
    """

    client: Any = Field(default=None, exclude=True)
    async_client: Any = Field(default=None, exclude=True)
    model: str = "text-embedding-ada-002"
    dimensions: Optional[int] = None
    """The number of dimensions the resulting output embeddings should have.

    Only supported in `text-embedding-3` and later models.
    """
    # To support Azure OpenAI Service custom deployment names.
    deployment: Optional[str] = model
    openai_api_version: Optional[str] = Field(
        default_factory=from_env("OPENAI_API_VERSION", default=None),
        alias="api_version",
    )
    # To support Azure OpenAI Service custom endpoints.
    openai_api_base: Optional[str] = Field(
        alias="base_url", default_factory=from_env("OPENAI_API_BASE", default=None)
    )
    openai_api_type: Optional[str] = Field(
        default_factory=from_env("OPENAI_API_TYPE", default=None)
    )
    openai_proxy: Optional[str] = Field(
        default_factory=from_env("OPENAI_PROXY", default=None)
    )
    embedding_ctx_length: int = 8191
    openai_api_key: Optional[SecretStr] = Field(
        alias="api_key",
        default_factory=secret_from_env("OPENAI_API_KEY", default=None),
    )
    openai_organization: Optional[str] = Field(
        alias="organization",
        default_factory=from_env(
            ["OPENAI_ORG_ID", "OPENAI_ORGANIZATION"], default=None
        ),
    )
    allowed_special: Union[Literal["all"], Set[str], None] = None
    disallowed_special: Union[Literal["all"], Set[str], Sequence[str], None] = None
    chunk_size: int = 1000
    max_retries: int = 2
    """Maximum number of retries to make when generating."""
    request_timeout: Optional[Union[float, Tuple[float, float], Any]] = Field(
        default=None, alias="timeout"
    )
    """Timeout for requests to OpenAI completion API."""
    headers: Any = None
    tiktoken_enabled: bool = True
    tiktoken_model_name: Optional[str] = None
    show_progress_bar: bool = False
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    skip_empty: bool = False
    default_headers: Union[Mapping[str, str], None] = None
    default_query: Union[Mapping[str, object], None] = None
    retry_min_seconds: int = 4
    retry_max_seconds: int = 20
    http_client: Union[Any, None] = None
    http_async_client: Union[Any, None] = None
    check_embedding_ctx_length: bool = True

    model_config = ConfigDict(
        extra="forbid", populate_by_name=True, protected_namespaces=()
    )

    @model_validator(mode="before")
    @classmethod
    def build_extra(cls, values: Dict[str, Any]) -> Any:
        """Build extra kwargs from additional params that were passed in."""
        all_required_field_names = get_pydantic_field_names(cls)
        extra = values.get("model_kwargs", {})
        for field_name in list(values):
            if field_name in extra:
                raise ValueError(f"Found {field_name} supplied twice.")
            if field_name not in all_required_field_names:
                warnings.warn(
                    f"""WARNING! {field_name} is not default parameter.
                    {field_name} was transferred to model_kwargs.
                    Please confirm that {field_name} is what you intended."""
                )
                extra[field_name] = values.pop(field_name)

        invalid_model_kwargs = all_required_field_names.intersection(extra.keys())
        if invalid_model_kwargs:
            raise ValueError(
                f"Parameters {invalid_model_kwargs} should be specified explicitly. "
                "Instead they were passed in as part of `model_kwargs` parameter."
            )

        values["model_kwargs"] = extra
        return values
zOpenAIEmbeddings.build_extraafterr   c           	   
   C  sl  | j dv r	td| jr| j nd| j| j| j| j| j| j	d}| j
r@| js*| jr@| j
}| j}| j}td|d|d|| jsz| j
ri| jsizddl}W n ty` } ztd	|d}~ww |j| j
d
| _d| ji}tjdi ||j| _| js| j
r| jszddl}W n ty } ztd	|d}~ww |j| j
d
| _d| ji}tjdi ||j| _| S )z?Validate that api key and python package exists in environment.)ZazureZazure_adZazureadzEIf you are using Azure, please use the `AzureOpenAIEmbeddings` class.N)rV   rX   rQ   r^   r]   rf   rg   zwCannot specify 'openai_proxy' if one of 'http_client'/'http_async_client' is already specified. Received:
openai_proxy=z
http_client=z
http_async_client=r   zRCould not import httpx python package. Please install it with `pip install httpx`.)proxyrl   r&   )rS   ru   rW   Zget_secret_valuerY   rR   r_   r]   rf   rg   rT   rl   rm   rF   httpxImportErrorZClientopenaiZOpenAIr@   rG   ZAsyncClientZAsyncOpenAI)	selfZclient_paramsrT   rl   rm   r   eZsync_specificZasync_specificr&   r&   r*   validate_environment*  s|   


    @property
    def _invocation_params(self) -> Dict[str, Any]:
        params: Dict[str, Any] = {"model": self.model, **self.model_kwargs}
        if self.dimensions is not None:
            params["dimensions"] = self.dimensions
        return params
    def _tokenize(
        self, texts: List[str], chunk_size: int
    ) -> Tuple[Iterable[int], List[Union[List[int], str]], List[int]]:
        """
        Take the input `texts` and `chunk_size` and return 3 iterables as a tuple:

        We have `batches`, where batches are sets of individual texts
        we want responses from the openai api. The length of a single batch is
        `chunk_size` texts.

        Each individual text is also split into multiple texts based on the
        `embedding_ctx_length` parameter (based on number of tokens).

        This function returns a 3-tuple of the following:

        _iter: An iterable of the starting index in `tokens` for each *batch*
        tokens: A list of tokenized texts, where each text has already been split
            into sub-texts based on the `embedding_ctx_length` parameter. In the
            case of tiktoken, this is a list of token arrays. In the case of
            HuggingFace transformers, this is a list of strings.
        indices: An iterable of the same length as `tokens` that maps each token-array
            to the index of the original text in `texts`.
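
        Example (an illustrative sketch with made-up sizes): with
        ``embedding_ctx_length=5`` and ``chunk_size=2``, two input texts of 8 and
        3 tokens are split so that ``tokens`` holds three chunks (a 5-token and a
        3-token chunk of text 0, then all of text 1), ``indices`` is ``[0, 0, 1]``,
        and ``_iter`` is ``range(0, 3, 2)``, i.e. the starting index of each batch
        of ``chunk_size`` chunks.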
        """
        tokens: List[Union[List[int], str]] = []
        indices: List[int] = []
        model_name = self.tiktoken_model_name or self.model

        # If the tiktoken flag is set to False, fall back to a HuggingFace tokenizer.
        if not self.tiktoken_enabled:
            try:
                from transformers import AutoTokenizer
            except ImportError:
                raise ValueError(
                    "Could not import transformers python package. "
                    "This is needed for OpenAIEmbeddings to work without "
                    "`tiktoken`. Please install it with `pip install transformers`. "
                )

            tokenizer = AutoTokenizer.from_pretrained(
                pretrained_model_name_or_path=model_name
            )
            for i, text in enumerate(texts):
                # Tokenize the text using the HuggingFace transformers tokenizer.
                tokenized: List[int] = tokenizer.encode(
                    text, add_special_tokens=False
                )

                # Split tokens into chunks respecting embedding_ctx_length, then
                # convert each chunk back into text.
                for j in range(0, len(tokenized), self.embedding_ctx_length):
                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
                    chunk_text = tokenizer.decode(token_chunk)
                    tokens.append(chunk_text)
                    indices.append(i)
        else:
            try:
                encoding = tiktoken.encoding_for_model(model_name)
            except KeyError:
                encoding = tiktoken.get_encoding("cl100k_base")
            encoder_kwargs: Dict[str, Any] = {
                k: v
                for k, v in {
                    "allowed_special": self.allowed_special,
                    "disallowed_special": self.disallowed_special,
                }.items()
                if v is not None
            }
            for i, text in enumerate(texts):
                if self.model.endswith("001"):
                    # Replace newlines, which can negatively affect performance
                    # with the older -001 models.
                    text = text.replace("\n", " ")

                if encoder_kwargs:
                    token = encoding.encode(text, **encoder_kwargs)
                else:
                    token = encoding.encode_ordinary(text)

                # Split tokens into chunks respecting the embedding_ctx_length.
                for j in range(0, len(token), self.embedding_ctx_length):
                    tokens.append(token[j : j + self.embedding_ctx_length])
                    indices.append(i)

        if self.show_progress_bar:
            try:
                from tqdm.auto import tqdm

                _iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
            except ImportError:
                _iter = range(0, len(tokens), chunk_size)
        else:
            _iter = range(0, len(tokens), chunk_size)
        return _iter, tokens, indices

    def _get_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """
        Generate length-safe embeddings for a list of texts.

        This method handles tokenization and embedding generation, respecting the
        set embedding context length and chunk size. It supports both tiktoken
        and HuggingFace tokenizer based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        """
        _chunk_size = chunk_size or self.chunk_size
        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
        batched_embeddings: List[List[float]] = []
        for i in _iter:
            response = self.client.create(
                input=tokens[i : i + _chunk_size], **self._invocation_params
            )
            if not isinstance(response, dict):
                response = response.model_dump()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        embeddings = _process_batched_chunked_embeddings(
            len(texts), tokens, batched_embeddings, indices, self.skip_empty
        )
        _cached_empty_embedding: Optional[List[float]] = None

        def empty_embedding() -> List[float]:
            nonlocal _cached_empty_embedding
            if _cached_empty_embedding is None:
                average_embedded = self.client.create(
                    input="", **self._invocation_params
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.model_dump()
                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
            return _cached_empty_embedding

        return [e if e is not None else empty_embedding() for e in embeddings]

    async def _aget_len_safe_embeddings(
        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
    ) -> List[List[float]]:
        """
        Asynchronously generate length-safe embeddings for a list of texts.

        This method handles tokenization and asynchronous embedding generation,
        respecting the set embedding context length and chunk size. It supports both
        `tiktoken` and HuggingFace `tokenizer` based on the tiktoken_enabled flag.

        Args:
            texts (List[str]): A list of texts to embed.
            engine (str): The engine or model to use for embeddings.
            chunk_size (Optional[int]): The size of chunks for processing embeddings.

        Returns:
            List[List[float]]: A list of embeddings for each input text.
        """
        _chunk_size = chunk_size or self.chunk_size
        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
        batched_embeddings: List[List[float]] = []
        for i in range(0, len(tokens), _chunk_size):
            response = await self.async_client.create(
                input=tokens[i : i + _chunk_size], **self._invocation_params
            )
            if not isinstance(response, dict):
                response = response.model_dump()
            batched_embeddings.extend(r["embedding"] for r in response["data"])

        embeddings = _process_batched_chunked_embeddings(
            len(texts), tokens, batched_embeddings, indices, self.skip_empty
        )
        _cached_empty_embedding: Optional[List[float]] = None

        async def empty_embedding() -> List[float]:
            nonlocal _cached_empty_embedding
            if _cached_empty_embedding is None:
                average_embedded = await self.async_client.create(
                    input="", **self._invocation_params
                )
                if not isinstance(average_embedded, dict):
                    average_embedded = average_embedded.model_dump()
                _cached_empty_embedding = average_embedded["data"][0]["embedding"]
            return _cached_empty_embedding

        return [
            e if e is not None else await empty_embedding() for e in embeddings
        ]

    def embed_documents(
        self, texts: List[str], chunk_size: int | None = None
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
        """
        chunk_size_ = chunk_size or self.chunk_size
        if not self.check_embedding_ctx_length:
            embeddings: List[List[float]] = []
            for i in range(0, len(texts), chunk_size_):
                response = self.client.create(
                    input=texts[i : i + chunk_size_], **self._invocation_params
                )
                if not isinstance(response, dict):
                    response = response.model_dump()
                embeddings.extend(r["embedding"] for r in response["data"])
            return embeddings

        # NOTE: to keep things simple, we assume the list may contain texts longer
        #       than the maximum context and use the length-safe embedding function.
        engine = cast(str, self.deployment)
        return self._get_len_safe_embeddings(
            texts, engine=engine, chunk_size=chunk_size
        )

    async def aembed_documents(
        self, texts: List[str], chunk_size: int | None = None
    ) -> List[List[float]]:
        """Call out to OpenAI's embedding endpoint async for embedding search docs.

        Args:
            texts: The list of texts to embed.
            chunk_size: The chunk size of embeddings. If None, will use the chunk size
                specified by the class.

        Returns:
            List of embeddings, one for each text.
        """
        chunk_size_ = chunk_size or self.chunk_size
        if not self.check_embedding_ctx_length:
            embeddings: List[List[float]] = []
            for i in range(0, len(texts), chunk_size_):
                response = await self.async_client.create(
                    input=texts[i : i + chunk_size_], **self._invocation_params
                )
                if not isinstance(response, dict):
                    response = response.model_dump()
                embeddings.extend(r["embedding"] for r in response["data"])
            return embeddings

        # NOTE: to keep things simple, we assume the list may contain texts longer
        #       than the maximum context and use the length-safe embedding function.
        engine = cast(str, self.deployment)
        return await self._aget_len_safe_embeddings(
            texts, engine=engine, chunk_size=chunk_size
        )

    def embed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        return self.embed_documents([text])[0]

    async def aembed_query(self, text: str) -> List[float]:
        """Call out to OpenAI's embedding endpoint async for embedding query text.

        Args:
            text: The text to embed.

        Returns:
            Embedding for the text.
        """
        embeddings = await self.aembed_documents([text])
        return embeddings[0]