o
    Zh                     @  sL   d dl mZ d dlmZmZmZmZ d dlmZm	Z	m
Z
 G dd deZdS )    )annotations)AnyListOptionalcast)TextSplitter	Tokenizersplit_text_on_tokensc                      s`   e Zd ZU dZ			dd fddZd ddZd!ddZd"ddZdZde	d< d#ddZ
  ZS )$%SentenceTransformersTokenTextSplitterz8Splitting text to tokens using sentence model tokenizer.2   'sentence-transformers/all-mpnet-base-v2Nchunk_overlapint
model_namestrtokens_per_chunkOptional[int]kwargsr   returnNonec                   sn   t  jdi |d|i zddlm} W n ty    tdw || _|| j| _| jj| _| j|d dS )zCreate a new TextSplitter.r   r   )SentenceTransformerzCould not import sentence_transformers python package. This is needed in order to for SentenceTransformersTokenTextSplitter. Please install it with `pip install sentence-transformers`.)r   N )	super__init__Zsentence_transformersr   ImportErrorr   _model	tokenizer_initialize_chunk_configuration)selfr   r   r   r   r   	__class__r   e/var/www/html/lang_env/lib/python3.10/site-packages/langchain_text_splitters/sentence_transformers.pyr      s   
z.SentenceTransformersTokenTextSplitter.__init__c                C  sZ   t t| jj| _|d u r| j| _n|| _| j| jkr+td| j d| j d| j dd S )NzThe token limit of the models 'z' is: z. Argument tokens_per_chunk=z > maximum token limit.)r   r   r   Zmax_seq_lengthZmaximum_tokens_per_chunkr   
ValueErrorr   )r   r   r   r   r!   r   #   s   

zESentenceTransformersTokenTextSplitter._initialize_chunk_configurationtext	List[str]c                   s2   d
 fdd}t  j j jj|d}t||dS )a  Splits the input text into smaller components by splitting text on tokens.

        This method encodes the input text using a private `_encode` method, then
        strips the start and stop token IDs from the encoded result. It returns the
        processed segments as a list of strings.

        Args:
            text (str): The input text to be split.

        Returns:
            List[str]: A list of string components derived from the input text after
            encoding and processing.
        r#   r   r   	List[int]c                   s     | dd S )N   )_encode)r#   r   r   r!   %encode_strip_start_and_stop_token_idsD   s   z_SentenceTransformersTokenTextSplitter.split_text.<locals>.encode_strip_start_and_stop_token_ids)r   r   decodeencode)r#   r   Nr#   r   r   r%   )r   Z_chunk_overlapr   r   r+   r	   )r   r#   r*   r   r   r)   r!   
split_text5   s   z0SentenceTransformersTokenTextSplitter.split_textc                C  s   t | |S )ay  Counts the number of tokens in the given text.

        This method encodes the input text using a private `_encode` method and
        calculates the total number of tokens in the encoded result.

        Args:
            text (str): The input text for which the token count is calculated.

        Returns:
            int: The number of tokens in the encoded text.
        )lenr(   )r   r#   r   r   r!   count_tokensP   s   z2SentenceTransformersTokenTextSplitter.count_tokensl         _max_length_equal_32_bit_integerr%   c                 C  s   | j j|| jdd}|S )NZdo_not_truncate)
max_lengthZ
truncation)r   r,   r1   )r   r#   Z&token_ids_with_start_and_end_token_idsr   r   r!   r(   `   s   z-SentenceTransformersTokenTextSplitter._encode)r   r   N)
r   r   r   r   r   r   r   r   r   r   )r   r   r   r   )r#   r   r   r$   )r#   r   r   r   r-   )__name__
__module____qualname____doc__r   r   r.   r0   r1   __annotations__r(   __classcell__r   r   r   r!   r
      s   
 


r
   N)
__future__r   typingr   r   r   r   Zlangchain_text_splitters.baser   r   r	   r
   r   r   r   r!   <module>   s    