o
    Zh{/                     @  s   d dl mZ d dlZd dlZd dlmZmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZmZmZmZmZmZ d dlmZmZ eeZedd	d
ZG dd	 d	eeZG dd deZ G dd de!e
Z"eddG dd dZ#dddZ$dS )    )annotationsN)ABCabstractmethod)	dataclass)Enum)AbstractSetAnyCallable
CollectionIterableListLiteralOptionalSequenceTypeTypeVarUnion)BaseDocumentTransformerDocumentTSTextSplitter)boundc                   @  s   e Zd ZdZddedddfdBddZedCddZ	dDdEddZdFd"d#Z	dGd'd(Z
dHd+d,ZedId0d1Zed2de d3fdJd=d>ZdKd@dAZdS )Lr   z)Interface for splitting text into chunks.i     FT
chunk_sizeintchunk_overlaplength_functionCallable[[str], int]keep_separator$Union[bool, Literal['start', 'end']]add_start_indexboolstrip_whitespacereturnNonec                 C  sF   ||krt d| d| d|| _|| _|| _|| _|| _|| _dS )ad  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator and where to place it
                            in each corresponding chunk (True='start')
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_functionZ_keep_separator_add_start_index_strip_whitespace)selfr   r   r   r   r    r"    r,   T/var/www/html/lang_env/lib/python3.10/site-packages/langchain_text_splitters/base.py__init__!   s   
zTextSplitter.__init__textstr	List[str]c                 C  s   dS )z$Split text into multiple components.Nr,   )r+   r/   r,   r,   r-   
split_textB   s    zTextSplitter.split_textNtexts	metadatasOptional[List[dict]]List[Document]c                 C  s   |pi gt | }g }t|D ]=\}}d}d}| |D ]/}	t|| }
| jr@|| | j }||	td|}||
d< t |	}t	|	|
d}|
| qq|S )z&Create documents from a list of texts.r   start_index)page_contentmetadata)len	enumerater2   copydeepcopyr)   r'   findmaxr   append)r+   r3   r4   Z
_metadatas	documentsir/   indexZprevious_chunk_lenchunkr9   offsetZnew_docr,   r,   r-   create_documentsF   s    	zTextSplitter.create_documentsrA   Iterable[Document]c                 C  s:   g g }}|D ]}| |j | |j q| j||dS )zSplit documents.)r4   )r@   r8   r9   rF   )r+   rA   r3   r4   docr,   r,   r-   split_documentsZ   s
   
zTextSplitter.split_documentsdocs	separatorOptional[str]c                 C  s(   | |}| jr| }|dkrd S |S )N )joinr*   strip)r+   rJ   rK   r/   r,   r,   r-   
_join_docsb   s   
zTextSplitter._join_docssplitsIterable[str]c           
      C  sz  |  |}g }g }d}|D ]}|  |}|| t|dkr|nd | jkr|| jkr6td| d| j  t|dkr| ||}	|	d urK||	 || jkse|| t|dkr[|nd | jkr|dkr||  |d t|dkrt|nd 8 }|dd  }|| jkse|| t|dkr|nd | jkr|dkse|| ||t|dkr|nd 7 }q| ||}	|	d ur||	 |S )Nr   zCreated a chunk of size z%, which is longer than the specified    )r(   r:   r&   loggerwarningrP   r@   r'   )
r+   rQ   rK   Zseparator_lenrJ   Zcurrent_doctotald_lenrH   r,   r,   r-   _merge_splitsk   sN   







zTextSplitter._merge_splits	tokenizerr   kwargsc                   sZ   zddl m} t |stdd fdd	}W n ty#   td
w | dd|i|S )z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBaser/   r0   r#   r   c                   s   t  | S Nr:   encoder/   rZ   r,   r-   _huggingface_tokenizer_length   s   zNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_lengthz`Could not import transformers python package. Please install it with `pip install transformers`.r   Nr/   r0   r#   r   r,   )Ztransformersr\   
isinstancer%   ImportError)clsrZ   r[   r\   rb   r,   ra   r-   from_huggingface_tokenizer   s   
z'TextSplitter.from_huggingface_tokenizergpt2allrf   Type[TS]encoding_name
model_nameallowed_special'Union[Literal['all'], AbstractSet[str]]disallowed_special&Union[Literal['all'], Collection[str]]r   c           	        s   zddl }W n ty   tdw |dur||n||d fdd	}t| tr<|| d
}i ||}| dd|i|S )z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.r/   r0   r#   r   c                   s   t j|  dS N)rm   ro   r^   r`   rm   ro   encr,   r-   _tiktoken_encoder   s   z=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder)rk   rl   rm   ro   r   rc   r,   )tiktokenre   encoding_for_modelget_encoding
issubclassTokenTextSplitter)	rf   rk   rl   rm   ro   r[   ru   rt   extra_kwargsr,   rr   r-   from_tiktoken_encoder   s&   


	z"TextSplitter.from_tiktoken_encoderSequence[Document]c                 K  s   |  t|S )z2Transform sequence of documents by splitting them.)rI   list)r+   rA   r[   r,   r,   r-   transform_documents   s   z TextSplitter.transform_documents)r   r   r   r   r   r   r   r   r    r!   r"   r!   r#   r$   r/   r0   r#   r1   r]   )r3   r1   r4   r5   r#   r6   )rA   rG   r#   r6   )rJ   r1   rK   r0   r#   rL   )rQ   rR   rK   r0   r#   r1   )rZ   r   r[   r   r#   r   )rf   rj   rk   r0   rl   rL   rm   rn   ro   rp   r[   r   r#   r   )rA   r|   r[   r   r#   r|   )__name__
__module____qualname____doc__r:   r.   r   r2   rF   rI   rP   rY   classmethodrg   setr{   r~   r,   r,   r,   r-   r      s2    !


	*+c                      s8   e Zd ZdZdde dfd fddZdddZ  ZS )ry   z/Splitting text to tokens using model tokenizer.rh   Nri   rk   r0   rl   rL   rm   rn   ro   rp   r[   r   r#   r$   c                   sj   t  jdi | zddl}W n ty   tdw |dur%||}n||}|| _|| _|| _dS )zCreate a new TextSplitter.r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r,   )	superr.   ru   re   rv   rw   
_tokenizer_allowed_special_disallowed_special)r+   rk   rl   rm   ro   r[   ru   rs   	__class__r,   r-   r.      s   	

zTokenTextSplitter.__init__r/   r1   c                   s2   d
 fdd}t  j j jj|d}t||dS )a  Splits the input text into smaller chunks based on tokenization.

        This method uses a custom tokenizer configuration to encode the input text
        into tokens, processes the tokens in chunks of a specified size with overlap,
        and decodes them back into text chunks. The splitting is performed using the
        `split_text_on_tokens` function.

        Args:
            text (str): The input text to be split into smaller chunks.

        Returns:
            List[str]: A list of text chunks, where each chunk is derived from a portion
            of the input text based on the tokenization and chunking rules.
        _textr0   r#   	List[int]c                   s    j j|  j jdS rq   )r   r_   r   r   )r   r+   r,   r-   _encode  s
   z-TokenTextSplitter.split_text.<locals>._encode)r   tokens_per_chunkdecoder_   )r/   rZ   N)r   r0   r#   r   )	Tokenizerr'   r&   r   r   split_text_on_tokens)r+   r/   r   rZ   r,   r   r-   r2      s   zTokenTextSplitter.split_text)rk   r0   rl   rL   rm   rn   ro   rp   r[   r   r#   r$   r   )r   r   r   r   r   r.   r2   __classcell__r,   r,   r   r-   ry      s    ry   c                   @  sx   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdZdS )Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlZsolcsharpcobolcluaperlhaskellelixir
powershellN)r   r   r   r   ZCPPZGOZJAVAZKOTLINZJSr   PHPPROTOPYTHONZRSTZRUBYZRUSTZSCALAZSWIFTMARKDOWNZLATEXHTMLZSOLZCSHARPCOBOLCZLUAZPERLZHASKELLZELIXIRZ
POWERSHELLr,   r,   r,   r-   r     s8    r   T)frozenc                   @  s8   e Zd ZU dZded< 	 ded< 	 ded< 	 ded< d	S )
r   zTokenizer data class.r   r   r   zCallable[[List[int]], str]r   zCallable[[str], List[int]]r_   N)r   r   r   r   __annotations__r,   r,   r,   r-   r   ;  s   
 r   r/   r0   rZ   r#   r1   c                 C  s   g }| | }d}t||j t|}||| }|t|k rN||| |t|kr0	 |S ||j|j 7 }t||j t|}||| }|t|k s|S )z6Split incoming text and return chunks using tokenizer.r   )r_   minr   r:   r@   r   r   )r/   rZ   rQ   Z	input_idsZ	start_idxZcur_idxZ	chunk_idsr,   r,   r-   r   I  s   
r   )r/   r0   rZ   r   r#   r1   )%
__future__r   r<   loggingabcr   r   dataclassesr   enumr   typingr   r   r	   r
   r   r   r   r   r   r   r   r   Zlangchain_core.documentsr   r   	getLoggerr   rT   r   r   ry   r0   r   r   r   r,   r,   r,   r-   <module>   s"    8
 @?