o
    .if                     @  s  d Z ddlmZ ddlZddlZddlZddlZddlmZm	Z	 ddl
mZ ddlmZ ddlmZmZ ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZm Z m!Z! ddl"Z"dd	l#m$Z$m%Z% e&e'Z(ed
ddZ)dddDddZ*dEddZ+G dd de$eZ,G dd  d e,Z-G d!d" d"eZ.G d#d$ d$eZ/G d%d& d&Z0G d'd( d(eZ1G d)d* d*Z2ed+d,G d-d. d.Z3dFd0d1Z4G d2d3 d3e,Z5G d4d5 d5e,Z6G d6d7 d7e7eZ8G d8d9 d9e,Z9G d:d; d;e,Z:G d<d= d=e,Z;G d>d? d?e9Z<G d@dA dAe9Z=G dBdC dCe9Z>dS )Ga  **Text Splitters** are classes for splitting text.


**Class hierarchy:**

.. code-block::

    BaseDocumentTransformer --> TextSplitter --> <name>TextSplitter  # Example: CharacterTextSplitter
                                                 RecursiveCharacterTextSplitter -->  <name>TextSplitter

Note: **MarkdownHeaderTextSplitter** and **HTMLHeaderTextSplitter do not derive from TextSplitter.


**Main helpers:**

.. code-block::

    Document, Tokenizer, Language, LineType, HeaderType

    )annotationsN)ABCabstractmethod)	dataclass)Enum)BytesIOStringIO)AbstractSetAnyCallable
CollectionDictIterableListLiteralOptionalSequenceTupleType	TypedDictTypeVarUnioncast)BaseDocumentTransformerDocumentTSTextSplitter)bound@B 
max_lengthpipelinestrr    intreturnr
   c                C  sh   zdd l }W n ty   tdw | dkr&ddlm} | }|d |S |j| ddgd}||_|S )Nr   zCSpacy is not installed, please install it with `pip install spacy`.sentencizer)Englishnertagger)exclude)spacyImportErrorspacy.lang.enr&   add_pipeloadr    )r!   r    r*   r&   r%    r/   P/var/www/html/corbot_env/lib/python3.10/site-packages/langchain/text_splitter.py"_make_spacy_pipeline_for_splitting;   s   
r1   text	separatorkeep_separatorbool	List[str]c                   s   |r<|r5t d| d|   fddtdt dD }t d dkr-| dd  7 } d g| }nt || }nt| }d	d |D S )
N()c                   s    g | ]} |  |d    qS )   r/   ).0i_splitsr/   r0   
<listcomp>W   s     z*_split_text_with_regex.<locals>.<listcomp>r9      r   c                 S  s   g | ]}|d kr|qS ) r/   r:   sr/   r/   r0   r>   _   s    )resplitrangelenlist)r2   r3   r4   splitsr/   r<   r0   _split_text_with_regexO   s   rJ   c                   @  s   e Zd ZdZddedddfdAddZedBddZ	dCdDddZdEd!d"Z	dFd&d'Z
dGd*d+ZedHd/d0Zed1de d2fdId<d=ZdJd?d@ZdS )Kr   z)Interface for splitting text into chunks.i     FT
chunk_sizer#   chunk_overlaplength_functionCallable[[str], int]r4   r5   add_start_indexstrip_whitespacer$   Nonec                 C  sF   ||krt d| d| d|| _|| _|| _|| _|| _|| _dS )a  Create a new TextSplitter.

        Args:
            chunk_size: Maximum size of chunks to return
            chunk_overlap: Overlap in characters between chunks
            length_function: Function that measures the length of given chunks
            keep_separator: Whether to keep the separator in the chunks
            add_start_index: If `True`, includes chunk's start index in metadata
            strip_whitespace: If `True`, strips whitespace from the start and end of
                              every document
        zGot a larger chunk overlap (z) than chunk size (z), should be smaller.N)
ValueError_chunk_size_chunk_overlap_length_function_keep_separator_add_start_index_strip_whitespace)selfrL   rM   rN   r4   rP   rQ   r/   r/   r0   __init__e   s   
zTextSplitter.__init__r2   r"   r6   c                 C  s   dS )z$Split text into multiple components.Nr/   rZ   r2   r/   r/   r0   
split_text   s    zTextSplitter.split_textNtexts	metadatasOptional[List[dict]]List[Document]c                 C  s   |pi gt | }g }t|D ]/\}}d}| |D ]#}t|| }	| jr2|||d }||	d< t||	d}
||
 qq|S )z&Create documents from a list of texts.r@   r9   start_indexpage_contentmetadata)	rG   	enumerater]   copydeepcopyrX   findr   append)rZ   r^   r_   
_metadatas	documentsr;   r2   indexchunkre   new_docr/   r/   r0   create_documents   s   zTextSplitter.create_documentsrl   Iterable[Document]c                 C  s:   g g }}|D ]}| |j | |j q| j||dS )zSplit documents.)r_   )rj   rd   re   rp   )rZ   rl   r^   r_   docr/   r/   r0   split_documents   s
   
zTextSplitter.split_documentsdocsr3   Optional[str]c                 C  s(   | |}| jr| }|dkrd S |S )NrA   )joinrY   strip)rZ   rt   r3   r2   r/   r/   r0   
_join_docs   s   
zTextSplitter._join_docsrI   Iterable[str]c           
      C  sz  |  |}g }g }d}|D ]}|  |}|| t|dkr|nd | jkr|| jkr6td| d| j  t|dkr| ||}	|	d urK||	 || jkse|| t|dkr[|nd | jkr|dkr||  |d t|dkrt|nd 8 }|dd  }|| jkse|| t|dkr|nd | jkr|dkse|| ||t|dkr|nd 7 }q| ||}	|	d ur||	 |S )Nr   zCreated a chunk of size z%, which is longer than the specified r9   )rV   rG   rT   loggerwarningrx   rj   rU   )
rZ   rI   r3   separator_lenrt   current_doctotald_lenrr   r/   r/   r0   _merge_splits   sN   







zTextSplitter._merge_splits	tokenizerr
   kwargsc                   sZ   zddl m} t |stdd fdd	}W n ty#   td
w | dd|i|S )z>Text splitter that uses HuggingFace tokenizer to count length.r   )PreTrainedTokenizerBasezATokenizer received was not an instance of PreTrainedTokenizerBaser2   r"   r$   r#   c                   s   t  | S NrG   encoder2   r   r/   r0   _huggingface_tokenizer_length      zNTextSplitter.from_huggingface_tokenizer.<locals>._huggingface_tokenizer_lengthz`Could not import transformers python package. Please install it with `pip install transformers`.rN   Nr2   r"   r$   r#   r/   )transformersr   
isinstancerS   r+   )clsr   r   r   r   r/   r   r0   from_huggingface_tokenizer   s   
z'TextSplitter.from_huggingface_tokenizergpt2allr   Type[TS]encoding_name
model_nameallowed_special'Union[Literal['all'], AbstractSet[str]]disallowed_special&Union[Literal['all'], Collection[str]]r   c           	        s   zddl }W n ty   tdw |dur||n||d fdd	}t| tr<|| d
}i ||}| dd|i|S )z9Text splitter that uses tiktoken encoder to count length.r   NzCould not import tiktoken python package. This is needed in order to calculate max_tokens_for_prompt. Please install it with `pip install tiktoken`.r2   r"   r$   r#   c                   s   t j|  dS N)r   r   r   r   r   r   encr/   r0   _tiktoken_encoder  s   z=TextSplitter.from_tiktoken_encoder.<locals>._tiktoken_encoder)r   r   r   r   rN   r   r/   )tiktokenr+   encoding_for_modelget_encoding
issubclassTokenTextSplitter)	r   r   r   r   r   r   r   r   extra_kwargsr/   r   r0   from_tiktoken_encoder   s&   


	z"TextSplitter.from_tiktoken_encoderSequence[Document]c                 K     |  t|S )z2Transform sequence of documents by splitting them.)rs   rH   )rZ   rl   r   r/   r/   r0   transform_documents  s   z TextSplitter.transform_documents)rL   r#   rM   r#   rN   rO   r4   r5   rP   r5   rQ   r5   r$   rR   r2   r"   r$   r6   r   )r^   r6   r_   r`   r$   ra   )rl   rq   r$   ra   )rt   r6   r3   r"   r$   ru   )rI   ry   r3   r"   r$   r6   )r   r
   r   r
   r$   r   )r   r   r   r"   r   ru   r   r   r   r   r   r
   r$   r   )rl   r   r   r
   r$   r   )__name__
__module____qualname____doc__rG   r[   r   r]   rp   rs   rx   r   classmethodr   setr   r   r/   r/   r/   r0   r   b   s2     


	*+c                      s0   e Zd ZdZ	dd fddZdddZ  ZS )CharacterTextSplitterz(Splitting text that looks at characters.

Fr3   r"   is_separator_regexr5   r   r
   r$   rR   c                   s"   t  jdi | || _|| _dS )Create a new TextSplitter.Nr/   )superr[   
_separator_is_separator_regex)rZ   r3   r   r   	__class__r/   r0   r[      s   
zCharacterTextSplitter.__init__r2   r6   c                 C  sB   | j r| jnt| j}t||| j}| jrdn| j}| ||S )&Split incoming text and return chunks.rA   )r   r   rD   escaperJ   rW   r   )rZ   r2   r3   rI   r   r/   r/   r0   r]   (  s
   z CharacterTextSplitter.split_text)r   F)r3   r"   r   r5   r   r
   r$   rR   r   r   r   r   r   r[   r]   __classcell__r/   r/   r   r0   r     s
    r   c                   @  s"   e Zd ZU dZded< ded< dS )LineTypezLine type as typed dict.Dict[str, str]re   r"   contentNr   r   r   r   __annotations__r/   r/   r/   r0   r   3  s   
 r   c                   @  s*   e Zd ZU dZded< ded< ded< dS )
HeaderTypezHeader type as typed dict.r#   levelr"   namedataNr   r/   r/   r/   r0   r   :  s
   
 r   c                   @  s4   e Zd ZdZ		ddd	d
ZdddZdddZdS )MarkdownHeaderTextSplitterz4Splitting markdown files based on specified headers.FTheaders_to_split_onList[Tuple[str, str]]return_each_liner5   strip_headersc                 C  s$   || _ t|dd dd| _|| _dS )a  Create a new MarkdownHeaderTextSplitter.

        Args:
            headers_to_split_on: Headers we want to track
            return_each_line: Return each line w/ associated headers
            strip_headers: Strip split headers from the content of the chunk
        c                 S  s   t | d S )Nr   )rG   )rE   r/   r/   r0   <lambda>W  s    z5MarkdownHeaderTextSplitter.__init__.<locals>.<lambda>T)keyreverseN)r   sortedr   r   )rZ   r   r   r   r/   r/   r0   r[   E  s
   

z#MarkdownHeaderTextSplitter.__init__linesList[LineType]r$   ra   c                 C  s   g }|D ]e}|r!|d d |d kr!|d d  d|d  7  < q|rd|d d |d krdt |d d t |d k rd|d d dd d dkrd| jsd|d d  d|d  7  < |d |d d< q|| qdd	 |D S )
zCombine lines with common metadata into chunks
        Args:
            lines: Line of text / associated header metadata
        r@   re   r     

r   #c                 S      g | ]}t |d  |d dqS r   re   rc   r   r:   rn   r/   r/   r0   r>         zHMarkdownHeaderTextSplitter.aggregate_lines_to_chunks.<locals>.<listcomp>)rG   rE   r   rj   )rZ   r   aggregated_chunksliner/   r/   r0   aggregate_lines_to_chunks\  s&   	z4MarkdownHeaderTextSplitter.aggregate_lines_to_chunksr2   r"   c                 C  s  | d}g }g }i }g }i }d}d}	|D ]}
|
 }|s8|dr.|ddkr.d}d}	n|dr7d}d}	n	||	rAd}d}	|rI|| q| jD ]}\}}||rt|t|kse|t| dkr|d	ur|d
}|r|d d |kr| }|d |v r||d  |r|d d |ksx|||t|d	  d}|| |d ||< |r|d||	 d |
  | js||  nqL|r|| n|r|d||	 d |
  |	 }q|r|d||d | js| |S dd |D S )zASplit markdown file
        Args:
            text: Markdown filer   FrA   z```r9   Tz~~~ Nr   r@   r   r   )r   r   r   r   )r   re   c                 S  r   r   r   r   r/   r/   r0   r>     r   z9MarkdownHeaderTextSplitter.split_text.<locals>.<listcomp>)rE   rw   
startswithcountrj   r   rG   poprv   rg   clearr   r   r   )rZ   r2   r   lines_with_metadatacurrent_contentcurrent_metadataheader_stackinitial_metadatain_code_blockopening_fencer   stripped_linesepr   current_header_levelpopped_headerheaderr/   r/   r0   r]     s   




 




z%MarkdownHeaderTextSplitter.split_textN)FT)r   r   r   r5   r   r5   )r   r   r$   ra   r2   r"   r$   ra   )r   r   r   r   r[   r   r]   r/   r/   r/   r0   r   B  s    
*r   c                   @  s2   e Zd ZU dZded< ded< ded< ded< dS )	ElementTypezElement type as typed dict.r"   urlxpathr   r   re   Nr   r/   r/   r/   r0   r     s   
 r   c                   @  sF   e Zd ZdZ	ddddZdddZdddZdddZd ddZdS )!HTMLHeaderTextSplitterzU
    Splitting HTML files based on specified headers.
    Requires lxml package.
    Fr   r   return_each_elementr5   c                 C  s   || _ t|| _dS )ay  Create a new HTMLHeaderTextSplitter.

        Args:
            headers_to_split_on: list of tuples of headers we want to track mapped to
                (arbitrary) keys for metadata. Allowed header values: h1, h2, h3, h4,
                h5, h6 e.g. [("h1", "Header 1"), ("h2", "Header 2)].
            return_each_element: Return each element w/ associated headers.
        N)r   r   r   )rZ   r   r   r/   r/   r0   r[     s   zHTMLHeaderTextSplitter.__init__elementsList[ElementType]r$   ra   c                 C  s\   g }|D ]"}|r!|d d |d kr!|d d  d|d  7  < q| | qdd |D S )zCombine elements with common metadata into chunks

        Args:
            elements: HTML element content with associated identifying info and metadata
        r@   re   r   r   c                 S  r   r   r   r   r/   r/   r0   r>   6  r   zGHTMLHeaderTextSplitter.aggregate_elements_to_chunks.<locals>.<listcomp>)rj   )rZ   r   r   elementr/   r/   r0   aggregate_elements_to_chunks  s   z3HTMLHeaderTextSplitter.aggregate_elements_to_chunksr   r"   c                 C  s   t |}| t|jS )zHSplit HTML from web URL

        Args:
            url: web URL
        )requestsgetsplit_text_from_filer   r   )rZ   r   rr/   r/   r0   split_text_from_url;  s   
z*HTMLHeaderTextSplitter.split_text_from_urlr2   c                 C  r   )zJSplit HTML text string

        Args:
            text: HTML text
        )r   r   r\   r/   r/   r0   r]   D  s   z!HTMLHeaderTextSplitter.split_textfiler
   c                   sV  zddl m} W n ty } ztd|d}~ww | }|||}ttjd }||}|	|}||}	|
t|	}
dd | jD  t| jdd	i}g }|
d
|D ]A}|dsf|dr|t|ddd |d|D ddd |d|D fddt fdd|d|D d qZ| js| |S dd |D S )zCSplit HTML file

        Args:
            file: HTML file
        r   )etreez>Unable to import lxml, please install with `pip install lxml`.Nz7document_transformers/xsl/html_chunks_with_headers.xsltc                 S  s   g | ]}|d  qS )r   r/   )r:   r   r/   r/   r0   r>   h  s    z?HTMLHeaderTextSplitter.split_text_from_file.<locals>.<listcomp>hzhttp://www.w3.org/1999/xhtmlz*//*z*[@class='headers']z*[@class='chunk']rA   c                 S     g | ]}|j qS r/   r   r:   noder/   r/   r0   r>   x      z*[@class='xpath']c                 S  r   r/   r   r   r/   r/   r0   r>   ~  r  c                   s   i | ]	} |j  |jqS r/   )tagr2   r   )header_mappingr/   r0   
<dictcomp>  s    z?HTMLHeaderTextSplitter.split_text_from_file.<locals>.<dictcomp>c                   s
   | j  v S r   )r  )x)header_filterr/   r0   r     s   
 z=HTMLHeaderTextSplitter.split_text_from_file.<locals>.<lambda>z*[@class='headers']/*)r   r   r   re   c                 S  r   r   r   r   r/   r/   r0   r>     r   )lxmlr   r+   
HTMLParserparsepathlibPath__file__parentXSLT
fromstringr"   r   dictfindallrj   r   rv   filterr   r   )rZ   r   r   eparsertree	xslt_path	xslt_tree	transformresult
result_domns_mapr   r   r/   )r  r  r0   r   L  sn   









z+HTMLHeaderTextSplitter.split_text_from_fileN)F)r   r   r   r5   )r   r   r$   ra   )r   r"   r$   ra   r   )r   r
   r$   ra   )	r   r   r   r   r[   r   r   r]   r   r/   r/   r/   r0   r     s    


	r   T)frozenc                   @  s8   e Zd ZU dZded< 	 ded< 	 ded< 	 ded< d	S )
	TokenizerzTokenizer data class.r#   rM   tokens_per_chunkzCallable[[List[int]], str]decodezCallable[[str], List[int]]r   Nr   r/   r/   r/   r0   r    s   
 r  r   c                 C  s   g }| | }d}t||j t|}||| }|t|k rN||| |t|kr0	 |S ||j|j 7 }t||j t|}||| }|t|k s|S )z6Split incoming text and return chunks using tokenizer.r   )r   minr  rG   rj   r  rM   )r2   r   rI   	input_ids	start_idxcur_idx	chunk_idsr/   r/   r0   split_text_on_tokens  s   
r%  c                      s8   e Zd ZdZdde dfd fddZdddZ  ZS )r   z/Splitting text to tokens using model tokenizer.r   Nr   r   r"   r   ru   r   r   r   r   r   r
   r$   rR   c                   sj   t  jdi | zddl}W n ty   tdw |dur%||}n||}|| _|| _|| _dS )r   r   NzCould not import tiktoken python package. This is needed in order to for TokenTextSplitter. Please install it with `pip install tiktoken`.r/   )	r   r[   r   r+   r   r   
_tokenizer_allowed_special_disallowed_special)rZ   r   r   r   r   r   r   r   r   r/   r0   r[     s   	

zTokenTextSplitter.__init__r2   r6   c                   2   d	 fdd}t  j j jj|d}t||dS )
N_textr"   r$   	List[int]c                   s    j j|  j jdS r   )r&  r   r'  r(  )r*  rZ   r/   r0   _encode  s
   z-TokenTextSplitter.split_text.<locals>._encoderM   r  r  r   r2   r   )r*  r"   r$   r+  )r  rU   rT   r&  r  r%  )rZ   r2   r-  r   r/   r,  r0   r]     s   zTokenTextSplitter.split_text)r   r"   r   ru   r   r   r   r   r   r
   r$   rR   r   )r   r   r   r   r   r[   r]   r   r/   r/   r   r0   r     s    r   c                      s`   e Zd ZU dZ			dd fddZd ddZd!ddZd"ddZdZde	d< d#ddZ
  ZS )$%SentenceTransformersTokenTextSplitterz8Splitting text to tokens using sentence model tokenizer.2   'sentence-transformers/all-mpnet-base-v2NrM   r#   r   r"   r  Optional[int]r   r
   r$   rR   c                   sn   t  jdi |d|i zddlm} W n ty    tdw || _|| j| _| jj| _| j|d dS )r   rM   r   )SentenceTransformerzCould not import sentence_transformer python package. This is needed in order to for SentenceTransformersTokenTextSplitter. Please install it with `pip install sentence-transformers`.)r  Nr/   )	r   r[   sentence_transformersr4  r+   r   _modelr   _initialize_chunk_configuration)rZ   rM   r   r  r   r4  r   r/   r0   r[     s   
z.SentenceTransformersTokenTextSplitter.__init__c                C  sZ   t t| jj| _|d u r| j| _n|| _| j| jkr+td| j d| j d| j dd S )NzThe token limit of the models 'z' is: z. Argument tokens_per_chunk=z > maximum token limit.)r   r#   r6  max_seq_lengthmaximum_tokens_per_chunkr  rS   r   )rZ   r  r/   r/   r0   r7    s   

zESentenceTransformersTokenTextSplitter._initialize_chunk_configurationr2   r6   c                   r)  )
Nr2   r"   r$   r+  c                   s     | dd S )Nr9   r@   )r-  r   r,  r/   r0   %encode_strip_start_and_stop_token_ids  s   z_SentenceTransformersTokenTextSplitter.split_text.<locals>.encode_strip_start_and_stop_token_idsr.  r/  r2   r"   r$   r+  )r  rU   r  r   r  r%  )rZ   r2   r:  r   r/   r,  r0   r]     s   z0SentenceTransformersTokenTextSplitter.split_textc                C  s   t | |S r   )rG   r-  r\   r/   r/   r0   count_tokens#  r   z2SentenceTransformersTokenTextSplitter.count_tokensl         _max_length_equal_32_bit_integerr+  c                 C  s   | j j|| jdd}|S )Ndo_not_truncate)r    
truncation)r   r   r=  )rZ   r2   &token_ids_with_start_and_end_token_idsr/   r/   r0   r-  (  s   z-SentenceTransformersTokenTextSplitter._encode)r1  r2  N)
rM   r#   r   r"   r  r3  r   r
   r$   rR   )r  r3  r$   rR   r   r   r;  )r   r   r   r   r[   r7  r]   r<  r=  r   r-  r   r/   r/   r   r0   r0    s   
 


r0  c                   @  s`   e Zd ZdZdZdZdZdZdZdZ	dZ
d	Zd
ZdZdZdZdZdZdZdZdZdZdZdZdS )Languagez"Enum of the programming languages.cppgojavakotlinjstsphpprotopythonrstrubyrustscalaswiftmarkdownlatexhtmlsolcsharpcobolN)r   r   r   r   CPPGOJAVAKOTLINJSr   PHPPROTOPYTHONRSTRUBYRUSTSCALASWIFTMARKDOWNLATEXHTMLSOLCSHARPCOBOLr/   r/   r/   r0   rA  1  s,    rA  c                      sZ   e Zd ZdZ			dd fddZdddZd ddZed!ddZe	d"ddZ
  ZS )#RecursiveCharacterTextSplitterzSplitting text by recursively look at characters.

    Recursively tries to split by different characters to find one
    that works.
    NTF
separatorsOptional[List[str]]r4   r5   r   r   r
   r$   rR   c                   s.   t  jdd|i| |pg d| _|| _dS )r   r4   )r   r   r   rA   Nr/   )r   r[   _separatorsr   )rZ   rj  r4   r   r   r   r/   r0   r[   Q  s   
z'RecursiveCharacterTextSplitter.__init__r2   r"   r6   c                 C  s&  g }|d }g }t |D ](\}}| jr|nt|}|dkr"|} nt||r4|}||d d } nq| jr:|nt|}t||| j}	g }
| jrMdn|}|	D ]2}| || jk ra|
	| qQ|
rp| 
|
|}|| g }
|sx|	| qQ| ||}|| qQ|
r| 
|
|}|| |S )r   r@   rA   r9   N)rf   r   rD   r   searchrJ   rW   rV   rT   rj   r   extend_split_text)rZ   r2   rj  final_chunksr3   new_separatorsr;   _sr   rI   _good_splitsrC   merged_text
other_infor/   r/   r0   ro  ]  s@   

z*RecursiveCharacterTextSplitter._split_textc                 C  s   |  || jS r   )ro  rl  r\   r/   r/   r0   r]     r   z)RecursiveCharacterTextSplitter.split_textlanguagerA  c                 K  s   |  |}| d|dd|S )NT)rj  r   r/   )get_separators_for_language)r   rv  r   rj  r/   r/   r0   from_language  s   
z,RecursiveCharacterTextSplitter.from_languagec                 C  s  | t jkr	g dS | t jkrg dS | t jkrg dS | t jkr$g dS | t jkr-g dS | t jkr6g dS | t jkr?g dS | t jkrHg dS | t j	krQg d	S | t j
krZg d
S | t jkrcg dS | t jkrlg dS | t jkrug dS | t jkr~g dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS | t jkrg dS td|  dtt  )N)
class z
void z
int z
float z
double 
if 
for 
while 
switch 
case r   r   r   rA   )
func 
var 
const 
type rz  r{  r}  r~  r   r   r   rA   )ry  
public 
protected 	
private 
static rz  r{  r|  r}  r~  r   r   r   rA   )ry  r  r  r  z

internal z
companion z
fun 
val r  rz  r{  r|  z
when r~  
else r   r   r   rA   )

function r  
let r  ry  rz  r{  r|  r}  r~  	
default r   r   r   rA   )
enum 
interface z
namespace r  ry  r  r  r  r  rz  r{  r|  r}  r~  r  r   r   r   rA   )r  ry  rz  	
foreach r|  
do r}  r~  r   r   r   rA   )
z	
message z	
service r  z
option z
import z
syntax r   r   r   rA   )ry  
def z
	def r   r   r   rA   )z
=+
z
-+
z
\*+
z

.. *

r   r   r   rA   )r  ry  rz  z
unless r|  r{  r  z
begin z
rescue r   r   r   rA   )z
fn r  r  rz  r|  r{  z
loop 
match r  r   r   r   rA   )ry  z
object r  r  r  rz  r{  r|  r  r~  r   r   r   rA   )r  ry  
struct r  rz  r{  r|  r  r}  r~  r   r   r   rA   )	z
#{1,6} z```
z	
\*\*\*+
z
---+
z
___+
r   r   r   rA   )z
\\chapter{z
\\section{z
\\subsection{z
\\subsubsection{z
\\begin{enumerate}z
\\begin{itemize}z
\\begin{description}z
\\begin{list}z
\\begin{quote}z
\\begin{quotation}z
\\begin{verse}z
\\begin{verbatim}z
\egin{align}z$$$r   rA   )z<bodyz<divz<pz<brz<liz<h1z<h2z<h3z<h4z<h5z<h6z<spanz<tablez<trz<tdz<thz<ulz<olz<headerz<footerz<navz<headz<stylez<scriptz<metaz<titlerA   )r  r  z
implements z

delegate 
event ry  z

abstract r  r  r  r  z
return rz  z

continue r{  r  r|  r}  z
break r~  r  z
try z
throw z	
finally z
catch r   r   r   rA   )z
pragma z
using z

contract r  z	
library z
constructor r  r  r  z

modifier z
error r  r  rz  r{  r|  z

do while z

assembly r   r   r   rA   )z
IDENTIFICATION DIVISION.z
ENVIRONMENT DIVISION.z
DATA DIVISION.z
PROCEDURE DIVISION.z
WORKING-STORAGE SECTION.z
LINKAGE SECTION.z
FILE SECTION.z
INPUT-OUTPUT SECTION.z
OPEN z
CLOSE z
READ z
WRITE z
IF z
ELSE z
MOVE z	
PERFORM z
UNTIL z	
VARYING z
ACCEPT z	
DISPLAY z

STOP RUN.r   r   rA   z	Language z& is not supported! Please choose from )rA  rV  rW  rX  rY  rZ  r   r[  r\  r]  r^  r_  r`  ra  rb  rc  rd  re  rg  rf  rh  rS   rH   rv  r/   r/   r0   rw    sZ   


















$
!z:RecursiveCharacterTextSplitter.get_separators_for_language)NTF)
rj  rk  r4   r5   r   r5   r   r
   r$   rR   )r2   r"   rj  r6   r$   r6   r   )rv  rA  r   r
   r$   ri  )rv  rA  r$   r6   )r   r   r   r   r[   ro  r]   r   rx  staticmethodrw  r   r/   r/   r   r0   ri  J  s    

(ri  c                      s0   e Zd ZdZ	dd fddZdddZ  ZS )NLTKTextSplitterz"Splitting text using NLTK package.r   englishr3   r"   rv  r   r
   r$   rR   c                   sP   t  jdi | zddlm} || _W n ty   tdw || _|| _dS )zInitialize the NLTK splitter.r   )sent_tokenizezANLTK is not installed, please install it with `pip install nltk`.Nr/   )r   r[   nltk.tokenizer  r&  r+   r   	_language)rZ   r3   rv  r   r  r   r/   r0   r[   a  s   

zNLTKTextSplitter.__init__r2   r6   c                 C  s   | j || jd}| || jS )r   r  )r&  r  r   r   rZ   r2   rI   r/   r/   r0   r]   q  s   zNLTKTextSplitter.split_text)r   r  )r3   r"   rv  r"   r   r
   r$   rR   r   r   r/   r/   r   r0   r  ^  s
    r  c                      s4   e Zd ZdZ			dd fddZdddZ  ZS )SpacyTextSplitteraR  Splitting text using Spacy package.


    Per default, Spacy's `en_core_web_sm` model is used and
    its default max_length is 1000000 (it is the length of maximum character
    this model takes which can be increased for large files). For a faster, but
    potentially less accurate splitting, you can use `pipeline='sentencizer'`.
    r   en_core_web_smr   r3   r"   r!   r    r#   r   r
   r$   rR   c                   s*   t  jdi | t||d| _|| _dS )z#Initialize the spacy text splitter.r   Nr/   )r   r[   r1   r&  r   )rZ   r3   r!   r    r   r   r/   r0   r[     s
   
zSpacyTextSplitter.__init__r2   r6   c                 C  s$   dd |  |jD }| || jS )r   c                 s  s    | ]}|j V  qd S r   r   rB   r/   r/   r0   	<genexpr>  s    z/SpacyTextSplitter.split_text.<locals>.<genexpr>)r&  sentsr   r   r  r/   r/   r0   r]     s   zSpacyTextSplitter.split_text)r   r  r   )
r3   r"   r!   r"   r    r#   r   r
   r$   rR   r   r   r/   r/   r   r0   r  x  s    r  c                      "   e Zd ZdZd fddZ  ZS )	PythonCodeTextSplitterz/Attempts to split the text along Python syntax.r   r
   r$   rR   c                   &   |  tj}t jdd|i| dS )z$Initialize a PythonCodeTextSplitter.rj  Nr/   )rw  rA  r]  r   r[   rZ   r   rj  r   r/   r0   r[        zPythonCodeTextSplitter.__init__r   r
   r$   rR   r   r   r   r   r[   r   r/   r/   r   r0   r        r  c                      r  )	MarkdownTextSplitterz=Attempts to split the text along Markdown-formatted headings.r   r
   r$   rR   c                   r  )z"Initialize a MarkdownTextSplitter.rj  Nr/   )rw  rA  rc  r   r[   r  r   r/   r0   r[     r  zMarkdownTextSplitter.__init__r  r  r/   r/   r   r0   r    r  r  c                      r  )	LatexTextSplitterzAAttempts to split the text along Latex-formatted layout elements.r   r
   r$   rR   c                   r  )zInitialize a LatexTextSplitter.rj  Nr/   )rw  rA  rd  r   r[   r  r   r/   r0   r[     r  zLatexTextSplitter.__init__r  r  r/   r/   r   r0   r    r  r  )r!   r"   r    r#   r$   r
   )r2   r"   r3   r"   r4   r5   r$   r6   )r2   r"   r   r  r$   r6   )?r   
__future__r   rg   loggingr
  rD   abcr   r   dataclassesr   enumr   ior   r   typingr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   langchain_core.documentsr   r   	getLoggerr   rz   r   r1   rJ   r   r   r   r   r   r   r   r  r%  r   r0  r"   rA  ri  r  r  r  r  r  r/   r/   r/   r0   <module>   sX    H

 < >	 
0H    		