o
    if                     @   sL   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ G dd de	Z
dS )    N)Language)	Processor)Cleaner)TextSpanc                   @   s6   e Zd ZdddZdd Zdd	 Zd
d Zdd ZdS )	SegmenterenFNc                 C   sX   || _ t|| _|| _|| _|| _| jr| jrtd| jdkr(| js*tddS dS )a  Segments a text into an list of sentences
        with or withour character offsets from original text

        Parameters
        ----------
        language : str, required
            specify a language use its two character ISO 639-1 code,
            by default "en"
        clean : bool, optional
            cleans original text, by default False
        doc_type : [type], optional
            Normal text or OCRed text, by default None
            set to `pdf` for OCRed text
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False
        zWchar_span must be False if clean is True. Since `clean=True` will modify original text.pdfzl`doc_type='pdf'` should have `clean=True` & `char_span` should be False since originaltext will be modified.N)languager   get_language_codelanguage_modulecleandoc_type	char_span
ValueError)selfr	   r   r   r    r   H/var/www/html/corbot_env/lib/python3.10/site-packages/pysbd/segmenter.py__init__   s   zSegmenter.__init__c                 C   4   t | jdr| jj|| j| jdS t|| j| jdS )Nr   )r   )hasattrr   r   r   r   textr   r   r   cleaner,   s
   zSegmenter.cleanerc                 C   r   )Nr   )r   )r   r   r   r   r   r   r   r   	processor3   s   zSegmenter.processorc           	      C   sl   g }d}|D ]-}t dt || jD ]}| }| \}}||kr2|t||| |} nqq|S )Nr   z{0}\s*)	refinditerformatescapeoriginal_textgroupspanappendr   )	r   	sentences
sent_spansprior_end_char_idxsentmatch	match_strmatch_start_idxmatch_end_idxr   r   r   sentences_with_char_spans;   s   
z#Segmenter.sentences_with_char_spansc                 C   sf   || _ |sg S | js| jdkr| | }| | }| |}| jr'|S | jr,|S dd |D S )Nr   c                 S   s   g | ]}|j qS r   )r%   ).0textspanr   r   r   
<listcomp>`   s    z%Segmenter.segment.<locals>.<listcomp>)r   r   r   r   r   processr*   r   )r   r   postprocessed_sentssentence_w_char_spansr   r   r   segmentO   s   
zSegmenter.segment)r   FNF)__name__
__module____qualname__r   r   r   r*   r1   r   r   r   r   r   	   s    
!r   )r   pysbd.languagesr   pysbd.processorr   pysbd.cleanerr   pysbd.utilsr   objectr   r   r   r   r   <module>   s   