o
    Zh                     @   sL   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ G dd de	Z
dS )    N)Language)	Processor)Cleaner)TextSpanc                   @   s6   e Zd ZdddZdd Zdd	 Zd
d Zdd ZdS )	SegmenterenFNc                 C   sX   || _ t|| _|| _|| _|| _| jr| jrtd| jdkr(| js*tddS dS )a  Segments a text into an list of sentences
        with or withour character offsets from original text

        Parameters
        ----------
        language : str, required
            specify a language use its two character ISO 639-1 code,
            by default "en"
        clean : bool, optional
            cleans original text, by default False
        doc_type : [type], optional
            Normal text or OCRed text, by default None
            set to `pdf` for OCRed text
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False
        zWchar_span must be False if clean is True. Since `clean=True` will modify original text.pdfzl`doc_type='pdf'` should have `clean=True` & `char_span` should be False since originaltext will be modified.N)languager   Zget_language_codelanguage_modulecleandoc_type	char_span
ValueError)selfr	   r   r   r    r   F/var/www/html/lang_env/lib/python3.10/site-packages/pysbd/segmenter.py__init__   s   zSegmenter.__init__c                 C   4   t | jdr| jj|| j| jdS t|| j| jdS )Nr   )r   )hasattrr
   r   r   r   textr   r   r   cleaner,   s
   zSegmenter.cleanerc                 C   r   )Nr   )r   )r   r
   r   r   r   r   r   r   	processor3   s   zSegmenter.processorc           	      C   sl   g }d}|D ]-}t dt || jD ]}| }| \}}||kr2|t||| |} nqq|S )Nr   z{0}\s*)	refinditerformatescapeoriginal_textgroupspanappendr   )	r   Z	sentencesZ
sent_spansZprior_end_char_idxsentmatchZ	match_strZmatch_start_idxZmatch_end_idxr   r   r   sentences_with_char_spans;   s   
z#Segmenter.sentences_with_char_spansc                 C   sf   || _ |sg S | js| jdkr| | }| | }| |}| jr'|S | jr,|S dd |D S )Nr   c                 S   s   g | ]}|j qS r   )r!   ).0Ztextspanr   r   r   
<listcomp>`   s    z%Segmenter.segment.<locals>.<listcomp>)r   r   r   r   r   processr#   r   )r   r   Zpostprocessed_sentsZsentence_w_char_spansr   r   r   segmentO   s   
zSegmenter.segment)r   FNF)__name__
__module____qualname__r   r   r   r#   r'   r   r   r   r   r   	   s    
!r   )r   Zpysbd.languagesr   Zpysbd.processorr   Zpysbd.cleanerr   Zpysbd.utilsr   objectr   r   r   r   r   <module>   s   