o
    Zh                     @   sX   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 G dd deZdS )	    N)Text)ListItemReplacer)ExclamationWords)BetweenPunctuation)AbbreviationReplacerc                   @   s   e Zd Zd%ddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$S )&	ProcessorFc                 C   s   || _ || _|| _dS )an  Process a text - do pre and post processing - to get proper sentences

        Parameters
        ----------
        text : str
            Original text
        language : object
            Language module
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False
        N)textlang	char_span)selfr   r	   r
    r   F/var/www/html/lang_env/lib/python3.10/site-packages/pysbd/processor.py__init__   s   
zProcessor.__init__c                 C   s   | j s| j S | j dd| _ t| j }| | _ |   |   |   |   t| j 	| j
jj| j
j| j
j| _ |  }|S )N
)r   replacer   Zadd_line_breakreplace_abbreviationsreplace_numbersreplace_continuous_punctuation)replace_periods_before_numeric_referencesr   applyr	   ZAbbreviationZWithMultiplePeriodsAndEmailRuleZGeoLocationRuleZFileFormatRulesplit_into_segments)r   Zlipostprocessed_sentsr   r   r   process   s   


zProcessor.processc                 C   s`   t td|}tdd |D s|S g }|D ]}t|t r(|D ]}|| qq|| q|S )zRemove None values and unpack list of list sents

        Parameters
        ----------
        sents : list
            list of sentences

        Returns
        -------
        list
            unpacked and None removed list of sents
        Nc                 s   s    | ]}t |tV  qd S N)
isinstancelist.0sr   r   r   	<genexpr>:   s    z,Processor.rm_none_flatten.<locals>.<genexpr>)r   filteranyr   append)r   sentsZ	new_sentssentr   r   r   r   rm_none_flatten,   s   
zProcessor.rm_none_flattenc                    s        jd} |} fdd|D } fdd|D } |}g }|D ]-}t|j jjj } 	|}|rHt
|trH|| q*t
|trW|D ]}|| qOq* fdd|D }|S )Nr   c                    s,   g | ]}t |j jjg jjjR  qS r   )r   r   r	   ZSingleNewLineRuleZEllipsisRulesAllr   r   r   r   
<listcomp>J   s    z1Processor.split_into_segments.<locals>.<listcomp>c                    s   g | ]}  |qS r   )check_for_punctuationr   r(   r   r   r)   N   s    c                    s   g | ]}t | jjqS r   )r   r   r	   ZSubSingleQuoteRule)r   nsr(   r   r   r)   Z   s    )check_for_parens_between_quotesr   splitr&   r   r   r	   ZSubSymbolsRulesr'   post_process_segmentsr   strr#   r   )r   r$   r   r%   Zpost_process_sentZppsr   r(   r   r   E   s,   





zProcessor.split_into_segmentsc                 C   sv   t |dkrtd|r|S td|r	 t|j| jjj }t| jj	|r1t
| jj|}|S |dd}| S )N   z\A[a-zA-Z]*\Zz\tr    )lenresearchmatchr   r   r	   ZReinsertEllipsisRulesr'   Z"QUOTATION_AT_END_OF_SENTENCE_REGEXr-   Z.SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEXr   stripr   txtr   r   r   r.   ^   s   zProcessor.post_process_segmentsc                 C   "   dd }t | jj|| j| _d S )Nc                 S   s(   |   } tdd| }tdd|}|S )Nz\s(?=\()r   z	(?<=\))\s)groupr3   subr5   Zsub1Zsub2r   r   r   paren_replacey   s   z@Processor.check_for_parens_between_quotes.<locals>.paren_replace)r3   r;   r	   Z"PARENS_BETWEEN_DOUBLE_QUOTES_REGEXr   )r   r=   r   r   r   r,   x      

z)Processor.check_for_parens_between_quotesc                 C   r9   )Nc                 S   s4   |   } ttdd| }ttdd|}|S )N!u   &ᓴ&?u   &ᓷ&)r:   r3   r;   escaper<   r   r   r   continuous_puncs_replace   s   zJProcessor.replace_continuous_punctuation.<locals>.continuous_puncs_replace)r3   r;   r	   ZCONTINUOUS_PUNCTUATION_REGEXr   )r   rB   r   r   r   r      r>   z(Processor.replace_continuous_punctuationc                 C   s   t | jjd| j| _d S )Nu	   ∯\2\r\7)r3   r;   r	   ZNUMBERED_REFERENCE_REGEXr   r(   r   r   r   r      s   

z3Processor.replace_periods_before_numeric_referencesc                 C   s   t dd|}t|dkS )Nz_{3,}r1   r   )r3   r;   r2   r7   r   r   r   consecutive_underscore   s   z Processor.consecutive_underscorec                    s.   t  fdd| jjD r|  }|S  gS )Nc                 3   s    | ]}| v V  qd S r   r   )r   pr8   r   r   r       s    z2Processor.check_for_punctuation.<locals>.<genexpr>)r"   r	   Punctuationsprocess_text)r   r8   r$   r   rE   r   r*      s   
zProcessor.check_for_punctuationc                 C   s   |d | j jvr|d7 }t|}| |}t| j jj|s)t	|j
| j jj }t	|j
| j jg| j jjR  }t| }| |}|S )Nu   ȸ)r	   rF   r   Zapply_rulesbetween_punctuationr3   r5   ZDoublePunctuationRulesZDoublePunctuationr   r   r'   ZQuestionMarkInQuotationRuleZExclamationPointRulesr   Zreplace_parenssentence_boundary_punctuationr7   r   r   r   rG      s   


zProcessor.process_textc                 C   s   t | jj| jjj | _d S r   )r   r   r   r	   ZNumbersr'   r(   r   r   r   r      s   zProcessor.replace_numbersc                 C   s,   t | jdr| j| j| jS t| j| jS )Nr   )hasattrr	   r   r   r(   r   r   r   abbreviations_replacer   s   z Processor.abbreviations_replacerc                 C   s   |    | _d S r   )rL   r   r   r(   r   r   r   r      s   zProcessor.replace_abbreviationsc                 C   s    t | jdr| j|S t|S )Nr   )rK   r	   r   r7   r   r   r   between_punctuation_processor   s   z'Processor.between_punctuation_processorc                 C   s   |  | }|S r   )rM   r   r7   r   r   r   rI      s   zProcessor.between_punctuationc                 C   sh   t | jdrt|| jj}t | jdrt|| jj}tdd|}dd t| jj	|D }|S )NReplaceColonBetweenNumbersRule#ReplaceNonSentenceBoundaryCommaRuleu   &ᓴ&$r?   c                 S   s   g | ]}|  qS r   )r:   )r   mr   r   r   r)      s    z;Processor.sentence_boundary_punctuation.<locals>.<listcomp>)
rK   r	   r   r   rN   rO   r3   r;   finditerZSENTENCE_BOUNDARY_REGEXr7   r   r   r   rJ      s   z'Processor.sentence_boundary_punctuationN)F)__name__
__module____qualname__r   r   r&   r   r.   r,   r   r   rC   r*   rG   r   rL   r   rM   rI   rJ   r   r   r   r   r   	   s$    
		r   )r3   Zpysbd.utilsr   Zpysbd.lists_item_replacerr   Zpysbd.exclamation_wordsr   Zpysbd.between_punctuationr   Zpysbd.abbreviation_replacerr   objectr   r   r   r   r   <module>   s   