o
    if                     @   sX   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 G dd deZdS )	    N)Text)ListItemReplacer)ExclamationWords)BetweenPunctuation)AbbreviationReplacerc                   @   s   e Zd Zd%ddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd Zd d! Zd"d# Zd$S )&	ProcessorFc                 C   s   || _ || _|| _dS )an  Process a text - do pre and post processing - to get proper sentences

        Parameters
        ----------
        text : str
            Original text
        language : object
            Language module
        char_span : bool, optional
            Get start & end character offsets of each sentences
            within original text, by default False
        N)textlang	char_span)selfr   r	   r
    r   H/var/www/html/corbot_env/lib/python3.10/site-packages/pysbd/processor.py__init__   s   
zProcessor.__init__c                 C   s   | j s| j S | j dd| _ t| j }| | _ |   |   |   |   t| j 	| j
jj| j
j| j
j| _ |  }|S )N
)r   replacer   add_line_breakreplace_abbreviationsreplace_numbersreplace_continuous_punctuation)replace_periods_before_numeric_referencesr   applyr	   AbbreviationWithMultiplePeriodsAndEmailRuleGeoLocationRuleFileFormatRulesplit_into_segments)r   lipostprocessed_sentsr   r   r   process   s   


zProcessor.processc                 C   s`   t td|}tdd |D s|S g }|D ]}t|t r(|D ]}|| qq|| q|S )zRemove None values and unpack list of list sents

        Parameters
        ----------
        sents : list
            list of sentences

        Returns
        -------
        list
            unpacked and None removed list of sents
        Nc                 s   s    | ]}t |tV  qd S N)
isinstancelist.0sr   r   r   	<genexpr>:   s    z,Processor.rm_none_flatten.<locals>.<genexpr>)r"   filteranyr!   append)r   sents	new_sentssentr%   r   r   r   rm_none_flatten,   s   
zProcessor.rm_none_flattenc                    s        jd} |} fdd|D } fdd|D } |}g }|D ]-}t|j jjj } 	|}|rHt
|trH|| q*t
|trW|D ]}|| qOq* fdd|D }|S )Nr   c                    s,   g | ]}t |j jjg jjjR  qS r   )r   r   r	   SingleNewLineRuleEllipsisRulesAllr#   r   r   r   
<listcomp>J   s    z1Processor.split_into_segments.<locals>.<listcomp>c                    s   g | ]}  |qS r   )check_for_punctuationr#   r1   r   r   r2   N   s    c                    s   g | ]}t | jjqS r   )r   r   r	   SubSingleQuoteRule)r$   nsr1   r   r   r2   Z   s    )check_for_parens_between_quotesr   splitr-   r   r   r	   SubSymbolsRulesr0   post_process_segmentsr!   strr)   r"   )r   r*   r   r,   post_process_sentppsr   r1   r   r   E   s,   





zProcessor.split_into_segmentsc                 C   sv   t |dkrtd|r|S td|r	 t|j| jjj }t| jj	|r1t
| jj|}|S |dd}| S )N   z\A[a-zA-Z]*\Zz\tr    )lenresearchmatchr   r   r	   ReinsertEllipsisRulesr0   "QUOTATION_AT_END_OF_SENTENCE_REGEXr7   .SPLIT_SPACE_QUOTATION_AT_END_OF_SENTENCE_REGEXr   stripr   txtr   r   r   r9   ^   s   zProcessor.post_process_segmentsc                 C   "   dd }t | jj|| j| _d S )Nc                 S   s(   |   } tdd| }tdd|}|S )Nz\s(?=\()r   z	(?<=\))\s)groupr@   subrB   sub1sub2r   r   r   paren_replacey   s   z@Processor.check_for_parens_between_quotes.<locals>.paren_replace)r@   rK   r	   "PARENS_BETWEEN_DOUBLE_QUOTES_REGEXr   )r   rO   r   r   r   r6   x      

z)Processor.check_for_parens_between_quotesc                 C   rI   )Nc                 S   s4   |   } ttdd| }ttdd|}|S )N!u   &ᓴ&?u   &ᓷ&)rJ   r@   rK   escaperL   r   r   r   continuous_puncs_replace   s   zJProcessor.replace_continuous_punctuation.<locals>.continuous_puncs_replace)r@   rK   r	   CONTINUOUS_PUNCTUATION_REGEXr   )r   rU   r   r   r   r      rQ   z(Processor.replace_continuous_punctuationc                 C   s   t | jjd| j| _d S )Nu	   ∯\2\r\7)r@   rK   r	   NUMBERED_REFERENCE_REGEXr   r1   r   r   r   r      s   

z3Processor.replace_periods_before_numeric_referencesc                 C   s   t dd|}t|dkS )Nz_{3,}r>   r   )r@   rK   r?   rG   r   r   r   consecutive_underscore   s   z Processor.consecutive_underscorec                    s.   t  fdd| jjD r|  }|S  gS )Nc                 3   s    | ]}| v V  qd S r    r   )r$   prH   r   r   r&      s    z2Processor.check_for_punctuation.<locals>.<genexpr>)r(   r	   Punctuationsprocess_text)r   rH   r*   r   rZ   r   r3      s   
zProcessor.check_for_punctuationc                 C   s   |d | j jvr|d7 }t|}| |}t| j jj|s)t	|j
| j jj }t	|j
| j jg| j jjR  }t| }| |}|S )Nu   ȸ)r	   r[   r   apply_rulesbetween_punctuationr@   rB   DoublePunctuationRulesDoublePunctuationr   r   r0   QuestionMarkInQuotationRuleExclamationPointRulesr   replace_parenssentence_boundary_punctuationrG   r   r   r   r\      s   


zProcessor.process_textc                 C   s   t | jj| jjj | _d S r    )r   r   r   r	   Numbersr0   r1   r   r   r   r      s   zProcessor.replace_numbersc                 C   s,   t | jdr| j| j| jS t| j| jS )Nr   )hasattrr	   r   r   r1   r   r   r   abbreviations_replacer   s   z Processor.abbreviations_replacerc                 C   s   |    | _d S r    )rh   r   r   r1   r   r   r   r      s   zProcessor.replace_abbreviationsc                 C   s    t | jdr| j|S t|S )Nr   )rg   r	   r   rG   r   r   r   between_punctuation_processor   s   z'Processor.between_punctuation_processorc                 C   s   |  | }|S r    )ri   r   rG   r   r   r   r_      s   zProcessor.between_punctuationc                 C   sh   t | jdrt|| jj}t | jdrt|| jj}tdd|}dd t| jj	|D }|S )NReplaceColonBetweenNumbersRule#ReplaceNonSentenceBoundaryCommaRuleu   &ᓴ&$rR   c                 S   s   g | ]}|  qS r   )rJ   )r$   mr   r   r   r2      s    z;Processor.sentence_boundary_punctuation.<locals>.<listcomp>)
rg   r	   r   r   rj   rk   r@   rK   finditerSENTENCE_BOUNDARY_REGEXrG   r   r   r   re      s   z'Processor.sentence_boundary_punctuationN)F)__name__
__module____qualname__r   r   r-   r   r9   r6   r   r   rX   r3   r\   r   rh   r   ri   r_   re   r   r   r   r   r   	   s$    
		r   )r@   pysbd.utilsr   pysbd.lists_item_replacerr   pysbd.exclamation_wordsr   pysbd.between_punctuationr   pysbd.abbreviation_replacerr   objectr   r   r   r   r   <module>   s   