o
    ifT                     @   s<   d dl Z d dlmZ d dlmZmZmZ G dd deZ	dS )    N)Text)PDFHTML
CleanRulesc                   @   s   e Zd Zd ddZdd Zdd Zdd	 Zd
d Zdd Zdd Z	dd Z
dd Zdd Zdd Zdd Zdd Zdd Zdd ZdS )!CleanerNc                 C   s   || _ || _|| _d S N)textlangdoc_type)selfr   r	   r
    r   F/var/www/html/corbot_env/lib/python3.10/site-packages/pysbd/cleaner.py__init__	   s   
zCleaner.__init__c                 C   s   | j s| j S |   |   |   |   t| j jtj | _ | 	  t| j t
j| _ |   |   |   |   | j S r   )r   remove_all_newlinesreplace_double_newlinesreplace_newlinesreplace_escaped_newlinesr   applyr   Allreplace_punctuation_in_bracketscrInlineFormattingRuleclean_quotationsclean_table_of_contents'check_for_no_space_in_between_sentencesclean_consecutive_charactersr   r   r   r   clean   s   zCleaner.cleanc                 C   s   |    |   d S r   )$remove_newline_in_middle_of_sentence remove_newline_in_middle_of_wordr   r   r   r   r      s   zCleaner.remove_all_newlinesc                 C      dd }t d|| j| _d S )Nc                 S   s   |   } ttjd| }|S )N )groupresubr   #NEWLINE_IN_MIDDLE_OF_SENTENCE_REGEXmatchr$   r   r   r   replace_w_blank#   s   zECleaner.remove_newline_in_middle_of_sentence.<locals>.replace_w_blankz
(?:[^\.])*r#   r$   r   )r   r(   r   r   r   r   "   s   z,Cleaner.remove_newline_in_middle_of_sentencec                 C   s   t | jtj| _d S r   )r   r   r   r   NewLineInMiddleOfWordRuler   r   r   r   r   )   s   z(Cleaner.remove_newline_in_middle_of_wordc                 C      t | jtjtj| _d S r   )r   r   r   r   DoubleNewLineWithSpaceRuleDoubleNewLineRuler   r   r   r   r   ,   s   
zCleaner.replace_double_newlinesc                 C   s    t | jtjtjtj| _d S r   )r   r   r   r   NewLineFollowedByBulletRuler   NewLineInMiddleOfSentenceRule%NewLineInMiddleOfSentenceNoSpacesRuler   r   r   r   remove_pdf_line_breaks0   s   zCleaner.remove_pdf_line_breaksc                 C   s2   | j dkr|   d S t| jtjtj| _d S )Npdf)r
   r1   r   r   r   r   NewLineFollowedByPeriodRule$ReplaceNewlineWithCarriageReturnRuler   r   r   r   r   6   s   
zCleaner.replace_newlinesc                 C   s$   t | jtjtjtjtj| _d S r   )r   r   r   r   EscapedNewLineRuleEscapedCarriageReturnRuleTypoEscapedNewLineRuleTypoEscapedCarriageReturnRuler   r   r   r   r   >   s   z Cleaner.replace_escaped_newlinesc                 C   r    )Nc                 S   s,   |   } d| v rttdd| }|S | S )N?u   &ᓷ&)r"   r#   r$   escaper&   r   r   r   replace_punctF   s
   z>Cleaner.replace_punctuation_in_brackets.<locals>.replace_punctz\[(?:[^\]])*\]r)   )r   r;   r   r   r   r   E   s   z'Cleaner.replace_punctuation_in_bracketsc                 C   s.   t dd| j| _t| jtjtj| _d S )N`')r#   r$   r   r   r   r   QuotationsFirstRuleQuotationsSecondRuler   r   r   r   r   N   s
   

zCleaner.clean_quotationsc                 C   s    t | jtjtjtj| _d S r   )r   r   r   r   TableOfContentsRuleConsecutivePeriodsRuleConsecutiveForwardSlashRuler   r   r   r   r   W   s
   

zCleaner.clean_table_of_contentsc                    sR   t | s|S t fddtjD r|S t |}t t  ||}|S )Nc                 3   s    | ]}| v V  qd S r   r   ).0kwordr   r   	<genexpr>`   s    z9Cleaner.search_for_connected_sentences.<locals>.<genexpr>)	r#   searchanyr   URL_EMAIL_KEYWORDSr   r   r$   r:   )r   rF   txtregexrulenew_wordr   rE   r   search_for_connected_sentences]   s   z&Cleaner.search_for_connected_sentencesc                 C   sJ   | j d}|D ]}| || j tjtj| _ | || j tjtj| _ qd S )N )r   splitrO   r    NO_SPACE_BETWEEN_SENTENCES_REGEXNoSpaceBetweenSentencesRule&NO_SPACE_BETWEEN_SENTENCES_DIGIT_REGEX NoSpaceBetweenSentencesDigitRule)r   wordsrF   r   r   r   r   f   s
   z/Cleaner.check_for_no_space_in_between_sentencesc                 C   r+   r   )r   r   r   r   rA   rB   r   r   r   r   r   l   s   

z$Cleaner.clean_consecutive_charactersr   )__name__
__module____qualname__r   r   r   r   r   r   r1   r   r   r   r   r   rO   r   r   r   r   r   r   r      s     
			r   )
r#   pysbd.utilsr   pysbd.clean.rulesr   r   r   r   objectr   r   r   r   r   <module>   s   