o
    Zh 	                     @   s~  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ	 d dl
mZ ejddddZedZeed ejdd	gd
ZejdddZe Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd ZeeZ dd Z!e"dkrd dl#Z#eeeeeeefZ$e$D ]4Z%e## Z&e'd D ]Z(e!ee%Z)qe## e& Z*e+  e+e%j" e+d!,e) e+d",e*d# d   qdS dS )$    N)	Tokenizer)GOLDEN_EN_RULESenF)languagecleanZ	char_spanZsentencizerZen_core_web_smZner)disabletokenize)langZ
processorsc                 C   s   t | dS )N
)	blingfireZtext_to_sentencessplittext r   U/var/www/html/lang_env/lib/python3.10/site-packages/benchmarks/benchmark_sbd_tools.pyblingfire_tokenize   s   r   c                 C   s
   t | S N)nltkZsent_tokenizer   r   r   r   nltk_tokenize   s   
r   c                 C   s   t | }dd |D S )Nc                 S   s   g | ]}|  qS r   )strip).0sr   r   r   
<listcomp>   s    z"pysbd_tokenize.<locals>.<listcomp>)pysbd_segmentersegment)r   segmentsr   r   r   pysbd_tokenize   s   
r   c                 C      dd t | jD S )Nc                 S      g | ]}|j qS r   r   r   sentr   r   r   r   !       z"spacy_tokenize.<locals>.<listcomp>)nlpsentsr   r   r   r   spacy_tokenize       r$   c                 C   r   )Nc                 S   r   r   r   r   r   r   r   r   $   r!   z&spacy_dep_tokenize.<locals>.<listcomp>)nlp_depr#   r   r   r   r   spacy_dep_tokenize#   r%   r'   c                 C   r   )Nc                 S   r   r   r   )r   er   r   r   r   '   r!   z#stanza_tokenize.<locals>.<listcomp>)
stanza_nlpZ	sentencesr   r   r   r   stanza_tokenize&   r%   r*   c                 c   s*    | D ]}d dd |D  V  qd S )N c                 s   s    | ]}t |V  qd S r   )str)r   tokenr   r   r   	<genexpr>+   s    z!make_sentences.<locals>.<genexpr>)joinr   )Zsegmented_tokensZsentencer   r   r   make_sentences)   s   r0   c                 C   s.   t | }tt|}dd t|D }|S )Nc                 S   s   g | ]}|qS r   r   r   r   r   r   r   0   s    z#syntok_tokenize.<locals>.<listcomp>)syntok_tokenizerr   syntok_segmenteriterr0   )r   tokensresultr   r   r   r   syntok_tokenize-   s   
r6   c                 C   s>   d}| D ]}|\}}||}||kr|d7 }q|t  d }|S )Nr      g      Y@)total_rules)Zgolden_rulestokenize_funcZscoreruler   expectedr   percent_scorer   r   r   	benchmark6   s   r=   __main__d   zGRS score: {:0.2f}%z&Speed(Avg over 100 runs): {:>10.2f} msi  )-r   r   ZpysbdZspacyZstanzaZsyntok.tokenizerr   Zsyntok.segmenterZ	segmenterr2   Zenglish_golden_rulesr   Z	Segmenterr   blankr"   Zadd_pipeZcreate_pipeloadr&   ZPipeliner)   r1   r   r   r   r$   r'   r*   r0   r6   lenr8   r=   __name__timeZ	librariesr9   trangeir<   Z
time_takenprintformatr   r   r   r   <module>   sZ    


