o
    Zhw                  	   @   sz  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ	 ej
ddddZedZeed ejddgd	Zejdd
dZe Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zedkrd dlZeeeeeeefZ e D ]<Z!e Z"e#dZ$e$% Z&W d   n1 sw   Y  ee&e!Z'e e" Z(e)  e)e!j e)d *e(d!  q~dS dS )"    N)	TokenizerenF)languagecleanZ	char_spanZsentencizerZen_core_web_smZner)disabletokenize)langZ
processorsc                 C   s   t | dS )N
)	blingfireZtext_to_sentencessplittext r   Y/var/www/html/lang_env/lib/python3.10/site-packages/benchmarks/bigtext_speed_benchmark.pyblingfire_tokenize   s   r   c                 C   s
   t | S N)nltkZsent_tokenizer   r   r   r   nltk_tokenize   s   
r   c                 C   s   t | }dd |D }|S )Nc                 S   s   g | ]}|  qS r   )strip).0sr   r   r   
<listcomp>   s    z"pysbd_tokenize.<locals>.<listcomp>)pysbd_segmentersegment)r   segmentsr   r   r   pysbd_tokenize   s   
r   c                 C      dd t | jD S )Nc                 S      g | ]}|j d qS r	   r   r   r   sentr   r   r   r           z"spacy_tokenize.<locals>.<listcomp>)nlpsentsr   r   r   r   spacy_tokenize      r%   c                 C   r   )Nc                 S   r   r   r   r    r   r   r   r   #   r"   z&spacy_dep_tokenize.<locals>.<listcomp>)nlp_depr$   r   r   r   r   spacy_dep_tokenize"   r&   r(   c                 C   r   )Nc                 S   s   g | ]}|j qS r   r   )r   er   r   r   r   &   s    z#stanza_tokenize.<locals>.<listcomp>)
stanza_nlp	sentencesr   r   r   r   stanza_tokenize%   r&   r,   c                 c   s*    | D ]}d dd |D  V  qd S )N c                 s   s    | ]}t |V  qd S r   )str)r   tokenr   r   r   	<genexpr>*   s    z!make_sentences.<locals>.<genexpr>)joinr   )Zsegmented_tokensZsentencer   r   r   make_sentences(   s   r2   c                 C   s.   t | }tt|}dd t|D }|S )Nc                 S   s   g | ]}|qS r   r   r    r   r   r   r   /   s    z#syntok_tokenize.<locals>.<listcomp>)syntok_tokenizerr   syntok_segmenteriterr2   )r   tokensresultr   r   r   r   syntok_tokenize,   s   
r8   c                 C   s   || }|S r   r   )big_texttokenize_funcr   r   r   r   speed_benchmark2   s   r;   __main__zbenchmarks/1661-0.txtzSpeed : {:>20.2f} msi  )+r
   r   ZpysbdZspacyZstanzaZsyntok.tokenizerr   Zsyntok.segmenterZ	segmenterr4   Z	Segmenterr   blankr#   Zadd_pipeZcreate_pipeloadr'   ZPipeliner*   r3   r   r   r   r%   r(   r,   r2   r8   r;   __name__timeZ	librariesr:   topenZbigfilereadr9   r+   Z
time_takenprintformatr   r   r   r   <module>   sX    
	



