o
    if
                     @   s  d dl Z d dlZd dlZd dlZd dlZd dlmZ d dlmZ	 d dl
mZ ejddddZedZeed ejdd	gd
ZejdddZe Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd  Z e!d!krd dl"Z"e"# Z$e$j%d"d#d$ e$& Z'eeeeeeefZ(ee'j)Z*e+e*Z,e(D ]Z-e e*e-Z.e.e, d% Z/e0  e0e-j! e0d&1e/ qdS dS )'    N)	Tokenizer)PathenF)languageclean	char_spansentencizeren_core_web_smner)disabletokenize)lang
processorsc                 C   s   t | dS )N
)	blingfiretext_to_sentencessplittext r   S/var/www/html/corbot_env/lib/python3.10/site-packages/benchmarks/genia_benchmark.pyblingfire_tokenize   s   r   c                 C   s
   t | S N)nltksent_tokenizer   r   r   r   nltk_tokenize   s   
r   c                 C   s   t | }dd |D S )Nc                 S   s   g | ]}|  qS r   )strip).0sr   r   r   
<listcomp>   s    z"pysbd_tokenize.<locals>.<listcomp>)pysbd_segmentersegment)r   segmentsr   r   r   pysbd_tokenize   s   
r#   c                 C      dd t | jD S )Nc                 S      g | ]}|j d qS r   r   r   r   sentr   r   r   r   !       z"spacy_tokenize.<locals>.<listcomp>)nlpsentsr   r   r   r   spacy_tokenize       r-   c                 C   r$   )Nc                 S   r%   r&   r'   r(   r   r   r   r   $   r*   z&spacy_dep_tokenize.<locals>.<listcomp>)nlp_depr,   r   r   r   r   spacy_dep_tokenize#   r.   r0   c                 C   r$   )Nc                 S   s   g | ]}|j qS r   r   )r   er   r   r   r   '   s    z#stanza_tokenize.<locals>.<listcomp>)
stanza_nlp	sentencesr   r   r   r   stanza_tokenize&   r.   r4   c                 c   s*    | D ]}d dd |D  V  qd S )N c                 s   s    | ]}t |V  qd S r   )str)r   tokenr   r   r   	<genexpr>+   s    z!make_sentences.<locals>.<genexpr>)joinr   )segmented_tokenssentencer   r   r   make_sentences)   s   r<   c                 C   s.   t | }tt|}dd t|D }|S )Nc                 S   s   g | ]}|qS r   r   r(   r   r   r   r   0   s    z#syntok_tokenize.<locals>.<listcomp>)syntok_tokenizerr   syntok_segmenteriterr<   )r   tokensresultr"   r   r   r   syntok_tokenize-   s   
rB   c              	   C   s   t | d}t|}g }t|ddD ]*\}}t|}|  }W d    n1 s,w   Y  |d}|||f q|S )Nz**/*.txt   )startr   )	r   globlist	enumerateopenreadr   r   append)genia_raw_dirtxtfilesall_docsindtxtfilef	geniatextexpectedr   r   r   load_genia_corpus3   s   

rS   c                 C   s.   d}| D ]\}}||}||kr|d7 }q|S )Nr   rC   r   )docstokenize_funccorrectr   rR   r"   r   r   r   	benchmark?   s   rW   __main__z--geniaz,Path to the directory containing genia data.)helpd   zGENIA abstract acc: {:0.2f}%)2r   r   pysbdspacystanzasyntok.tokenizerr   syntok.segmenter	segmenterr>   pathlibr   	Segmenterr    blankr+   add_pipecreate_pipeloadr/   Pipeliner2   r=   r   r   r#   r-   r0   r4   r<   rB   rS   rW   __name__argparseArgumentParserparseradd_argument
parse_argsargs	librariesgeniarT   lentotalrU   rV   percent_scoreprintformatr   r   r   r   <module>   sd    





