o
    /ifO                     @   s  d dl Z d dlmZmZmZmZ d dlZd dlmZ d dl	m
Z
 d dlmZmZmZmZ d dlmZ d dlmZ d dlmZ d d	lmZmZ d d
lmZ d dlmZmZmZmZ d dlm Z m!Z! d dl"m#Z#m$Z$m%Z%m&Z& d dl'm(Z( eG dd dZ)dedefddZ*deeef dee de)fddZ+				d8deeef dee, dee- dee dee. deeef fddZ/	d9d eeeef  dee d!e-d"e-d#e-dee) fd$d%Z0	d9d eeeef  dee d!e-d"e-d#e-dee) fd&d'Z1	(d:deeef dee d)e-fd*d+Z2		(	(	(	(		d;d eeeef  dee d,eee.ee.e,e3f f  d)e-d-e-d.e-d/e-d"e-d!e-fd0d1Z4d2e)fd3d4Z5d5ee) de6fd6d7Z7dS )<    N)ListOptionalUnionDict)	dataclass)process_hyperparameters)drop_and_copyget_or_create_event_loopshould_ignore_errorsshould_use_cache)capture_evaluation_run)
BaseMetric)measure_metrics_with_indicator)LLMTestCaseConversationalTestCase)PYTEST_RUN_TEST_NAME)test_run_managerLLMApiTestCaseConversationalApiTestCaseMetricMetadata)get_is_running_deepevalset_indicator)test_run_cache_managerCacheCachedTestCaseCachedMetricData)get_trace_stackc                   @   sV   e Zd ZU dZeed< ee ed< eed< eed< eed< ee ed< ee ed< d	S )

TestResultzReturned from run_testsuccessmetricsinputactual_outputexpected_outputcontextretrieval_contextN)	__name__
__module____qualname____doc__bool__annotations__r   r   str r,   r,   J/var/www/html/corbot_env/lib/python3.10/site-packages/deepeval/evaluate.pyr   $   s   
 r   metricreturnc                 C   s\   | j d urt| j| jd d d| j| j| j | jd	S t| j| j| j| j| 	 | j| jd | jd	S )NF)	r.   	thresholdscorereasonr   
strictModeevaluationModelerrorevaluationCost)	r.   r1   r0   r2   r   r3   r4   r5   r6   )
r5   r   r%   r0   strict_modeevaluation_modelevaluation_costr1   r2   is_successful)r.   r,   r,   r-   create_metric_metadata1   s.   
r;   	test_caser   c              	   C   sF   t | tr| jt| jd  }n| }t|j||j|j|j|j	|j
dS )N   )r   r   r    r!   r"   r#   r$   )
isinstancer   messageslenr   r   r    r!   r"   r#   r$   )r<   r   tcr,   r,   r-   create_test_resultL   s   
rB   Findex
is_messageadditional_metadatacommentsc           	         s   t  trC|rd }d| }|}| _| _d }nd}ttd| } j}t }t	| j
 j j j j|d d d | j j|dS t  tretttd| dd dd  j fddt jD d	S d S )
Nmessage_T
test_case_)namer    actualOutputexpectedOutputr#   retrievalContextr   metricsMetadatarunDurationr6   orderadditionalMetadatarF   
traceStackconversational_test_case_r   c                    s$   g | ]\}}t ||d  j jqS T)create_api_test_caserE   rF   ).0irA   r<   r,   r-   
<listcomp>   s    z(create_api_test_case.<locals>.<listcomp>)rI   r   rM   rN   r6   rO   	testCases)r>   r   rE   rF   osgetenvr   _dataset_rankr   r   r    r!   r"   r#   r$   r   r   	enumerater?   )	r<   rC   rD   rE   rF   r   rI   rO   rQ   r,   rW   r-   rT   `   sV   




rT   
test_casesignore_errors	use_cachesave_to_diskc                 C   s,  g }|dkt _|t_t }t| D ] \}}td t|tr)t	|j
d }	nd}	d }
|r;t|tr;t ||j}
t||}t }t }|D ]}d }|
d ur\t||
}|r\|j}|d u rd|_zt|tro|j
|	 }n|}|| W n ty } z|rt||_d|_n W Y d }~nd }~ww t|}t|tr|||	 n|| |jd u rt|trt|}d|_t|t|d}|j !| qIt }|| }||_"t#|| t|trt $|||j t j$|||jdd t%|t&|d	d
g}|!| W d    n	1 sw   Y  q|S )NF	test caser=   r   metric_metadatametric_configurationTto_tempmodel
embeddings)'r   disable_write_cacher   ra   get_test_runr]   r   r>   r   r@   r?   r   get_cached_test_casehyperparametersrT   r   timeperf_counterr   get_metric_datare   
async_modemeasure	Exceptionr+   r5   r   r;   updater9   r   create_metric_configurationcached_metrics_dataappendrun_durationupdate_test_runcache_test_caserB   r   )r^   r   r_   r`   ra   test_resultstest_runrC   r<   last_message_indexcached_test_caseapi_test_casenew_cached_test_casetest_start_timer.   re   cached_metric_datarA   ecache_metric_metadataupdated_cached_metric_datatest_end_timery   test_resultr,   r,   r-   execute_test_cases   s   







dr   c              	      s  g }|dkt _|t_t }t| D ]\}}td d }	|r,t|tr,t 	||j
}	t||}
t }t }t|||	|I d H  |D ]<}t|}t|tr[|
|t|jd  n|
| |jd u rt|trt|}d|_t|t|d}|j| qDt }|| }||
_t|
| t|trt |||j
 t j|||j
dd t|
t |dd	g}|| W d    n1 sw   Y  q|S )
NFrb   r=   r   rd   Trg   ri   rj   )!r   rk   r   ra   rl   r]   r   r>   r   rm   rn   rT   r   ro   rp   r   r;   r   ru   r@   r?   r5   r9   r   r   rv   rw   rx   ry   rz   r{   rB   r   )r^   r   r_   r`   ra   r|   r}   rC   r<   r   r   r   r   r.   re   r   r   r   ry   r   r,   r,   r-   a_execute_test_cases  s   





Jr   T	run_asyncc              	   C   s   |D ]}t |tstdq|r&t }|t| g|t t t dd }nt	| g|t t t dd }|j
srg }|jD ]"}|jd urJ|| q=z| sT|| W q=   || Y q=ddd |D }td| dd S )	N/Provided 'metric' must be of type 'BaseMetric'.r_   r`   ra   r   z, c                 S   s8   g | ]}|j  d |j d|j d|j d|j d
qS )	 (score: , threshold: 
, strict: 	, error: ))r%   r1   r0   r7   r5   rU   r.   r,   r,   r-   rX     s    *zassert_test.<locals>.<listcomp>z	Metrics: z failed.)r>   r   	TypeErrorr	   run_until_completer   r
   r   r   r   r   r   r5   rx   r:   joinAssertionError)r<   r   r   r.   loopr   failed_metricsfailed_metrics_strr,   r,   r-   assert_testl  sX   




r   rn   show_indicatorprint_resultswrite_cachec	              
   C   s(  |d ur| dd u s| dd u rtdt|}t| |D ]}	t|	ts+tdq t  t	
 }
|r:td td# |rQt }|t| ||||d}n	t| ||||d}W d    n1 sdw   Y  t	
 }||
 }|r|D ]}t| qut| t }||_t  tj|dd	 |S )
Nri   zprompt templatezTA `model` and `prompt template` key must be provided when logging `hyperparameters`.r   zEvaluating test cases...z
evaluate()r   F)display_table)get
ValueErrorr   r   r>   r   r   r   resetro   rp   printr   r	   r   r   r   print_test_resultaggregate_metric_pass_ratesrl   rn   save_test_runwrap_up_test_run)r^   r   rn   r   r   r   r   r`   r_   r.   
start_timer   r|   end_timery   r   r}   r,   r,   r-   evaluate  s`   



r   r   c                 C   st  t d t d t d | jD ]x}d}|jd urd}nz| s"d}W n   d}Y |sNt d|j d|j d|j d	|j d
|j d|j	 d|j d n!t d|j d|j d|j d	|j d
|j d|j	 d|j d |j
r|j
 D ]\}}t d| d| d qwqt d t d t d| j  t d| j  t d| j  t d| j  t d| j  d S )N zG======================================================================
zMetrics Summary
TFu     - ❌ r   r   r   z, evaluation model: z
, reason: r   r   u     - ✅ z      - zFor test case:
z  - input: z  - actual output: z  - expected output: z  - context: z  - retrieval context: )r   r   r5   r:   r%   r1   r0   r7   r8   r2   score_breakdownitemsr    r!   r"   r#   r$   )r   r.   
successfulmetric_namer1   r,   r,   r-   r     s>   

<<r   r|   c                    s   i  i | D ]+}|j D ]%}|jj}| vrd |< d|<  |  d7  < |jr0|  d7  < qq fdd D }td td | D ]\}}t| d|dd	 qHtd |S )
Nr   r=   c                    s   i | ]}||  |  qS r,   r,   r   metric_countsmetric_successesr,   r-   
<dictcomp>  s    z/aggregate_metric_pass_rates.<locals>.<dictcomp>zH
======================================================================
zOverall Metric Pass Rates
z: z.2%z
 pass rate)r   	__class__r%   r   r   r   )r|   resultr.   r   metric_pass_rates	pass_rater,   r   r-   r     s,   
	r   )NFNN)FrS   )NTTTTFF)8rZ   typingr   r   r   r   ro   dataclassesr   !deepeval.test_run.hyperparametersr   deepeval.utilsr   r	   r
   r   deepeval.telemetryr   deepeval.metricsr   deepeval.metrics.indicatorr   deepeval.test_caser   r   deepeval.constantsr   deepeval.test_runr   r   r   r   r   r   deepeval.test_run.cacher   r   r   r   deepeval.tracingr   r   r;   rB   intr)   r+   rT   r   r   r   floatr   r   dictr   r,   r,   r,   r-   <module>   s    




E
x
\

:	
E&