from __future__ import annotations

import typing as t
from dataclasses import dataclass, field

import numpy as np
from datasets import Dataset, concatenate_datasets
from langchain_core.embeddings import Embeddings as LangchainEmbeddings
from langchain_core.language_models import BaseLanguageModel as LangchainLLM

from ragas._analytics import EvaluationEvent, track
from ragas.callbacks import new_group
from ragas.embeddings.base import (
    BaseRagasEmbeddings,
    LangchainEmbeddingsWrapper,
    embedding_factory,
)
from ragas.exceptions import ExceptionInRunner
from ragas.executor import Executor
from ragas.llms import llm_factory
from ragas.llms.base import BaseRagasLLM, LangchainLLMWrapper
from ragas.metrics._answer_correctness import AnswerCorrectness
from ragas.metrics.base import Metric, MetricWithEmbeddings, MetricWithLLM
from ragas.metrics.critique import AspectCritique
from ragas.run_config import RunConfig
from ragas.utils import get_feature_language
from ragas.validation import (
    handle_deprecated_ground_truths,
    remap_column_names,
    validate_column_dtypes,
    validate_evaluation_modes,
)

if t.TYPE_CHECKING:
    from langchain_core.callbacks import Callbacks


def evaluate(
    dataset: Dataset,
    metrics: list[Metric] | None = None,
    llm: t.Optional[BaseRagasLLM | LangchainLLM] = None,
    embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings] = None,
    callbacks: Callbacks = None,
    is_async: bool = False,
    run_config: t.Optional[RunConfig] = None,
    raise_exceptions: bool = True,
    column_map: t.Optional[t.Dict[str, str]] = None,
) -> Result:
    """
    Run the evaluation on the dataset with different metrics

    Parameters
    ----------
    dataset : Dataset[question: list[str], contexts: list[list[str]], answer: list[str], ground_truth: list[list[str]]]
        The dataset in the format of ragas which the metrics will use to score the RAG
        pipeline with
    metrics : list[Metric], optional
        List of metrics to use for evaluation. If not provided then ragas will run the
        evaluation on the best set of metrics to give a complete view.
    llm: BaseRagasLLM, optional
        The language model to use for the metrics. If not provided then ragas will use
        the default language model for metrics which require an LLM. This can be
        overridden by the llm specified at the metric level with `metric.llm`.
    embeddings: BaseRagasEmbeddings, optional
        The embeddings to use for the metrics. If not provided then ragas will use
        the default embeddings for metrics which require embeddings. This can be
        overridden by the embeddings specified at the metric level with `metric.embeddings`.
    callbacks: Callbacks, optional
        Lifecycle Langchain Callbacks to run during evaluation. Check the
        [langchain documentation](https://python.langchain.com/docs/modules/callbacks/)
        for more information.
    is_async: bool, optional
        Whether to run the evaluation in async mode or not. If set to True then the
        evaluation is run by calling the `metric.ascore` method. In case the llm or
        embeddings do not support async then the evaluation can be run in sync mode
        with `is_async=False`. Default is False.
    run_config: RunConfig, optional
        Configuration for runtime settings like timeout and retries. If not provided,
        default values are used.
    raise_exceptions: True
        Whether to raise exceptions or not. If set to True then the evaluation will
        raise an exception if any of the metrics fail. If set to False then the
        evaluation will return `np.nan` for the row that failed. Default is True.
    column_map : dict[str, str], optional
        The column names of the dataset to use for evaluation. If the column names of
        the dataset are different from the default ones then you can provide the
        mapping as a dictionary here. Example: If the dataset column name is contexts_v1,
        column_map can be given as {"contexts": "contexts_v1"} (see the second example below).

    Returns
    -------
    Result
        Result object containing the scores of each metric. You can use this to do analysis
        later.

    Raises
    ------
    ValueError
        if validation fails because the columns required for the metrics are missing or
        if the columns are of the wrong format.

    Examples
    --------
    The basic usage is as follows:
    ```
    >>> from ragas import evaluate

    >>> dataset
    Dataset({
        features: ['question', 'ground_truth', 'answer', 'contexts'],
        num_rows: 30
    })

    >>> result = evaluate(dataset)
    >>> print(result)
    {'context_precision': 0.817,
    'faithfulness': 0.892,
    'answer_relevancy': 0.874}
    ```
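    A second, illustrative sketch (the names here are hypothetical:
    `my_langchain_llm` stands for any LangChain language model you have
    already constructed, and `contexts_v1` follows the column-mapping example
    above) showing how to pick metrics, remap a column, override the default
    LLM, and turn the scores into a DataFrame:
    ```
    >>> from ragas.metrics import faithfulness
    >>> result = evaluate(
    ...     dataset,
    ...     metrics=[faithfulness],
    ...     llm=my_langchain_llm,
    ...     column_map={"contexts": "contexts_v1"},
    ... )
    >>> df = result.to_pandas()
    ```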
    NzProvide dataset!r   )answer_relevancycontext_precisioncontext_recallfaithfulnessr+   c                   s   g | ]}|  qS  )init.0mr6   r8   G/var/www/html/lang_env/lib/python3.10/site-packages/ragas/evaluation.py
<listcomp>   s    zevaluate.<locals>.<listcomp>Z
EvaluatingT)ZdescZkeep_progress_barr-   r+   zragas evaluation)nameZinputsr(   r)   zrow c              
     s.   g | ]} j |j|j d  dqS )-r?   )ZsubmitZascorer?   )r;   metric)executorir)   rowrow_group_cmr8   r=   r>      s    )scoresr!   binary_columnsc                 S  s   g | ]}|j qS r8   rA   r:   r8   r8   r=   r>      s    c                 S  s   g | ]}t |qS r8   r   r:   r8   r8   r=   r>      s    c                 S  s   g | ]}|d ur|qS )Nr8   r:   r8   r8   r=   r>      s    Z
evaluation )Z
event_typer"   Zevaluation_modeZnum_rowslanguage)2
ValueErrorr   Zragas.metricsr2   r3   r4   r5   r   r   r   r   
isinstanceLangchainLLMr   LangchainEmbeddingsr   	enumerater   appendr?   r   r$   r   r   r&   r   r   Zanswer_similarityr   r   tcastDictstrAnyresultsr   lenZendedZon_chain_end	ExceptionZon_chain_errorr1   r   	from_listnpuniquer
   r	   shape) r!   r"   r$   r&   r(   r)   r+   r-   r.   r2   r3   r4   r5   Zbinary_metricsZllm_changedZembeddings_changedZanswer_correctness_is_setrB   Zrow_run_managersZevaluation_rmZevaluation_group_cmZrow_rmrG   rV   _sjr<   eresultZmetrics_namesZmetric_langr8   )rC   rD   r)   rE   rF   r+   r=   evaluate(   s   S


















	rb   c                   @  sR   e Zd ZU ded< dZded< eedZded< d	d
 ZddddZ	dddZ
dS )r1   r   rG   Nzt.Optional[Dataset]r!   )default_factoryzt.List[str]rH   c                 C  sX   g }| j d  D ] }t| j | }|| |< || jvr)tt|}||d  q	d S )Nr   g|=)	rG   keysrZ   ZnanmeanrH   rQ   rR   floatrP   )selfvaluesZcnvaluer8   r8   r=   __post_init__  s   
zResult.__post_init__F
batch_size
int | Nonebatchedr*   c                 C  sP   | j d u r	td| jjd | j jd ksJ t| j | jgdd}|j||dS )Nz-dataset is not provided for the results classr      )Zaxis)rj   rl   )r!   rK   rG   r\   r   	to_pandas)rf   rj   rl   Z	result_dsr8   r8   r=   rn     s
   
zResult.to_pandasr0   rT   c                 C  s,   |   }dd | D }dd| d S )Nc                 S  s"   g | ]\}}d | d|dqS )'z': z0.4fr8   )r;   kvr8   r8   r=   r>   $  s   " z#Result.__repr__.<locals>.<listcomp>{z, })copyitemsjoin)rf   rG   Z
score_strsr8   r8   r=   __repr__"  s   zResult.__repr__)NF)rj   rk   rl   r*   )r0   rT   )__name__
__module____qualname____annotations__r!   r   listrH   ri   rn   rw   r8   r8   r8   r=   r1     s   
 	)NNNNFNTN)r!   r   r"   r#   r$   r%   r&   r'   r(   r    r)   r*   r+   r,   r-   r*   r.   r/   r0   r1   );
__future__r   typingrQ   dataclassesr   r   numpyrZ   Zdatasetsr   r   Zlangchain_core.embeddingsr   rN   Zlangchain_core.language_modelsr   rM   Zragas._analyticsr	   r
   Zragas.callbacksr   Zragas.embeddings.baser   r   r   Zragas.exceptionsr   Zragas.executorr   Z
ragas.llmsr   Zragas.llms.baser   r   Z!ragas.metrics._answer_correctnessr   Zragas.metrics.baser   r   r   Zragas.metrics.critiquer   Zragas.run_configr   Zragas.utilsr   Zragas.validationr   r   r   r   TYPE_CHECKINGZlangchain_core.callbacksr    rb   dictr1   r8   r8   r8   r=   <module>   sD     d
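

# ---------------------------------------------------------------------------
# Illustrative usage sketch (an addition for clarity, not recovered from the
# module itself): the one-row dataset and the metric choice below are
# hypothetical, and running it needs a reachable default LLM, which for ragas
# is typically configured through an OpenAI API key.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from ragas.metrics import faithfulness

    demo_ds = Dataset.from_dict(
        {
            "question": ["When was the first Super Bowl played?"],
            "answer": ["The first Super Bowl was played on January 15, 1967."],
            "contexts": [
                ["The first AFL-NFL World Championship Game was played on January 15, 1967."]
            ],
        }
    )
    demo_result = evaluate(demo_ds, metrics=[faithfulness])
    print(demo_result)  # e.g. {'faithfulness': 1.0000}
    print(demo_result.to_pandas().head())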