"""Utilities for running language models or Chains over datasets."""

from __future__ import annotations

import concurrent.futures
import dataclasses
import functools
import inspect
import logging
import uuid
from datetime import datetime, timezone
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    List,
    Optional,
    Tuple,
    Union,
    cast,
)

from langchain_core._api import warn_deprecated
from langchain_core.callbacks.manager import Callbacks
from langchain_core.language_models import BaseLanguageModel
from langchain_core.messages import BaseMessage, messages_from_dict
from langchain_core.outputs import ChatResult, LLMResult
from langchain_core.runnables import Runnable, RunnableConfig, RunnableLambda
from langchain_core.runnables import config as runnable_config
from langchain_core.runnables import utils as runnable_utils
from langchain_core.tracers.evaluation import (
    EvaluatorCallbackHandler,
    wait_for_all_evaluators,
)
from langchain_core.tracers.langchain import LangChainTracer
from langsmith.client import Client
from langsmith.env import get_git_info, get_langchain_env_var_metadata
from langsmith.evaluation import EvaluationResult, RunEvaluator
from langsmith.evaluation import run_evaluator as run_evaluator_dec
from langsmith.run_helpers import as_runnable, is_traceable_function
from langsmith.schemas import Dataset, DataType, Example, Run, TracerSession
from langsmith.utils import LangSmithError
from requests import HTTPError
from typing_extensions import TypedDict

from langchain.chains.base import Chain
from langchain.evaluation.loading import load_evaluator
from langchain.evaluation.schema import (
    EvaluatorType,
    PairwiseStringEvaluator,
    StringEvaluator,
)
from langchain.smith import evaluation as smith_eval
from langchain.smith.evaluation import config as smith_eval_config
from langchain.smith.evaluation import name_generation, progress

if TYPE_CHECKING:
    import pandas as pd

logger = logging.getLogger(__name__)

MODEL_OR_CHAIN_FACTORY = Union[
    Callable[[], Union[Chain, Runnable]],
    BaseLanguageModel,
    Callable[[dict], Any],
    Runnable,
    Chain,
]
MCF = Union[Callable[[], Union[Chain, Runnable]], BaseLanguageModel]


class InputFormatError(Exception):
    """Raised when the input format is invalid."""
class TestResult(dict):
    """A dictionary of the results of a single test run."""

    def get_aggregate_feedback(self) -> pd.DataFrame:
        """Return quantiles for the feedback scores.

        This method calculates and prints the quantiles for the feedback scores
        across all feedback keys.

        Returns:
            A DataFrame containing the quantiles for each feedback key.
        """
        df = self.to_dataframe()
        # Keep only the feedback columns when aggregating.
        to_drop = [
            col
            for col in df.columns
            if col.startswith("inputs.")
            or col.startswith("outputs.")
            or col in {"input", "output", "reference"}
            or col.startswith("reference")
        ]
        return df.describe(include="all").drop(to_drop, axis=1)

    def to_dataframe(self) -> pd.DataFrame:
        """Convert the results to a dataframe."""
        try:
            import pandas as pd
        except ImportError as e:
            raise ImportError(
                "Pandas is required to convert the results to a dataframe."
                " to install pandas, run `pip install pandas`."
            ) from e

        indices = []
        records = []
        for example_id, result in self["results"].items():
            feedback = result["feedback"]
            output_ = result.get("output")
            if isinstance(output_, dict):
                output = {f"outputs.{k}": v for k, v in output_.items()}
            elif output_ is None:
                output = {}
            else:
                output = {"output": output_}
            r = {
                **{f"inputs.{k}": v for k, v in result["input"].items()},
                **output,
            }
            if "reference" in result:
                if isinstance(result["reference"], dict):
                    r.update(
                        {f"reference.{k}": v for k, v in result["reference"].items()}
                    )
                else:
                    r["reference"] = result["reference"]
            r.update(
                {
                    **{f"feedback.{f.key}": f.score for f in feedback},
                    "error": result.get("Error"),
                    "execution_time": result["execution_time"],
                    "run_id": result.get("run_id"),
                }
            )
            records.append(r)
            indices.append(example_id)

        return pd.DataFrame(records, index=indices)
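# A minimal usage sketch for TestResult (the "correctness" feedback key and the
# construct_chain factory are illustrative, not part of this module):
#
#     results = run_on_dataset(client, "<my_dataset_name>", construct_chain)
#     df = results.to_dataframe()           # one row per dataset example
#     df["feedback.correctness"].mean()     # feedback columns use a "feedback." prefix
#     results.get_aggregate_feedback()      # quantiles across all feedback columns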
class EvalError(dict):
    """Your architecture raised an error."""

    def __init__(self, Error: BaseException, **kwargs: Any) -> None:
        super().__init__(Error=Error, **kwargs)

    def __getattr__(self, name: str) -> Any:
        try:
            return self[name]
        except KeyError:
            raise AttributeError(f"'EvalError' object has no attribute '{name}'")
tt| fd	dS z|  }W n& ty}   tt| }t|}td
| d t|fdd Y S w tt| t |tr|S t	tt|rt
tt|fddS t |tsfddS S | S )zForgive the user if they pass in a chain without memory instead of a chain
    factory. It's a common mistake. Raise a more helpful error message as well.Na$  Cannot directly evaluate a chain with stateful memory. To evaluate this chain, pass in a chain constructor that initializes fresh memory each time it is called.  This will safegaurd against information leakage between dataset examples.
For example:

def chain_constructor():
    new_memory = z(...)
    return z*(memory=new_memory, ...)

run_on_dataset("z", chain_constructor, ...)c                         S Nr;   r;   )chainr;   r<   <lambda>       z(_wrap_in_chain_factory.<locals>.<lambda>c                     r   r   r;   r;   )lcfr;   r<   r      r   c                     r   r   r;   r;   	runnable_r;   r<   r      r   zWrapping function z as RunnableLambda.c                     r   r   r;   r;   )wrappedr;   r<   r      r   c                     r   r   r;   r;   r   r;   r<   r      r   c                     s   t  S r   )r   r;   )constructorr;   r<   r          )rd   r.   rv   r7   Zmemory
ValueErrorr   r   callabler%   r$   r   r   	TypeErrorinspect	signatureloggerinfor   )r   r   Zchain_classZmemory_class_modelZ	user_funcsigr;   )r   r   r   r   r   r<   _wrap_in_chain_factory   sV   











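# Sketch of the factory pattern this helper expects (LLMChain/ChatOpenAI are
# illustrative; the same example appears in the run_on_dataset docstring below).
# A bare chain holding memory is rejected above; a zero-argument constructor
# gives every dataset example a fresh, isolated chain:
#
#     def chain_constructor():
#         llm = ChatOpenAI(temperature=0)
#         return LLMChain.from_string(llm, "What's the answer to {your_input_key}")
#
#     run_on_dataset(client, "<my_dataset>", chain_constructor)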
def _get_prompt(inputs: Dict[str, Any]) -> str:
    """Get prompt from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A string prompt.
    Raises:
        InputFormatError: If the input format is invalid.
    """
    if not inputs:
        raise InputFormatError("Inputs should not be empty.")
    prompts = []
    if "prompt" in inputs:
        if not isinstance(inputs["prompt"], str):
            raise InputFormatError(
                f"Expected string for 'prompt', got {type(inputs['prompt']).__name__}"
            )
        prompts = [inputs["prompt"]]
    elif "prompts" in inputs:
        if not isinstance(inputs["prompts"], list) or not all(
            isinstance(i, str) for i in inputs["prompts"]
        ):
            raise InputFormatError(
                "Expected list of strings for 'prompts',"
                f" got {type(inputs['prompts']).__name__}"
            )
        prompts = inputs["prompts"]
    elif len(inputs) == 1:
        prompt_ = next(iter(inputs.values()))
        if isinstance(prompt_, str):
            prompts = [prompt_]
        elif isinstance(prompt_, list) and all(isinstance(i, str) for i in prompt_):
            prompts = prompt_
        else:
            raise InputFormatError(f"LLM Run expects string prompt input. Got {inputs}")
    else:
        raise InputFormatError(
            f"LLM Run expects 'prompt' or 'prompts' in inputs. Got {inputs}"
        )
    if len(prompts) == 1:
        return prompts[0]
    else:
        raise InputFormatError(
            f"LLM Run expects single prompt input. Got {len(prompts)} prompts."
        )
class ChatModelInput(TypedDict):
    """Input for a chat model.

    Parameters:
        messages: List of chat messages.
    """

    messages: List[BaseMessage]
def _get_messages(inputs: Dict[str, Any]) -> dict:
    """Get Chat Messages from inputs.

    Args:
        inputs: The input dictionary.

    Returns:
        A list of chat messages.
    Raises:
        InputFormatError: If the input format is invalid.
    """
    if not inputs:
        raise InputFormatError("Inputs should not be empty.")
    input_copy = inputs.copy()
    if "messages" in inputs:
        input_copy["input"] = input_copy.pop("messages")
    elif len(inputs) == 1:
        input_copy["input"] = next(iter(inputs.values()))
    if "input" in input_copy:
        raw_messages = input_copy["input"]
        if isinstance(raw_messages, list) and all(
            isinstance(i, dict) for i in raw_messages
        ):
            raw_messages = [raw_messages]
        if len(raw_messages) == 1:
            input_copy["input"] = messages_from_dict(raw_messages[0])
        else:
            raise InputFormatError(
                "Batch messages not supported. Please provide a"
                " single list of messages."
            )
        return input_copy
    else:
        raise InputFormatError(
            "Chat Run expects single List[dict] or List[List[dict]] 'messages'"
            f" input. Got {inputs}"
        )
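# Input shapes accepted by the two extractors above (keys illustrative).
# _get_prompt: {"prompt": "..."}; {"prompts": ["..."]}; or any single-key dict
# whose value is a string. _get_messages: a "messages" key (or single key)
# holding one list of serialized messages, e.g.:
#
#     {"messages": [{"type": "human", "data": {"content": "Hi"}}]}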
def _validate_example_inputs_for_language_model(
    first_example: Example,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    if input_mapper:
        prompt_input = input_mapper(first_example.inputs)
        if not isinstance(prompt_input, str) and not (
            isinstance(prompt_input, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_input)
        ):
            raise InputFormatError(
                "When using an input_mapper to prepare dataset example inputs"
                " for an LLM or chat model, the output must a single string or"
                " a list of chat messages."
                f"\nGot: {prompt_input} of type {type(prompt_input)}."
            )
    else:
        try:
            _get_prompt(first_example.inputs)
        except InputFormatError:
            try:
                _get_messages(first_example.inputs)
            except InputFormatError:
                raise InputFormatError(
                    "Example inputs do not match language model input format. "
                    "Expected a dictionary with messages or a single prompt."
                    f" Got: {first_example.inputs}"
                    " Please update your dataset OR provide an input_mapper"
                    " to convert the example.inputs to a compatible format"
                    " for the llm or chat model you wish to evaluate."
                )


def _validate_example_inputs_for_chain(
    first_example: Example,
    chain: Chain,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    """Validate that the example inputs match the chain input keys."""
    if input_mapper:
        first_inputs = input_mapper(first_example.inputs)
        missing_keys = set(chain.input_keys).difference(first_inputs)
        if not isinstance(first_inputs, dict):
            raise InputFormatError(
                "When using an input_mapper to prepare dataset example"
                " inputs for a chain, the mapped value must be a dictionary."
                f"\nGot: {first_inputs} of type {type(first_inputs)}."
            )
        if missing_keys:
            raise InputFormatError(
                "Missing keys after loading example using input_mapper."
                f"\nExpected: {chain.input_keys}. Got: {first_inputs.keys()}"
            )
    else:
        first_inputs = first_example.inputs
        missing_keys = set(chain.input_keys).difference(first_inputs)
        if len(first_inputs) == 1 and len(chain.input_keys) == 1:
            # A single-value input can be passed straight through on run,
            # so refrain from validating further.
            pass
        elif missing_keys:
            raise InputFormatError(
                "Example inputs missing expected chain input keys."
                " Please provide an input_mapper to convert the example.inputs"
                " to a compatible format for the chain you wish to evaluate."
                f"Expected: {chain.input_keys}. "
                f"Got: {first_inputs.keys()}"
            )


def _validate_example_inputs(
    example: Example,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]],
) -> None:
    """Validate that the example inputs are valid for the model."""
    if isinstance(llm_or_chain_factory, BaseLanguageModel):
        _validate_example_inputs_for_language_model(example, input_mapper)
    else:
        chain = llm_or_chain_factory()
        if isinstance(chain, Chain):
            # Otherwise it's a runnable.
            _validate_example_inputs_for_chain(example, chain, input_mapper)
        elif isinstance(chain, Runnable):
            logger.debug(f"Skipping input validation for {chain}")
r   examplesList[Example]r3   "Optional[smith_eval.RunEvalConfig]	data_typer'   Optional[List[RunEvaluator]]c           	      C  s   |r>t | trd\}}d}nd}|  }t |tr|jnd}t |tr%|jnd}t||||d jr7t|d jnd||}|S d}|S )z<Configure the evaluators to run on the results of the chain.)NNllmr   Nr   )rd   r   r.   r   Zoutput_keys_load_run_evaluatorsoutputsr   )	r   r   r3   r   
run_inputsrun_outputsrun_typer   run_evaluatorsr;   r;   r<   _setup_evaluation  s&   
def _determine_input_key(
    config: smith_eval.RunEvalConfig,
    run_inputs: Optional[List[str]],
) -> Optional[str]:
    input_key = None
    if config.input_key:
        input_key = config.input_key
        if run_inputs and input_key not in run_inputs:
            logger.warning(
                f"Input key {input_key} not in chain's specified"
                f" input keys {run_inputs}. Evaluation behavior may be undefined."
            )
    elif run_inputs and len(run_inputs) == 1:
        input_key = run_inputs[0]
    elif run_inputs is not None and len(run_inputs) > 1:
        logger.warning(
            f"Chain expects multiple input keys: {run_inputs},"
            " Evaluator is likely to fail. Evaluation behavior may be undefined."
            " Specify an input_key in the RunEvalConfig to avoid this warning."
        )
    return input_key


def _determine_prediction_key(
    config: smith_eval.RunEvalConfig,
    run_outputs: Optional[List[str]],
) -> Optional[str]:
    prediction_key = None
    if config.prediction_key:
        prediction_key = config.prediction_key
        if run_outputs and prediction_key not in run_outputs:
            logger.warning(
                f"Prediction key {prediction_key} not in chain's specified"
                f" output keys {run_outputs}. Evaluation behavior may be undefined."
            )
    elif run_outputs and len(run_outputs) == 1:
        prediction_key = run_outputs[0]
    elif run_outputs is not None and len(run_outputs) > 1:
        logger.warning(
            f"Chain expects multiple output keys: {run_outputs},"
            " Evaluation behavior may be undefined."
            " Specify a prediction_key in the RunEvalConfig to avoid this warning."
        )
    return prediction_key


def _determine_reference_key(
    config: smith_eval.RunEvalConfig,
    example_outputs: Optional[List[str]],
) -> Optional[str]:
    if config.reference_key:
        reference_key = config.reference_key
        if example_outputs and reference_key not in example_outputs:
            raise ValueError(
                f"Reference key {reference_key} not in Dataset"
                f" example outputs: {example_outputs}"
            )
    elif example_outputs and len(example_outputs) == 1:
        reference_key = list(example_outputs)[0]
    else:
        reference_key = None
    return reference_key


def _construct_run_evaluator(
    eval_config: Union[
        smith_eval_config.SINGLE_EVAL_CONFIG_TYPE,
        smith_eval_config.CUSTOM_EVALUATOR_TYPE,
    ],
    eval_llm: Optional[BaseLanguageModel],
    run_type: str,
    data_type: DataType,
    example_outputs: Optional[List[str]],
    reference_key: Optional[str],
    input_key: Optional[str],
    prediction_key: Optional[str],
) -> RunEvaluator:
    if isinstance(eval_config, RunEvaluator):
        return eval_config
    if isinstance(eval_config, (EvaluatorType, str)):
        if not isinstance(eval_config, EvaluatorType):
            eval_config = EvaluatorType(eval_config)
        evaluator_ = load_evaluator(eval_config, llm=eval_llm)
        eval_type_tag = eval_config.value
    elif isinstance(eval_config, smith_eval_config.EvalConfig):
        kwargs = {"llm": eval_llm, **eval_config.get_kwargs()}
        evaluator_ = load_evaluator(eval_config.evaluator_type, **kwargs)
        eval_type_tag = eval_config.evaluator_type.value
        # Override keys if specified in the config.
        if isinstance(eval_config, smith_eval_config.SingleKeyEvalConfig):
            input_key = eval_config.input_key or input_key
            prediction_key = eval_config.prediction_key or prediction_key
            reference_key = eval_config.reference_key or reference_key
    elif callable(eval_config):
        # Assume we can decorate it as a run evaluator.
        return run_evaluator_dec(eval_config)
    else:
        raise ValueError(f"Unknown evaluator type: {type(eval_config)}")

    if isinstance(evaluator_, StringEvaluator):
        if evaluator_.requires_reference and reference_key is None:
            raise ValueError(
                "Must specify reference_key in smith_eval.RunEvalConfig to use"
                f" evaluator of type {eval_type_tag} with"
                f" dataset with multiple output keys: {example_outputs}."
            )
        run_evaluator = smith_eval.StringRunEvaluatorChain.from_run_and_data_type(
            evaluator_,
            run_type,
            data_type,
            input_key=input_key,
            prediction_key=prediction_key,
            reference_key=reference_key,
            tags=[eval_type_tag],
        )
    elif isinstance(evaluator_, PairwiseStringEvaluator):
        raise NotImplementedError(
            f"Run evaluator for {eval_type_tag} is not implemented."
            " PairwiseStringEvaluators compare the outputs of two different models"
            " rather than the output of a single model."
            " Did you mean to use a StringEvaluator instead?"
            "\nSee: https://python.langchain.com/docs/guides/evaluation/string/"
        )
    else:
        raise NotImplementedError(
            f"Run evaluator for {eval_type_tag} is not implemented"
        )
    return run_evaluator


def _get_keys(
    config: smith_eval.RunEvalConfig,
    run_inputs: Optional[List[str]],
    run_outputs: Optional[List[str]],
    example_outputs: Optional[List[str]],
) -> Tuple[Optional[str], Optional[str], Optional[str]]:
    input_key = _determine_input_key(config, run_inputs)
    prediction_key = _determine_prediction_key(config, run_outputs)
    reference_key = _determine_reference_key(config, example_outputs)
    return input_key, prediction_key, reference_key
def _load_run_evaluators(
    config: smith_eval.RunEvalConfig,
    run_type: str,
    data_type: DataType,
    example_outputs: Optional[List[str]],
    run_inputs: Optional[List[str]],
    run_outputs: Optional[List[str]],
) -> List[RunEvaluator]:
    """
    Load run evaluators from a configuration.

    Args:
        config: Configuration for the run evaluators.

    Returns:
        A list of run evaluators.
    """
    run_evaluators = []
    input_key, prediction_key, reference_key = None, None, None
    if config.evaluators or (
        config.custom_evaluators
        and any([isinstance(e, StringEvaluator) for e in config.custom_evaluators])
    ):
        input_key, prediction_key, reference_key = _get_keys(
            config, run_inputs, run_outputs, example_outputs
        )
    for eval_config in config.evaluators:
        run_evaluator = _construct_run_evaluator(
            eval_config,
            config.eval_llm,
            run_type,
            data_type,
            example_outputs,
            reference_key,
            input_key,
            prediction_key,
        )
        run_evaluators.append(run_evaluator)
    custom_evaluators = config.custom_evaluators or []
    for custom_evaluator in custom_evaluators:
        if isinstance(custom_evaluator, RunEvaluator):
            run_evaluators.append(custom_evaluator)
        elif isinstance(custom_evaluator, StringEvaluator):
            run_evaluators.append(
                smith_eval.StringRunEvaluatorChain.from_run_and_data_type(
                    custom_evaluator,
                    run_type,
                    data_type,
                    input_key=input_key,
                    prediction_key=prediction_key,
                    reference_key=reference_key,
                )
            )
        elif callable(custom_evaluator):
            run_evaluators.append(run_evaluator_dec(custom_evaluator))
        else:
            raise ValueError(
                f"Unsupported custom evaluator: {custom_evaluator}."
                " Expected RunEvaluator or StringEvaluator."
            )
    return run_evaluators
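# Sketch of the configuration shapes this loader accepts (evaluator names are
# illustrative): built-in evaluators by name or EvalConfig, plus custom
# StringEvaluator/RunEvaluator instances or plain callables:
#
#     config = smith_eval.RunEvalConfig(
#         evaluators=["qa", smith_eval.RunEvalConfig.Criteria("helpfulness")],
#         custom_evaluators=[MyStringEvaluator()],
#     )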
async def _arun_llm(
    llm: BaseLanguageModel,
    inputs: Dict[str, Any],
    *,
    tags: Optional[List[str]] = None,
    callbacks: Callbacks = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[str, BaseMessage]:
    """Asynchronously run the language model.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map inputs to the expected format.

    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    """
    if input_mapper is not None:
        prompt_or_messages = input_mapper(inputs)
        if isinstance(prompt_or_messages, str) or (
            isinstance(prompt_or_messages, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
        ):
            return await llm.ainvoke(
                prompt_or_messages,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        else:
            raise InputFormatError(
                "Input mapper returned invalid format"
                f" {prompt_or_messages}"
                "\nExpected a single string or list of chat messages."
            )
    else:
        try:
            prompt = _get_prompt(inputs)
            llm_output: Union[str, BaseMessage] = await llm.ainvoke(
                prompt,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        except InputFormatError:
            llm_inputs = _get_messages(inputs)
            llm_output = await llm.ainvoke(
                **llm_inputs,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
    return llm_output
async def _arun_chain(
    chain: Union[Chain, Runnable],
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[dict, str]:
    """Run a chain asynchronously on inputs."""
    inputs_ = inputs if input_mapper is None else input_mapper(inputs)
    if (
        isinstance(chain, Chain)
        and isinstance(inputs_, dict)
        and len(inputs_) == 1
        and chain.input_keys
    ):
        val = next(iter(inputs_.values()))
        output = await chain.ainvoke(
            val,
            config=RunnableConfig(
                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
            ),
        )
    else:
        runnable_config = RunnableConfig(
            tags=tags or [], callbacks=callbacks, metadata=metadata or {}
        )
        output = await chain.ainvoke(inputs_, config=runnable_config)
    return output


async def _arun_llm_or_chain(
    example: Example,
    config: RunnableConfig,
    *,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[dict, str, LLMResult, ChatResult]:
    """Asynchronously run the Chain or language model.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.
        input_mapper: Optional function to map the input to the expected format.

    Returns:
        A list of outputs.
    """
    chain_or_llm = (
        "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
    )
    result = None
    try:
        if isinstance(llm_or_chain_factory, BaseLanguageModel):
            output: Any = await _arun_llm(
                llm_or_chain_factory,
                example.inputs,
                tags=config["tags"],
                callbacks=config["callbacks"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        else:
            chain = llm_or_chain_factory()
            output = await _arun_chain(
                chain,
                example.inputs,
                tags=config["tags"],
                callbacks=config["callbacks"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        result = output
    except Exception as e:
        logger.warning(
            f"{chain_or_llm} failed for example {example.id} "
            f"with inputs {example.inputs}"
            f"\n{repr(e)}"
        )
        result = EvalError(Error=e)
    return result
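# Hedged sketch of driving the async runner directly (arun_on_dataset normally
# builds the config and factory for you; the values here are illustrative):
#
#     result = await _arun_llm_or_chain(
#         example,
#         RunnableConfig(tags=[], callbacks=None, metadata={}),
#         llm_or_chain_factory=_wrap_in_chain_factory(chain_constructor),
#     )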
def _run_llm(
    llm: BaseLanguageModel,
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[str, BaseMessage]:
    """
    Run the language model on the example.

    Args:
        llm: The language model to run.
        inputs: The input dictionary.
        callbacks: The callbacks to use during the run.
        tags: Optional tags to add to the run.
        input_mapper: function to map to the inputs dictionary from an Example
    Returns:
        The LLMResult or ChatResult.
    Raises:
        ValueError: If the LLM type is unsupported.
        InputFormatError: If the input format is invalid.
    """
    if input_mapper is not None:
        prompt_or_messages = input_mapper(inputs)
        if isinstance(prompt_or_messages, str) or (
            isinstance(prompt_or_messages, list)
            and all(isinstance(msg, BaseMessage) for msg in prompt_or_messages)
        ):
            llm_output: Union[str, BaseMessage] = llm.invoke(
                prompt_or_messages,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        else:
            raise InputFormatError(
                "Input mapper returned invalid format: "
                f" {prompt_or_messages}"
                "\nExpected a single string or list of chat messages."
            )
    else:
        try:
            llm_prompts = _get_prompt(inputs)
            llm_output = llm.invoke(
                llm_prompts,
                config=RunnableConfig(
                    callbacks=callbacks, tags=tags or [], metadata=metadata or {}
                ),
            )
        except InputFormatError:
            llm_inputs = _get_messages(inputs)
            llm_output = llm.invoke(
                **llm_inputs,
                config=RunnableConfig(callbacks=callbacks, metadata=metadata or {}),
            )
    return llm_output


def _run_chain(
    chain: Union[Chain, Runnable],
    inputs: Dict[str, Any],
    callbacks: Callbacks,
    *,
    tags: Optional[List[str]] = None,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
) -> Union[Dict, str]:
    """Run a chain on inputs."""
    inputs_ = inputs if input_mapper is None else input_mapper(inputs)
    if (
        isinstance(chain, Chain)
        and isinstance(inputs_, dict)
        and len(inputs_) == 1
        and chain.input_keys
    ):
        val = next(iter(inputs_.values()))
        output = chain.invoke(
            val,
            config=RunnableConfig(
                callbacks=callbacks, tags=tags or [], metadata=metadata or {}
            ),
        )
    else:
        runnable_config = RunnableConfig(
            tags=tags or [], callbacks=callbacks, metadata=metadata or {}
        )
        output = chain.invoke(inputs_, config=runnable_config)
    return output
def _run_llm_or_chain(
    example: Example,
    config: RunnableConfig,
    *,
    llm_or_chain_factory: MCF,
    input_mapper: Optional[Callable[[Dict], Any]] = None,
) -> Union[dict, str, LLMResult, ChatResult]:
    """
    Run the Chain or language model synchronously.

    Args:
        example: The example to run.
        llm_or_chain_factory: The Chain or language model constructor to run.
        tags: Optional tags to add to the run.
        callbacks: Optional callbacks to use during the run.

    Returns:
        Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
          The outputs of the model or chain.
    """
    chain_or_llm = (
        "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain"
    )
    result = None
    try:
        if isinstance(llm_or_chain_factory, BaseLanguageModel):
            output: Any = _run_llm(
                llm_or_chain_factory,
                example.inputs,
                config["callbacks"],
                tags=config["tags"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        else:
            chain = llm_or_chain_factory()
            output = _run_chain(
                chain,
                example.inputs,
                config["callbacks"],
                tags=config["tags"],
                input_mapper=input_mapper,
                metadata=config.get("metadata"),
            )
        result = output
    except Exception as e:
        error_type = type(e).__name__
        logger.warning(
            f"{chain_or_llm} failed for example {example.id} "
            f"with inputs {example.inputs}"
            f"\nError Type: {error_type}, Message: {e}"
        )
        result = EvalError(Error=e)
    return result
def _prepare_eval_run(
    client: Client,
    dataset_name: str,
    llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
    project_name: str,
    project_metadata: Optional[Dict[str, Any]] = None,
    tags: Optional[List[str]] = None,
    dataset_version: Optional[Union[str, datetime]] = None,
) -> Tuple[MCF, TracerSession, Dataset, List[Example]]:
    wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name)
    dataset = client.read_dataset(dataset_name=dataset_name)

    examples = list(client.list_examples(dataset_id=dataset.id, as_of=dataset_version))
    if not examples:
        raise ValueError(f"Dataset {dataset_name} has no example rows.")
    modified_at = [ex.modified_at for ex in examples if ex.modified_at]
    max_modified_at = max(modified_at) if modified_at else None
    inferred_version = max_modified_at.isoformat() if max_modified_at else None

    try:
        project_metadata = project_metadata or {}
        git_info = get_git_info()
        if git_info:
            project_metadata = {**project_metadata, "git": git_info}
        project_metadata["dataset_version"] = inferred_version
        project = client.create_project(
            project_name,
            reference_dataset_id=dataset.id,
            project_extra={"tags": tags} if tags else {},
            metadata=project_metadata,
        )
    except (HTTPError, ValueError, LangSmithError) as e:
        if "already exists " not in str(e):
            raise e
        uid = uuid.uuid4()
        example_msg = f"""
run_on_dataset(
    ...
    project_name="{project_name} - {uid}", # Update since {project_name} already exists
)
"""
        raise ValueError(
            f"Test project {project_name} already exists. Please use a different name:"
            f"\n\n{example_msg}"
        )
    comparison_url = dataset.url + f"/compare?selectedSessions={project.id}"
    print(  # noqa: T201
        f"View the evaluation results for project '{project_name}' at:\n"
        f"{comparison_url}\n\n"
        f"View all tests for Dataset {dataset_name} at:\n{dataset.url}",
        flush=True,
    )
    return wrapped_model, project, dataset, examples
class _RowResult(TypedDict, total=False):
    """A dictionary of the results for a single example row."""

    feedback: Optional[List[EvaluationResult]]
    execution_time: Optional[float]
    run_id: Optional[str]
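# Shape of one merged result row assembled by _DatasetRunContainer below
# (values illustrative):
#
#     {"input": {...}, "output": {...}, "reference": {...},
#      "feedback": [EvaluationResult(...)], "execution_time": 1.23, "run_id": "..."}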
@dataclasses.dataclass
class _DatasetRunContainer:
    """A container to help manage the state of a eval run."""

    client: Client
    project: TracerSession
    wrapped_model: MCF
    examples: List[Example]
    configs: List[RunnableConfig]
    batch_evaluators: Optional[List[smith_eval_config.BATCH_EVALUATOR_LIKE]] = None

    def _merge_test_outputs(
        self,
        batch_results: list,
        all_eval_results: Dict[str, _RowResult],
    ) -> dict:
        results: dict = {}
        for example, output in zip(self.examples, batch_results):
            row_result = cast(_RowResult, all_eval_results.get(str(example.id), {}))
            results[str(example.id)] = {
                "input": example.inputs,
                "feedback": row_result.get("feedback", []),
                "execution_time": row_result.get("execution_time"),
                "run_id": row_result.get("run_id"),
            }
            if isinstance(output, EvalError):
                results[str(example.id)]["Error"] = output.Error
            else:
                results[str(example.id)]["output"] = output
            if example.outputs:
                results[str(example.id)]["reference"] = example.outputs
        return results

    def _run_batch_evaluators(self, runs: Dict[str, Run]) -> List[dict]:
        evaluators = self.batch_evaluators
        if not evaluators:
            return []
        runs_list = [runs[str(example.id)] for example in self.examples]
        aggregate_feedback = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            for evaluator in evaluators:
                try:
                    result = evaluator(runs_list, self.examples)
                    if isinstance(result, EvaluationResult):
                        result = result.dict()
                    aggregate_feedback.append(cast(dict, result))
                    executor.submit(
                        self.client.create_feedback,
                        **result,
                        run_id=None,
                        project_id=self.project.id,
                    )
                except Exception as e:
                    logger.error(
                        f"Error running batch evaluator {repr(evaluator)}: {e}"
                    )
        return aggregate_feedback

    def _collect_metrics(self) -> Tuple[Dict[str, _RowResult], Dict[str, Run]]:
        all_eval_results: dict = {}
        all_runs: dict = {}
        for c in self.configs:
            for callback in cast(list, c["callbacks"]):
                if isinstance(callback, EvaluatorCallbackHandler):
                    eval_results = callback.logged_eval_results
                    for (_, example_id), v in eval_results.items():
                        all_eval_results.setdefault(str(example_id), {}).update(
                            {"feedback": v}
                        )
                elif isinstance(callback, LangChainTracer):
                    run = callback.latest_run
                    execution_time = (
                        (run.end_time - run.start_time).total_seconds()
                        if run and run.end_time
                        else None
                    )
                    run_id = str(run.id) if run else None
                    all_eval_results.setdefault(str(callback.example_id), {}).update(
                        {
                            "execution_time": execution_time,
                            "run_id": run_id,
                            "run": run,
                        }
                    )
                    all_runs[str(callback.example_id)] = run
        return cast(Dict[str, _RowResult], all_eval_results), all_runs

    def _collect_test_results(
        self,
        batch_results: List[Union[dict, str, LLMResult, ChatResult]],
    ) -> TestResult:
        logger.info("Waiting for evaluators to complete.")
        wait_for_all_evaluators()
        all_eval_results, all_runs = self._collect_metrics()
        aggregate_feedback = None
        if self.batch_evaluators:
            logger.info("Running session evaluators.")
            aggregate_feedback = self._run_batch_evaluators(all_runs)
        results = self._merge_test_outputs(batch_results, all_eval_results)
        return TestResult(
            project_name=self.project.name,
            results=results,
            aggregate_metrics=aggregate_feedback,
        )

    def finish(self, batch_results: list, verbose: bool = False) -> TestResult:
        results = self._collect_test_results(batch_results)
        if verbose:
            try:
                agg_feedback = results.get_aggregate_feedback()
                _display_aggregate_results(agg_feedback)
            except Exception as e:
                logger.debug(f"Failed to print aggregate feedback: {repr(e)}")
        try:
            # Closing the project permits name changing and metric optimizations.
            self.client.update_project(
                self.project.id, end_time=datetime.now(timezone.utc)
            )
        except Exception as e:
            logger.debug(f"Failed to close project: {repr(e)}")
        return results

    @classmethod
    def prepare(
        cls,
        client: Client,
        dataset_name: str,
        llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY,
        project_name: Optional[str],
        evaluation: Optional[smith_eval.RunEvalConfig] = None,
        tags: Optional[List[str]] = None,
        input_mapper: Optional[Callable[[Dict], Any]] = None,
        concurrency_level: int = 5,
        project_metadata: Optional[Dict[str, Any]] = None,
        revision_id: Optional[str] = None,
        dataset_version: Optional[Union[datetime, str]] = None,
    ) -> _DatasetRunContainer:
        project_name = project_name or name_generation.random_name()
        if revision_id:
            if not project_metadata:
                project_metadata = {}
            project_metadata.update({"revision_id": revision_id})
        wrapped_model, project, dataset, examples = _prepare_eval_run(
            client,
            dataset_name,
            llm_or_chain_factory,
            project_name,
            project_metadata=project_metadata,
            tags=tags,
            dataset_version=dataset_version,
        )
        tags = tags or []
        for k, v in (project.metadata.get("git") or {}).items():
            tags.append(f"git:{k}={v}")
        run_metadata = {"dataset_version": project.metadata["dataset_version"]}
        if revision_id:
            run_metadata["revision_id"] = revision_id
        wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory)
        run_evaluators = _setup_evaluation(
            wrapped_model, examples, evaluation, dataset.data_type or DataType.kv
        )
        _validate_example_inputs(examples[0], wrapped_model, input_mapper)
        progress_bar = progress.ProgressBarCallback(len(examples))
        configs = [
            RunnableConfig(
                callbacks=[
                    LangChainTracer(
                        project_name=project.name,
                        client=client,
                        example_id=example.id,
                    ),
                    EvaluatorCallbackHandler(
                        evaluators=run_evaluators or [],
                        client=client,
                        example_id=example.id,
                        max_concurrency=0,
                    ),
                    progress_bar,
                ],
                tags=tags,
                max_concurrency=concurrency_level,
                metadata=run_metadata,
            )
            for example in examples
        ]
        return cls(
            client=client,
            project=project,
            wrapped_model=wrapped_model,
            examples=examples,
            configs=configs,
            batch_evaluators=evaluation.batch_evaluators if evaluation else None,
        )
def _is_jupyter_environment() -> bool:
    try:
        from IPython import get_ipython

        res = get_ipython()
        return get_ipython() is not None and "zmqshell" in str(type(res))
    except ImportError:
        return False


def _display_aggregate_results(aggregate_results: pd.DataFrame) -> None:
    if _is_jupyter_environment():
        from IPython.display import HTML, display

        display(HTML("<h3>Experiment Results:</h3>"))
        display(aggregate_results)
    else:
        formatted_string = aggregate_results.to_string(
            float_format=lambda x: f"{x:.2f}", justify="right"
        )
        print("\n Experiment Results:")  # noqa: T201
        print(formatted_string)  # noqa: T201


_INPUT_MAPPER_DEP_WARNING = (
    "The input_mapper argument is deprecated and "
    "will be removed in a future release. Please add a "
    " RunnableLambda to your chain to map inputs to the expected format"
    " instead. Example:\n"
    "def construct_chain():\n"
    "    my_chain = ...\n"
    "    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}\n"
    "    return input_mapper | my_chain\n"
    "run_on_dataset(..., llm_or_chain_factory=construct_chain)\n"
    "(See https://api.python.langchain.com/en/latest/schema/"
    "langchain.schema.runnable.base.RunnableLambda.html)"
)
 dd }|rtdtdd |	d u rt d}	|
 dd }|r)tdddd |
r8tdd	|
  d
dd | p<t } tj| |||||||||	|d}t	j
|jd dgttjt|j|d|j|jR  I d H }|j||dS )Nr   0.0.305TmessagependingrV  r   0.1.9qThe tags argument is deprecated and will be removed in a future release. Please specify project_metadata instead.PThe following arguments are deprecated and will be removed in a future release: r   rm  Zremovalr  rV  r  r   rY  r   r   rM  )r   r   _INPUT_MAPPER_DEP_WARNINGr    rc   r   r   r,  r^  runnable_utilsZgather_with_concurrencyr-  map	functoolspartialr  r%  r   rR  )r  r   r   r3   r  rT  r  r  rM  rV  rq   r   r   	containerr/  r;   r;   r<   arun_on_dataset  sb   
r|  c                  s"  |
 dd rtdtdd |
 dd }|rtdddd |	d u r(t d}	|
r7tdd	|
  d
dd | p;t } tj| ||||||||	|d |dkr` fddt	 j
 jD }n*t jd }t|tjt jd j
 j}W d    n1 sw   Y   j||dS )Nr   rk  Trl  r   ro  rp  rV  rq  r   rr  rs  r   c                   s"   g | ]\}}t || jd qS )rt  )r  r%  )rF   r   r   r{  r   r;   r<   rH   k  s    z"run_on_dataset.<locals>.<listcomp>rt  ru  )r   r   rv  r    rc   r   r   r,  r^  r2  r   r-  r  Zget_executor_for_configr   rx  ry  rz  r  r%  rR  )r  r   r   r3   r  rT  r  r  rM  rV  rq   r   r/  r>  r;   r}  r<   run_on_dataset8  sh   

r~  a1  
Run the Chain or language model on a dataset and store traces
to the specified project name.

Args:
    dataset_name: Name of the dataset to run the chain on.
    llm_or_chain_factory: Language model or Chain constructor to run
        over the dataset. The Chain constructor is used to permit
        independent calls on each example without carrying over state.
    evaluation: Configuration for evaluators to run on the
        results of the chain
    concurrency_level: The number of async tasks to run concurrently.
    project_name: Name of the project to store the traces in.
        Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information the test variant.
        (prompt version, model version, etc.)
    client: LangSmith client to use to access the dataset and to
        log feedback and run traces.
    verbose: Whether to print progress.
    tags: Tags to add to each run in the project.
    revision_id: Optional revision identifier to assign this test run to
        track the performance of different versions of your system.
Returns:
    A dictionary containing the run's project name and the resulting model outputs.


For the (usually faster) async version of this function, see :func:`arun_on_dataset`.

Examples
--------

.. code-block:: python

    from langsmith import Client
    from langchain_openai import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            RunEvalConfig.Criteria("helpfulness"),
            RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = RunEvalConfig(
        custom_evaluators = [MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
zrun_on_dataset(zawait arun_on_dataset()r~   )r   r   r   rx   r>   r   )r   r   r>   rx   )r   r   r>   re   )r   r(   r   r   r>   rr   )r   r(   r   r.   r   r   r>   rr   )r   r(   r   r   r   r   r>   rr   )
r   r   r   r   r3   r   r   r'   r>   r   )r   r   r   r   r>   r   )r   r   r   r   r>   r   )r   r   r   r   r>   r   )r   r   r   r   r   rx   r   r'   r   r   r   r   r   r   r   r   r>   r"   )
r   r   r   r   r   r   r   r   r>   r   )r   r   r   rx   r   r'   r   r   r   r   r   r   r>   r   )r   r   r   r   r   r   r   r   r   r   r   r   r>   r   )r   r   r   r   r   r   r   r   r   r   r   r   r>   r   )
r   r(   r   r   r   r   r   r   r>   r  )r   r   r   r   r   r   r   r   r   r   r   r   r>   r   )r   r   r   r   r   r   r   r   r   r   r   r   r>   r  r   )r  r   r   rx   r   r   r  rx   r  r   r   r   r  r  r>   r  )r>   rN  )rc  r?   r>   rr   )r  rj  r   rx   r   r   r3   r   r  rW  rT  rU  r  r   r  r   rM  rN  rV  r   rq   r   r>   r   )r:   
__future__r   concurrent.futuresr9  dataclassesry  r   loggingr!  r   r   typingr   r   r   r   r	   r
   r   r   r   Zlangchain_core._apir   Z langchain_core.callbacks.managerr   Zlangchain_core.language_modelsr   Zlangchain_core.messagesr   r   Zlangchain_core.outputsr   r   Zlangchain_core.runnablesr   r   r   r   r  r   rw  Z!langchain_core.tracers.evaluationr   r   Z langchain_core.tracers.langchainr   Zlangsmith.clientr   Zlangsmith.envr   r    Zlangsmith.evaluationr!   r"   r#   r   Zlangsmith.run_helpersr$   r%   Zlangsmith.schemasr&   r'   r(   r)   r*   Zlangsmith.utilsr+   requestsr,   Ztyping_extensionsr-   Zlangchain.chains.baser.   Zlangchain.evaluation.loadingr/   Zlangchain.evaluation.schemar0   r1   r2   Zlangchain.smithr3   r   Zlangchain.smith.evaluationr   r4   r5   r`   rh   	getLoggerr7   r   re   r   r   r  r6   r=   ro   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r  r  r  r  r)  r*  	dataclassr,  rb  rO  rv  r|  r~  Z_RUN_ON_DATASET_DOCSTRINGreplacer;   r;   r;   r<   <module>   s    ,
I
=2


(
!
%





C
GE%>C%== 
C

FMk
