
    Ahy                       S r SSKJr  SSKrSSKrSSKrSSKrSSKrSSK	r	SSK
J
r
Jr  SSKJrJrJrJrJrJr  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJ r J!r!J"r"  SSKJ#r$  SSKJ%r&  SSK'J(r(J)r)  SSK*J+r+  SSK,J-r-  SSK.J/r/J0r0  SSK1J2r2J3r3  SSK1J4r5  SSK6J7r7J8r8  SSK9J:r:J;r;J<r<J=r=J>r>  SSK?J@r@  SSKAJBrB  SSKCJDrD  SSKEJFrF  SSKGJHrH  SSKIJJrJJKrKJLrL  SSKMJNrO  SSKPJ#rQ  SSKPJRrRJSrS  \(       a  SSKTrU\R                  " \W5      rX\\/ \\F\ 4   4   \\\Y/\4   \ \F4   rZ\\/ \\F\ 4   4   \4   r[ " S S\\5      r] " S  S!\Y5      r^ " S" S#\Y5      r_ SM     SNS$ jjr`SOS% jra " S& S'\D5      rbSPS( jrc      SQS) jrd        SRS* jre        SSS+ jrf          STS, jrg      SUS- jrh      SVS. jri      SWS/ jrj                  SXS0 jrk          SYS1 jrl              SZS2 jrmSSSSS3.             S[S4 jjrnSSSS5.             S\S6 jjroSS7.         S]S8 jjrpSSSS5.             S^S9 jjrqSSSS5.             S\S: jjrrSS7.         S]S; jjrs   S_               S`S< jjrt " S= S>\DS?S@9ru\R                   " SA SB5      5       rwSaSC jrxSbSD jrySErzSSSFSSS?SSG.                       ScSH jjr{SSSFSSS?SSG.                       ScSI jjr|SJr}\}\|l         \}R                  SKSL5      \{l         g)dz>Utilities for running language models or Chains over datasets.    )annotationsN)datetimetimezone)TYPE_CHECKINGAnyCallableOptionalUnioncast)warn_deprecated)	Callbacks)BaseLanguageModel)BaseMessagemessages_from_dict)
ChatResult	LLMResult)RunnableRunnableConfigRunnableLambdaconfig)utils)EvaluatorCallbackHandlerwait_for_all_evaluators)LangChainTracer)Client)get_git_infoget_langchain_env_var_metadata)EvaluationResultRunEvaluator)run_evaluator)as_runnableis_traceable_function)DatasetDataTypeExampleRunTracerSession)LangSmithError)	HTTPError)	TypedDict)Chain)load_evaluator)EvaluatorTypePairwiseStringEvaluatorStringEvaluator)
evaluation)name_generationprogressc                      \ rS rSrSrSrg)InputFormatErrorK   z(Raised when the input format is invalid. N)__name__
__module____qualname____firstlineno____doc____static_attributes__r7       _/var/www/html/shao/venv/lib/python3.13/site-packages/langchain/smith/evaluation/runner_utils.pyr5   r5   K   s    2r>   r5   c                  0    \ rS rSrSr  SS jrSS jrSrg)
TestResultR   z1A dictionary of the results of a single test run.c                    U R                  5       nUR                   Vs/ sH#  nUR                  S5      (       d  US;   d  M!  UPM%     nnUR                  SS9R	                  USS9$ s  snf )zReturn quantiles for the feedback scores.

This method calculates and prints the quantiles for the feedback scores
across all feedback keys.

Returns:
    A DataFrame containing the quantiles for each feedback key.
)inputs.outputs.	reference>   inputoutputall)include   )axis)to_dataframecolumns
startswithdescribedrop)selfdfcolto_drops       r?   get_aggregate_feedback!TestResult.get_aggregate_feedbackU   sy       zz
!~~BCC)) ! 	 
 {{5{)..wQ.??
s   A'A'c           	         SSK n/ n/ nU S   R                  5        GHw  u  pgUS   nUR                  S5      n	[	        U	[
        5      (       a(  U	R                  5        V
Vs0 sH  u  pSU
 3U_M     nn
nO
U	c  0 nOSU	0n0 US   R                  5        V
Vs0 sH  u  pS	U
 3U_M     snn
EUEnS
U;   a[  [	        US
   [
        5      (       a;  UR                  US
   R                  5        V
Vs0 sH  u  pSU
 3U_M     snn
5        OUS
   US
'   UR                  0 U Vs0 sH  nSUR                   3UR                  _M     snEUR                  S5      US   UR                  S5      S.E5        UR                  U5        UR                  U5        GMz     UR                  " XTS9$ ! [         a  nSn[        U5      UeSnAff = fs  snn
f s  snn
f s  snn
f s  snf )z#Convert the results to a dataframe.r   NzfPandas is required to convert the results to a dataframe. to install pandas, run `pip install pandas`.resultsfeedbackrH   rE   rG   rD   rF   z
reference.z	feedback.Errorexecution_timerun_id)errorr\   r]   )index)pandasImportErroritemsget
isinstancedictupdatekeyscoreappend	DataFrame)rR   pdemsgindicesrecords
example_idresultrZ   output_kvrH   rfs                  r?   rM   TestResult.to_dataframej   s   	* "&y/"7"7"9Jj)Hjj*G'4((8?HHQC.!+H"G,06w0E0E0GH0GWQC=!#0GHA f$f[1488HH9?9L9R9R9TU9T:aS)1,9TU &,K%8AkNHH=EFX155'*AGG3XF#ZZ0&,-=&>$jj2	 NN1NN:&= #:@ ||G33S  	*@  c")	* I I V Gs/   F) !G	G
.G #G)
G3GGr7   N)returnpd.DataFrame)r8   r9   r:   r;   r<   rV   rM   r=   r7   r>   r?   rA   rA   R   s    ;@	@*-4r>   rA   c                  :   ^  \ rS rSrSrSU 4S jjrSS jrSrU =r$ )	EvalError   z"Your architecture raised an error.c                *   > [         TU ]  " SSU0UD6  g )Nr[   r7   )super__init__)rR   r[   kwargs	__class__s      r?   r   EvalError.__init__   s    /u//r>   c                T     X   $ ! [          a  nSU S3n[        U5      UeS nAff = f)Nz%'EvalError' object has no attribute '')KeyErrorAttributeError)rR   namerl   rm   s       r?   __getattr__EvalError.__getattr__   s:    	-: 	-9$qAC %1,	-s    
'"'r7   )r[   BaseExceptionr   r   rx   None)r   strrx   r   )	r8   r9   r:   r;   r<   r   r   r=   __classcell__)r   s   @r?   r{   r{      s    ,0- -r>   r{   c                  ^^	^
^^ [        U [        5      (       ab  U mTR                  R                  nU R                  b7  TR                  R                  R                  nSU SU SU S3n[        U5      eU4S j$ [        U [        5      (       a  U $ [        U [        5      (       a  U m
U
4S j$ [        U 5      (       a  [        U 5      (       a  [        [        [        U 5      5      mU4S j$  U " 5       n[        [        U 5      m	[        U[        5      (       a  U$ [        [        [        U5      5      (       a  [        [        [        U5      5      mU4S
 j$ [        U[        5      (       d  U	4S j$ T	$ U $ ! [         aP    [        [        U 5      n[        R                  " U5      n[         R#                  SU5        [%        U5      mU4S	 js $ f = f)zForgive the user if they pass in a chain without memory instead of a chain
factory. It's a common mistake. Raise a more helpful error message as well.a$  Cannot directly evaluate a chain with stateful memory. To evaluate this chain, pass in a chain constructor that initializes fresh memory each time it is called.  This will safegaurd against information leakage between dataset examples.
For example:

def chain_constructor():
    new_memory = z(...)
    return z*(memory=new_memory, ...)

run_on_dataset("z", chain_constructor, ...)c                    > T $ Nr7   )chains   r?   <lambda>(_wrap_in_chain_factory.<locals>.<lambda>   s    ur>   c                    > T $ r   r7   )lcfs   r?   r   r      s    sr>   c                    > T $ r   r7   	runnable_s   r?   r   r          9r>   z'Wrapping function %s as RunnableLambda.c                    > T $ r   r7   )wrappeds   r?   r   r      s    7r>   c                    > T $ r   r7   r   s   r?   r   r      r   r>   c                    > [        T 5      $ r   )r   )constructors   r?   r   r      s
    >+6r>   )rd   r,   r   r8   memory
ValueErrorr   r   callabler#   r"   r   r   	TypeErrorinspect	signatureloggerinfor   )llm_or_chain_factorydataset_namechain_classmemory_classrm   _model	user_funcsigr   r   r   r   r   s           @@@@@r?   _wrap_in_chain_factoryr      s    &..$oo..&&2 <<11::L$ %1> 2)]##/.0J
L  S/!&(9::##&11"$%% !566#D3G$HII$$	#)+F 8%9:f/00 M h!788#D6$:;I$$&(++66)  	#X';<I##I.CKKA3G$Y/G""	#s   1E; ;AGGc                   U (       d  Sn[        U5      e/ nSU ;   aE  [        U S   [        5      (       d&  S[        U S   5      R                   3n[        U5      eU S   /nOSU ;   a^  [        U S   [
        5      (       a  [        S U S    5       5      (       d&  S[        U S   5      R                   3n[        U5      eU S   nO[        U 5      S:X  az  [        [        U R                  5       5      5      n[        U[        5      (       a  U/nOO[        U[
        5      (       a  [        S U 5       5      (       a  UnO S	U  3n[        U5      eS
U  3n[        U5      e[        U5      S:X  a  US   $ S[        U5       S3n[        U5      e)zGet prompt from inputs.

Args:
    inputs: The input dictionary.

Returns:
    A string prompt.
Raises:
    InputFormatError: If the input format is invalid.
Inputs should not be empty.promptz"Expected string for 'prompt', got promptsc              3  @   #    U H  n[        U[        5      v   M     g 7fr   rd   r   .0is     r?   	<genexpr>_get_prompt.<locals>.<genexpr>   s      >
(91Jq#(9   z,Expected list of strings for 'prompts', got rK   c              3  @   #    U H  n[        U[        5      v   M     g 7fr   r   r   s     r?   r   r     s     .S7az!S/A/A7r   z)LLM Run expects string prompt input. Got z5LLM Run expects 'prompt' or 'prompts' in inputs. Got r   z)LLM Run expects single prompt input. Got z	 prompts.)r5   rd   r   typer8   listrI   lennextitervalues)inputsrm   r   prompt_s       r?   _get_promptr      s    +s##G6&*C006tF8<L7M7V7V6WXC"3''(#$	f	&+T22# >
(.y(9>
 ;
 ;
VI./889;  #3''#	V	tFMMO,-gs##iG&&3.S7.S+S+SG=fXFC"3''EfXNs##
7|qqz5c'l^9
MC
3
r>   c                  $    \ rS rSr% SrS\S'   Srg)ChatModelInputi  zJInput for a chat model.

Parameters:
    messages: List of chat messages.
zlist[BaseMessage]messagesr7   Nr8   r9   r:   r;   r<   __annotations__r=   r7   r>   r?   r   r     s      r>   r   c                   U (       d  Sn[        U5      eU R                  5       nSU ;   a  UR                  S5      US'   O4[        U 5      S:X  a%  [	        [        U R                  5       5      5      US'   SU;   ac  US   n[        U[        5      (       a  [        S U 5       5      (       a  U/n[        U5      S:X  a  [        US   5      US'   U$ Sn[        U5      eSU  3n[        U5      e)	zGet Chat Messages from inputs.

Args:
    inputs: The input dictionary.

Returns:
    A list of chat messages.
Raises:
    InputFormatError: If the input format is invalid.
r   r   rG   rK   c              3  @   #    U H  n[        U[        5      v   M     g 7fr   )rd   re   r   s     r?   r    _get_messages.<locals>.<genexpr>5  s      2
)5AJq$r   r   zGBatch messages not supported. Please provide a single list of messages.zMChat Run expects single List[dict] or List[List[dict]] 'messages' input. Got )r5   copypopr   r   r   r   rd   r   rI   r   )r   rm   
input_copyraw_messagess       r?   _get_messagesr      s    +s##JV(nnZ8
7	V	"4#89
7*!'*lD))c 2
)52
 /
 /
 )>L|!"4\!_"EJw 	,  #3''h	   3
r>   c                    U(       a{  U" U R                   =(       d    0 5      n[        U[        5      (       dJ  [        U[        5      (       a  [	        S U 5       5      (       d  SU S[        U5       S3n[        U5      eg g  [        U R                   =(       d    0 5        g ! [         aP     [        U R                   =(       d    0 5         g ! [         a!  nSU R                    S3n[        U5      UeS nAff = ff = f)Nc              3  @   #    U H  n[        U[        5      v   M     g 7fr   rd   r   r   rm   s     r?   r   >_validate_example_inputs_for_language_model.<locals>.<genexpr>R  s     ILSJsK00Lr   zWhen using an input_mapper to prepare dataset example inputs for an LLM or chat model, the output must a single string or a list of chat messages.
Got: 	 of type .zvExample inputs do not match language model input format. Expected a dictionary with messages or a single prompt. Got: z Please update your dataset OR provide an input_mapper to convert the example.inputs to a compatible format for the llm or chat model you wish to evaluate.)	r   rd   r   r   rI   r   r5   r   r   )first_exampleinput_mapperprompt_inputrm   err2s        r?   +_validate_example_inputs_for_language_modelr   J  s    #M$8$8$>B?,,,|T**ILIII 'yl1C0DAG  #3'' J -	6,,23 	66m228b9# 	6*112GG  's+5	6	6s*   B# #
C=.C
C9C44C99C=c                p   U(       a  U" U R                   =(       d    0 5      n[        UR                  5      R                  U5      n[	        U[
        5      (       d  SU S[        U5       S3n[        U5      eU(       a+  SUR                   SUR                  5        3n[        U5      egU R                   n[        UR                  5      R                  U5      n[        U5      S:X  a  [        UR                  5      S:X  a  gU(       a+  SUR                   SUR                  5        3n[        U5      eg)	z<Validate that the example inputs match the chain input keys.zvWhen using an input_mapper to prepare dataset example inputs for a chain, the mapped value must be a dictionary.
Got: r   r   zAMissing keys after loading example using input_mapper.
Expected: z. Got: rK   zExample inputs missing expected chain input keys. Please provide an input_mapper to convert the example.inputs to a compatible format for the chain you wish to evaluate.Expected: N)
r   set
input_keys
differencerd   re   r   r5   keysr   )r   r   r   first_inputsmissing_keysrm   s         r?   "_validate_example_inputs_for_chainr   m  sG    #M$8$8$>B?5++,77E,--&yl1C0DAG 
 #3''$//08I8I8K7LN  #3''  %++5++,77E|!c%*:*:&;q&@  #--. /$))+,	.  #3'' r>   c                    [        U[        5      (       a  [        X5        gU" 5       n[        U[        5      (       a  [	        XU5        g[        U[
        5      (       a  [        R                  SU5        gg)z9Validate that the example inputs are valid for the model.z Skipping input validation for %sN)rd   r   r   r,   r   r   r   debug)exampler   r   r   s       r?   _validate_example_inputsr     s[     &(9::3GJ$&eU##.w|Lx((LL;UC )r>   c           	     j   U(       a  [        U [        5      (       a  Su  pESnOOSnU " 5       n[        U[        5      (       a  UR                  OSn[        U[        5      (       a  UR                  OSn[        UUUUS   R                  (       a  [        US   R                  5      OSUU5      nU$ SnU$ )z<Configure the evaluators to run on the results of the chain.)NNllmr   Nr   )rd   r   r,   r   output_keys_load_run_evaluatorsoutputsr   )	r   examplesr1   	data_type
run_inputsrun_outputsrun_typer   run_evaluatorss	            r?   _setup_evaluationr     s     *,=>>&0#JHH(*E-7u-E-E))4J/9%/G/G%++TK-)1!)<)<D!$$%$
  r>   c                   S nU R                   (       a1  U R                   nU(       a  X!;  a  [        R                  SUU5        U$ U(       a  [        U5      S:X  a  US   nU$ Ub%  [        U5      S:  a  [        R                  SU5        U$ )NzZInput key %s not in chain's specified input keys %s. Evaluation behavior may be undefined.rK   r   zChain expects multiple input keys: %s, Evaluator is likely to fail. Evaluation behavior may be undefined. Specify an input_key in the RunEvalConfig to avoid this warning.)	input_keyr   warningr   )r   r   r   s      r?   _determine_input_keyr     s     I$$	)5NN8	   
J1,qM	  
	C
Oa$7P 		
 r>   c                   S nU R                   (       a1  U R                   nU(       a  X!;  a  [        R                  SUU5        U$ U(       a  [        U5      S:X  a  US   nU$ Ub%  [        U5      S:  a  [        R                  SU5        U$ )Nz`Prediction key %s not in chain's specified output keys %s. Evaluation behavior may be undefined.rK   r   zChain expects multiple output keys: %s, Evaluation behavior may be undefined. Specify a prediction_key in the RunEvalConfig to avoid this warning.)prediction_keyr   r   r   )r   r   r   s      r?   _determine_prediction_keyr     s     N..><NN8	  
[)Q.$Q  
	 S%5%9; 		
 r>   c                    U R                   (       a-  U R                   nU(       a  X!;  a  SU SU 3n[        U5      eU$ U(       a%  [        U5      S:X  a  [        [	        U5      5      nU$ S nU$ )NzReference key z! not in Dataset example outputs: rK   )reference_keyr   r   r   r   )r   example_outputsr   rm   s       r?   _determine_reference_keyr     s     ,,}C  0%%4$57  S/!
 	 
S1Q6T/23  r>   c           
        [        U [        5      (       a  U $ [        U [        [        45      (       a6  [        U [        5      (       d  [        U 5      n [	        XS9nU R
                  n	O[        U [        R                  5      (       a  SU0U R                  5       En
[	        U R                  40 U
D6nU R                  R
                  n	[        U [        R                  5      (       a?  U R                  =(       d    UnU R                  =(       d    UnU R                  =(       d    UnO4[        U 5      (       a  [        U 5      $ S[!        U 5       3n[#        U5      e[        U[$        5      (       aN  UR&                  (       a  Uc  SU	 SU S3n[#        U5      e[(        R*                  R-                  UUUUUUU	/S9nU$ [        U[.        5      (       a  SU	 S	3n[1        U5      eSU	 S
3n[1        U5      e)N)r   r   zUnknown evaluator type: zPMust specify reference_key in smith_eval.RunEvalConfig to use evaluator of type z) with dataset with multiple output keys: r   )r   r   r   tagszRun evaluator for z is not implemented. PairwiseStringEvaluators compare the outputs of two different models rather than the output of a single model. Did you mean to use a StringEvaluator instead?
See: https://python.langchain.com/docs/guides/evaluation/string/z is not implemented)rd   r    r.   r   r-   valuesmith_eval_config
EvalConfig
get_kwargsevaluator_typeSingleKeyEvalConfigr   r   r   r   run_evaluator_decr   r   r0   requires_reference
smith_evalStringRunEvaluatorChainfrom_run_and_data_typer/   NotImplementedError)eval_configeval_llmr   r   r   r   r   r   
evaluator_eval_type_tagr   rm   r!   s                r?   _construct_run_evaluatorr    s    +|,,+s344+}55'4K#K>
#))	K!2!=!=	>	>>[%;%;%=>#K$>$>I&I
#2288k#4#H#HII#--:I(77I>N'55FM	+		 --(k):(;<o*o..((]-B&&3_ 577F6GqJ 
 S/!"::QQ)' R 
,  
J 7	8	8  0Q Q 	 "#&& #=/1DE!#&&r>   c                L    [        X5      n[        X5      n[        X5      nXEU4$ r   )r   r   r   )r   r   r   r   r   r   r   s          r?   	_get_keysr  T  s-     %V8I.vCN,VEMm33r>   c                   / nSu  pxn	U R                   (       d2  U R                  (       a2  [        S U R                   5       5      (       a  [        U UUU5      u  pxn	U R                    H0  n
[	        U
U R
                  UUUU	UU5      nUR                  U5        M2     U R                  =(       d    / nU H  n[        U[        5      (       a  UR                  U5        M+  [        U[        5      (       a3  UR                  [        R                  R                  UUUUUU	S95        Ms  [        U5      (       a  UR                  [        U5      5        M  SU S3n[        U5      e   U$ )z
Load run evaluators from a configuration.

Args:
    config: Configuration for the run evaluators.

Returns:
    A list of run evaluators.
NNNc              3  @   #    U H  n[        U[        5      v   M     g 7fr   )rd   r0   )r   rl   s     r?   r   '_load_run_evaluators.<locals>.<genexpr>u  s     Q8P1
1o..8Pr   )r   r   r   zUnsupported custom evaluator: z+. Expected RunEvaluator or StringEvaluator.)
evaluatorscustom_evaluatorsanyr  r  r  ri   rd   r    r0   r  r  r  r   r	  r   )r   r   r   r   r   r   r   r   r   r   r  r!   r  custom_evaluatorrm   s                  r?   r   r   `  st   " N/?,I}  Q8P8PQQQ3<	4
0	= ((0OO	
 	m, ) 006B-&55!!"23(/::!!22II$'#1"/ J 	 &''!!"34D"EF 11A0B C= >  S/!+ .. r>   r  	callbacksr   metadatac               `  #    Ub  U" U5      n[        U[        5      (       d,  [        U[        5      (       aJ  [        S U 5       5      (       a3  U R	                  U[        UU=(       d    / U=(       d    0 S9S9I Sh  vN $ SU S3n[        U5      e [        U5      nU R	                  U[        UU=(       d    / U=(       d    0 S9S9I Sh  vN n	U	$  NV N! [         aI    [        U5      n
U R                  " S0 U
DS[        UU=(       d    / U=(       d    0 S90D6I Sh  vN  n	 U	$ f = f7f)	a  Asynchronously run the language model.

Args:
    llm: The language model to run.
    inputs: The input dictionary.
    tags: Optional tags to add to the run.
    callbacks: Optional callbacks to use during the run.
    input_mapper: Optional function to map inputs to the expected format.

Returns:
    The LLMResult or ChatResult.
Raises:
    ValueError: If the LLM type is unsupported.
    InputFormatError: If the input format is invalid.
Nc              3  @   #    U H  n[        U[        5      v   M     g 7fr   r   r   s     r?   r   _arun_llm.<locals>.<genexpr>       O<NSJsK00<Nr   r  r  r   r   z%Input mapper returned invalid format 3
Expected a single string or list of chat messages.r   r7   )	rd   r   r   rI   ainvoker   r5   r   r   )r   r   r  r  r   r   prompt_or_messagesrm   r   
llm_output
llm_inputss              r?   	_arun_llmr+    sZ    0 )&1(#..)400O<NOOO"%'%^ %   "#BC 	
 s##
V$47KK!#ZR!R 5@ 5
 /

$ G"/
  	
"6*
;; 

!#ZR!R
 
 

 	
sU   A;D.=C>D.9C CC D.C A	D+!D$"D+'D.*D++D.r  r   r   c          	       #    Uc  UOU" U5      n[        U [        5      (       a  [        U[        5      (       aw  [        U5      S:X  ah  U R                  (       aW  [        [        UR                  5       5      5      nU R                  U[        UU=(       d    / U=(       d    0 S9S9I Sh  vN nU$ [        U=(       d    / UU=(       d    0 S9n	U R                  XiS9I Sh  vN nU$  N< N7f)z%Run a chain asynchronously on inputs.NrK   r%  r   r  r  r   )
rd   r,   re   r   r   r   r   r   r'  r   
r   r   r  r  r   r   inputs_valrH   runnable_configs
             r?   _arun_chainr3    s      %,f,v2FG5%  w%%LA4()*}}!#ZR!R % 
 
 M )^

 }}W}EEM
 Fs$   B(C+*C'+5C+ C)!C+)C+)r   c          
        #    [        U[        5      (       a  SOSnSn [        U[        5      (       a?  [        UU R                  =(       d    0 US   US   UUR	                  S5      S9I Sh  vN nOEU" 5       n[        UU R                  =(       d    0 US   US   UUR	                  S5      S9I Sh  vN nUnU$  NN N
! [         aA  n[        R                  SUU R                  U R                  U5        [        US	9n SnAU$ SnAff = f7f)
at  Asynchronously run the Chain or language model.

Args:
    example: The example to run.
    llm_or_chain_factory: The Chain or language model constructor to run.
    tags: Optional tags to add to the run.
    callbacks: Optional callbacks to use during the run.
    input_mapper: Optional function to map the input to the expected format.

Returns:
    A list of outputs.
LLMr,   Nr  r  r   r  z*%s failed for example %s with inputs %s
%sr[   )rd   r   r+  r   rc   r3  	Exceptionr   r   idr{   )	r   r   r   r   chain_or_llmrq   rH   r   rl   s	            r?   _arun_llm_or_chainr:    s    ( 02CDD'  F$*,=>> )$$"F^ -)J/! F )*E&$"F^ -)J/ F  M9  $9JJNN	
 #M$sT   DAC  -B<.AC  3B>4C  :D<C  >C   
D
6D DDDc                  Ub  U" U5      n[        U[        5      (       d,  [        U[        5      (       aD  [        S U 5       5      (       a-  U R	                  U[        UU=(       d    / U=(       d    0 S9S9nU$ SU S3n[        U5      e [        U5      n	U R	                  U	[        UU=(       d    / U=(       d    0 S9S9nU$ ! [         a5    [        U5      n
U R                  " S0 U
DS[        X%=(       d    0 S90D6n U$ f = f)	a  
Run the language model on the example.

Args:
    llm: The language model to run.
    inputs: The input dictionary.
    callbacks: The callbacks to use during the run.
    tags: Optional tags to add to the run.
    input_mapper: function to map to the inputs dictionary from an Example
Returns:
    The LLMResult or ChatResult.
Raises:
    ValueError: If the LLM type is unsupported.
    InputFormatError: If the input format is invalid.
c              3  @   #    U H  n[        U[        5      v   M     g 7fr   r   r   s     r?   r   _run_llm.<locals>.<genexpr>g  r$  r   r%  r   z'Input mapper returned invalid format:  r&  r   )r  r   r7   )	rd   r   r   rI   invoker   r5   r   r   )r   r   r  r  r   r   r(  r)  rm   llm_promptsr*  s              r?   _run_llmr@  J  s2   2 )&1(#..)400O<NOOO25**"%'%^ 3= 3J@ -&'FG 
 #3''	%f-K%'%^ $ J    	&v.J %	NPRSJ 	s   6C ;DDc          	        Uc  UOU" U5      n[        U [        5      (       a  [        U[        5      (       ao  [        U5      S:X  a`  U R                  (       aO  [        [        UR                  5       5      5      nU R                  U[        UU=(       d    / U=(       d    0 S9S9nU$ [        U=(       d    / UU=(       d    0 S9n	U R                  XiS9nU$ )zRun a chain on inputs.rK   r%  r   r.  )
rd   r,   re   r   r   r   r   r   r>  r   r/  s
             r?   
_run_chainrB    s     %,f,v2FG5%  w%%LA4()*!#ZR!R  
 M )^

 g>Mr>   c          
        [        U[        5      (       a  SOSnSn [        U[        5      (       a7  [        UU R                  =(       d    0 US   US   UUR	                  S5      S9nO=U" 5       n[        UU R                  =(       d    0 US   US   UUR	                  S5      S9nUnU$ ! [         aW  n[        U5      R                  n	[        R                  SUU R                  U R                  U	U5        [        US	9n SnAU$ SnAff = f)
a~  
Run the Chain or language model synchronously.

Args:
    example: The example to run.
    llm_or_chain_factory: The Chain or language model constructor to run.
    tags: Optional tags to add to the run.
    callbacks: Optional callbacks to use during the run.

Returns:
    Union[List[dict], List[str], List[LLMResult], List[ChatResult]]:
      The outputs of the model or chain.
r5  r,   Nr  r  r   r,  zC%s failed for example %s with inputs %s
Error Type: %s, Message: %sr6  )rd   r   r@  r   rc   rB  r7  r   r8   r   r   r8  r{   )
r   r   r   r   r9  rq   rH   r   rl   
error_types
             r?   _run_llm_or_chainrE    s   * 02CDD'  F$*,=>>"$$"{#F^)J/F )*E$"{#F^)J/F  M  
$!W%%
RJJNN	
 #M
$s   BB* *
D4ADDc           
     P   [        X!5      nU R                  US9n[        U R                  UR                  US95      n	U	(       d  SU S3n
[        U
5      eU	 Vs/ sH!  oR                  (       d  M  UR                  PM#     nnU(       a  [        U5      OS nU(       a  UR                  5       OS n U=(       d    0 n[        5       nU(       a  0 UESU0EnXS'   U R                  UUR                  U(       a  SU0O0 US9nUR                   SUR                   3-   n[#        SU SU SU SUR                    3SS9  UUX4$ s  snf ! [        [
        [        4 aJ  nS	[        U5      ;  a  e [        R                  " 5       nS
U SU SU S3nSU SU 3n
[        U
5      UeS nAff = f)N)r   )
dataset_idas_ofzDataset z has no example rows.gitdataset_versionr  )reference_dataset_idproject_extrar   zalready exists z+
run_on_dataset(
    ...
    project_name="z - z", # Update since z already exists
)
zTest project z/ already exists. Please use a different name:

z/compare?selectedSessions=z)View the evaluation results for project 'z' at:
z

View all tests for Dataset z at:
T)flush)r   read_datasetr   list_examplesr8  r   modified_atmax	isoformatr   create_projectr*   r)   r   uuiduuid4urlprint)clientr   r   project_nameproject_metadatar  rJ  wrapped_modeldatasetr   rm   exrP  max_modified_atinferred_versiongit_infoprojectrl   uidexample_msgcomparison_urls                        r?   _prepare_eval_runre    s    ++?NM!!|!<GF((GJJo(VWH&;<o,4GHb>2>>HKG +6c+&4O6E0024%+1r> " x 
 /?*+''!(,064.b%	 ( 
* [[%?

|#LLN	
3L> B  !&&2^6'++	H 	 '744Y H, z>2 %CF*jjl  .C5(:<. I L> *-! 	 o1$%s&   E1E1AE F%AF  F%c                  8    \ rS rSr% SrS\S'   S\S'   S\S'   S	rg
)
_RowResulti*  z5A dictionary of the results for a single example row.z Optional[list[EvaluationResult]]rZ   zOptional[float]r\   Optional[str]r]   r7   Nr   r7   r>   r?   rg  rg  *  s    ?..##r>   rg  F)totalc                     \ rS rSr% SrS\S'   S\S'   S\S'   S	\S
'   S\S'   SrS\S'         SS jrSS jrSS jr	    SS jr
 S     SS jjr\       S                       SS jj5       rSrg)_DatasetRunContaineri2  z3A container to help manage the state of a eval run.r   rX  r(   ra  MCFr[  list[Example]r   zlist[RunnableConfig]configsNz6Optional[list[smith_eval_config.BATCH_EVALUATOR_LIKE]]batch_evaluatorsc           	     v   0 n[        U R                  U5       GH  u  pE[        [        UR	                  [        UR                  5      0 5      5      nUR                  UR	                  S/ 5      UR	                  S5      UR	                  S5      S.U[        UR                  5      '   [        U[        5      (       a&  UR                  U[        UR                  5         S'   OXS[        UR                  5         S'   UR                  (       d  M  UR                  U[        UR                  5         S'   GM     U$ )NrZ   r\   r]   )rG   rZ   r\   r]   r[   rH   rF   )zipr   r   rg  rc   r   r8  r   rd   r{   r[   r   )rR   batch_resultsall_eval_resultsrY   r   rH   
row_results          r?   _merge_test_outputs(_DatasetRunContainer._merge_test_outputs=  s    
 "4==-@OGj*:*>*>s7::PR*STJ &NN:r:",..1A"B$..2	(GC

O$ &),,4:LLGJJ(15;GJJ(28?GJJ(5  A r>   c           	        U R                   nU(       d  / $ U R                   Vs/ sH  o1[        UR                  5         PM     nn/ n[        R
                  R                  5        nU H  n U" X@R                  5      n[        U[        5      (       a  UR                  5       nUR                  [        [        U5      5        UR                  " U R                  R                  40 UDS U R                  R                  S.D6  M     S S S 5        U$ s  snf ! [          a#    ["        R%                  S['        U5      5         M  f = f! , (       d  f       U$ = f)N)r]   
project_idz Error running batch evaluator %s)ro  r   r   r8  
concurrentfuturesThreadPoolExecutorrd   r   re   ri   r   submitrX  create_feedbackra  r7  r   	exceptionrepr)	rR   runsr  r   	runs_listaggregate_feedbackexecutor	evaluatorrq   s	            r?   _run_batch_evaluators*_DatasetRunContainer._run_batch_evaluatorsS  s   **
I:>--H-w#gjj/*-	H224'	&y--@F!&*:;;!'&--d4.@AOO33   $#'<<??	 ( 5" "!' I ! $$:DO 54" "!s6    D&E-BDE)E?EEE
Ec                "   0 n0 nU R                    GH\  n[        [        US   5       GH@  n[        U[        5      (       aV  UR
                  nUR                  5        H4  u  u  pgnUR                  [        U5      0 5      R                  SU05        M6     Mo  [        U[        5      (       d  M  UR                  n	U	(       a8  U	R                  (       a'  U	R                  U	R                  -
  R                  5       OS n
U	(       a  [        U	R                  5      OS nUR                  [        UR                   5      0 5      R                  U
UU	S.5        X[        UR                   5      '   GMC     GM_     [        ["        [        [$        4   U5      U4$ )Nr  rZ   )r\   r]   run)rn  r   r   rd   r   logged_eval_resultsrb   
setdefaultr   rf   r   
latest_runend_time
start_timetotal_secondsr8  rp   re   rg  )rR   rs  all_runsccallbackeval_results_rp   rt   r  r\   r]   s               r?   _collect_metrics%_DatasetRunContainer._collect_metricsl  sL   !#A q~6h(@AA#+#?#?L.:.@.@.B*(33C
ORHOO'O /C  /::"--C 3<< 6EEG! #
 -0S[TF$//H4G4G0H"MTT.<&,#& :=S!4!456- 7 0 Dj)+;<hFFr>   c                6   [         R                  S5        [        5         U R                  5       u  p#S nU R                  (       a&  [         R                  S5        U R                  U5      nU R                  X5      n[        U R                  R                  UUS9$ )Nz#Waiting for evaluators to complete.zRunning session evaluators.)rY  rY   aggregate_metrics)
r   r   r   r  ro  r  ru  rA   ra  r   )rR   rr  rs  r  r  rY   s         r?   _collect_test_results*_DatasetRunContainer._collect_test_results  s     	9:!%)%:%:%<"!  KK56!%!;!;H!E**=K**0
 	
r>   c                   U R                  U5      nU(       a   UR                  5       n[        U5         U R                  R                  U R                  R                  [        R                  " [        R                  5      S9  U$ ! [         a  n[        R                  SUSS9   S nANwS nAff = f! [         a   n[        R                  SUSS9   S nAU$ S nAff = f)Nz&Failed to print aggregate feedback: %sT)exc_info)r  zFailed to close project: %s)r  rV   _display_aggregate_resultsr7  r   r   rX  update_projectra  r8  r   nowr   utc)rR   rr  verboserY   agg_feedbackrl   s         r?   finish_DatasetRunContainer.finish  s    
 ,,];Y&==?*<8	JKK&&!hll3 '    YEqSWXY  	JLL6DLI	Js/   B AB4 
B1B,,B14
C>CCc                h   U=(       d    [         R                  " 5       nU
(       a  U	(       d  0 n	U	R                  SU
05        [        UUUUU	UUS9u  ppU=(       d    / nUR                  R                  S5      =(       d    0 R                  5        H  u  nnUR                  SU SU 35        M     SUR                  S   0nU
(       a  U
US'   [        U5      n[        UUUUR                  =(       d    [        R                  5      n[        US   X5        [        R                  " [!        U5      5      nU Vs/ sHL  n[#        [%        UR&                  UUR(                  S9[+        U=(       d    / UUR(                  SS	9U/UUUS
9PMN     nnU " UUUUUU(       a  UR,                  S9$ S S9$ s  snf )Nrevision_id)rZ  r  rJ  rI  zgit:=rJ  r   )rY  rX  rp   )r  rX  rp   max_concurrency)r  r  r  r   )rX  ra  r[  r   rn  ro  )r2   random_namerf   re  r   rc   rb   ri   r   r   r   r%   kvr   r3   ProgressBarCallbackr   r   r   r   r8  r   ro  )clsrX  r   r   rY  r1   r  r   concurrency_levelrZ  r  rJ  r[  ra  r\  r   rs   rt   run_metadatar   progress_barr   rn  s                          r?   prepare_DatasetRunContainer.prepare  s    $D'B'B'D##% ##]K$@A4E -+5
1 zr%%))%06B==?DAqKK$qc1#' @)7+;+;<M+NO*5L'./CD*,	
 	!!mJ33CMB* $)
( $' #%,\\%#*::
 -#1#7R%#*::()	 !  1%#& $) 	 
, '<FZ88
 	
 MQ
 	
-
s   ;AF/r7   )rr  r   rs  zdict[str, _RowResult]rx   re   )r  zdict[str, Run]rx   z
list[dict])rx   z,tuple[dict[str, _RowResult], dict[str, Run]])rr  z-list[Union[dict, str, LLMResult, ChatResult]]rx   rA   )F)rr  r   r  boolrx   rA   )NNN   NNN)rX  r   r   r   r   MODEL_OR_CHAIN_FACTORYrY  rh  r1   "Optional[smith_eval.RunEvalConfig]r  Optional[list[str]]r   Optional[Callable[[dict], Any]]r  intrZ  Optional[dict[str, Any]]r  rh  rJ  Optional[Union[datetime, str]]rx   rk  )r8   r9   r:   r;   r<   r   ro  ru  r  r  r  r  classmethodr  r=   r7   r>   r?   rk  rk  2  sF   =N!!OSLS 0 
	,"2G:
D
 

*   
	,  :>$(8<!"59%):>H
H
 H
 5	H

 $H
 7H
 "H
 6H
 H
 3H
 #H
 8H
 
H
 H
r>   rk  c                      SSK Jn   U " 5       nU " 5       S L=(       a    S[        [        U5      5      ;   $ ! [         a     gf = f)Nr   )get_ipythonzmqshellF)IPython.core.getipythonr  r   r   ra   )r  ress     r?   _is_jupyter_environmentr    sB    7m}D(IZ3tCy>-II s   36 
AAc                    [        5       (       a  SSKJnJn  U" U" S5      5        U" U 5        g U R	                  S SS9n[        S5        [        U5        g )Nr   )HTMLdisplayz<h3>Experiment Results:</h3>c                
    U S $ )Nz.2fr7   )xs    r?   r   ,_display_aggregate_results.<locals>.<lambda>  s
    aW:r>   right)float_formatjustifyz
 Experiment Results:)r  IPython.displayr  r  	to_stringrW  )aggregate_resultsr  r  formatted_strings       r?   r  r    sU      1345!",66- 7 
 	&'r>   a  The input_mapper argument is deprecated and will be removed in a future release. Please add a  RunnableLambda to your chain to map inputs to the expected format instead. Example:
def construct_chain():
    my_chain = ...
    input_mapper = {'other_key': 'MyOtherInput', 'my_input_key': x}
    return input_mapper | my_chain
run_on_dataset(..., llm_or_chain_factory=construct_chain)
(See https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableLambda.html)r  )r1   rJ  r  rY  rZ  r  r  c                 #    U
R                  SS 5      nU(       a  [        S[        SS9  U	c  [        5       R	                  S5      n	U
R                  SS 5      nU(       a  [        SSSS9  U
(       a  [        SS	U
R                  5        S
3SS9  U =(       d
    [        5       n [        R                  U UUUUUUUUU	US9n[        R                  " UR                  S   R	                  S5      /[        [        R                  " [        UR                   US9UR"                  UR                  5      Q76 I S h  vN nUR%                  XS9$  N7f)Nr   0.0.305Tmessagependingr  r  0.1.9qThe tags argument is deprecated and will be removed in a future release. Please specify project_metadata instead.PThe following arguments are deprecated and will be removed in a future release: r   r  removalrZ  r  rJ  r   r  r   r   r  )r   r   _INPUT_MAPPER_DEP_WARNINGr   rc   r   r   rk  r  runnable_utilsgather_with_concurrencyrn  map	functoolspartialr:  r[  r   r  )rX  r   r   r1   rJ  r  rY  rZ  r  r  r   r   r  	containerrr  s                  r?   arun_on_datasetr  '  sa     ::nd3L	+DdS46::=I::fd#DU		
 4{{}oQ  	
 vxF$,,)' - I )@@!  !23	"%.%<%<)
 

 M M;;s   D4E6E
7Ec               `   U
R                  SS 5      nU(       a  [        S[        SS9  U
R                  SS 5      nU(       a  [        SSSS9  U	c  [        5       R	                  S5      n	U
(       a  [        SS	U
R                  5        S
3SS9  U =(       d
    [        5       n [        R                  U UUUUUUUUU	US9nUS:X  aH  [        UR                  UR                  5       VVs/ sH  u  p[        UUUR                  US9PM     nnnO~[        R                  " UR                  S   5       n[!        UR#                  [$        R&                  " [        UR                  US9UR                  UR                  5      5      nS S S 5        UR)                  WUS9$ s  snnf ! , (       d  f       N$= f)Nr   r  Tr  r  r  r  r  r  r   r  r  r   r  r  )r   r   r  r   rc   r   r   rk  r  rq  r   rn  rE  r[  r2  get_executor_for_configr   r  r  r  r  )rX  r   r   r1   rJ  r  rY  rZ  r  r  r   r   r  r  r   r   rr  r  s                     r?   run_on_datasetr  h  s    ::nd3L	+DdS::fd#DU		
 46::=I4{{}oQ  	
 vxF$,,)' - I A $'y'9'99;L;L#M
 $N %.%<%<)	 $N 	 
 44Y5F5Fq5IJh %%)-6-D-D%1
 &&%%
M K M7;;/
 KJs   &!F.AF
F-a1  
Run the Chain or language model on a dataset and store traces
to the specified project name.

Args:
    dataset_name: Name of the dataset to run the chain on.
    llm_or_chain_factory: Language model or Chain constructor to run
        over the dataset. The Chain constructor is used to permit
        independent calls on each example without carrying over state.
    evaluation: Configuration for evaluators to run on the
        results of the chain
    concurrency_level: The number of async tasks to run concurrently.
    project_name: Name of the project to store the traces in.
        Defaults to {dataset_name}-{chain class name}-{datetime}.
    project_metadata: Optional metadata to add to the project.
        Useful for storing information the test variant.
        (prompt version, model version, etc.)
    client: LangSmith client to use to access the dataset and to
        log feedback and run traces.
    verbose: Whether to print progress.
    tags: Tags to add to each run in the project.
    revision_id: Optional revision identifier to assign this test run to
        track the performance of different versions of your system.
Returns:
    A dictionary containing the run's project name and the resulting model outputs.


For the (usually faster) async version of this function, see :func:`arun_on_dataset`.

Examples
--------

.. code-block:: python

    from langsmith import Client
    from langchain_openai import ChatOpenAI
    from langchain.chains import LLMChain
    from langchain.smith import smith_eval.RunEvalConfig, run_on_dataset

    # Chains may have memory. Passing in a constructor function lets the
    # evaluation framework avoid cross-contamination between runs.
    def construct_chain():
        llm = ChatOpenAI(temperature=0)
        chain = LLMChain.from_string(
            llm,
            "What's the answer to {your_input_key}"
        )
        return chain

    # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum)
    evaluation_config = smith_eval.RunEvalConfig(
        evaluators=[
            "qa",  # "Correctness" against a reference answer
            "embedding_distance",
            smith_eval.RunEvalConfig.Criteria("helpfulness"),
            smith_eval.RunEvalConfig.Criteria({
                "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?"
            }),
        ]
    )

    client = Client()
    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )

You can also create custom evaluators by subclassing the
:class:`StringEvaluator <langchain.evaluation.schema.StringEvaluator>`
or LangSmith's `RunEvaluator` classes.

.. code-block:: python

    from typing import Optional
    from langchain.evaluation import StringEvaluator

    class MyStringEvaluator(StringEvaluator):

        @property
        def requires_input(self) -> bool:
            return False

        @property
        def requires_reference(self) -> bool:
            return True

        @property
        def evaluation_name(self) -> str:
            return "exact_match"

        def _evaluate_strings(self, prediction, reference=None, input=None, **kwargs) -> dict:
            return {"score": prediction == reference}


    evaluation_config = smith_eval.RunEvalConfig(
        custom_evaluators = [MyStringEvaluator()],
    )

    run_on_dataset(
        client,
        dataset_name="<my_dataset_name>",
        llm_or_chain_factory=construct_chain,
        evaluation=evaluation_config,
    )
zrun_on_dataset(zawait arun_on_dataset()z<my_dataset>)r   r  r   r   rx   rl  )r   dict[str, Any]rx   r   )r   r  rx   re   )r   r&   r   r  rx   r   )r   r&   r   r,   r   r  rx   r   )r   r&   r   rl  r   r  rx   r   )
r   rl  r   rm  r1   r  r   r%   rx   zOptional[list[RunEvaluator]])r   smith_eval.RunEvalConfigr   r  rx   rh  )r   r  r   r  rx   rh  )r   r  r   r  rx   rh  )r  zYUnion[smith_eval_config.SINGLE_EVAL_CONFIG_TYPE, smith_eval_config.CUSTOM_EVALUATOR_TYPE]r  zOptional[BaseLanguageModel]r   r   r   r%   r   r  r   rh  r   rh  r   rh  rx   r    )
r   r  r   r  r   r  r   r  rx   z2tuple[Optional[str], Optional[str], Optional[str]])r   r  r   r   r   r%   r   r  r   r  r   r  rx   zlist[RunEvaluator])r   r   r   r  r  r  r  r   r   r  r   r  rx   Union[str, BaseMessage])r   zUnion[Chain, Runnable]r   r  r  r   r  r  r   r  r   r  rx   zUnion[dict, str])
r   r&   r   r   r   rl  r   r  rx   z'Union[dict, str, LLMResult, ChatResult])r   r   r   r  r  r   r  r  r   r  r   r  rx   r  r  )rX  r   r   r   r   r  rY  r   rZ  r  r  r  rJ  zOptional[Union[str, datetime]]rx   z1tuple[MCF, TracerSession, Dataset, list[Example]])rx   r  )r  ry   rx   r   )rX  zOptional[Client]r   r   r   r  r1   r  rJ  r  r  r  rY  rh  rZ  r  r  r  r  rh  r   r   rx   r  )r<   
__future__r   concurrent.futuresry  dataclassesr  r   loggingrT  r   r   typingr   r   r   r	   r
   r   langchain_core._apir   langchain_core.callbacksr   langchain_core.language_modelsr   langchain_core.messagesr   r   langchain_core.outputsr   r   langchain_core.runnablesr   r   r   r   r2  r   r  !langchain_core.tracers.evaluationr   r    langchain_core.tracers.langchainr   langsmith.clientr   langsmith.envr   r   langsmith.evaluationr   r    r!   r	  langsmith.run_helpersr"   r#   langsmith.schemasr$   r%   r&   r'   r(   langsmith.utilsr)   requestsr*   typing_extensionsr+   langchain.chains.baser,   langchain.evaluation.loadingr-   langchain.evaluation.schemar.   r/   r0   langchain.smithr1   r  langchain.smith.evaluationr  r2   r3   r`   rk   	getLoggerr8   r   re   r  rl  r7  r5   rA   r{   r   r   r   r   r   r   r   r   r   r   r   r  r  r   r+  r3  r:  r@  rB  rE  re  rg  	dataclassrk  r  r  r  r  r  _RUN_ON_DATASET_DOCSTRINGreplacer7   r>   r?   <module>r     s)   D "       '  0 . < C 8 M M > < = # F E L L *  ' ' 7 
 5 B @			8	$Ruh''(dVS[		  HRuh//02CCD3y 3E4 E4P- -  ': 0: :  	: z. b Y  & T 6 61 6 
 6F%(%(%( 2%( 
	%(PDDD 2D 
	D( 3 	
 "<$# 6$$ 4$( &BB
 *B B B )B !B B "B BJ	4$	4#	4 %	4 )		4
 8	4A$AA A )	A
 $A %A AV !%48)-A	AA 	A
 A 2A 'A AR !%48)-!!!! !
 ! 2! '! !R 59555 	5
 25 -5@ !%48)-?	?? ?
 ? 2? '? ?N !%48)-!!!! !
 ! 2! '! !R 59888 	8
 28 -8@ 26 $6:<5<5<5 1<5 	<5
 /<5 <5 4<5 7<5~%  G
 G
 G
T  
: , 6:6:"&15!%><><>< 1><
 3>< 4>< ><  >< />< >< >< >< ><L 6:6:"&15!%J<J<J< 1J<
 3J< 4J< J<  J< /J< J< J< J< J<Zj V 3 3;; r>   