
from __future__ import annotations

import logging
import os
from typing import TYPE_CHECKING, Any, Callable, Literal

import numpy as np
from torch import Tensor
from tqdm import tqdm

from sentence_transformers.evaluation.InformationRetrievalEvaluator import InformationRetrievalEvaluator
from sentence_transformers.evaluation.SentenceEvaluator import SentenceEvaluator
from sentence_transformers.similarity_functions import SimilarityFunction
from sentence_transformers.util import is_datasets_available

if TYPE_CHECKING:
    from sentence_transformers.SentenceTransformer import SentenceTransformer

logger = logging.getLogger(__name__)

DatasetNameType = Literal[
    "climatefever",
    "dbpedia",
    "fever",
    "fiqa2018",
    "hotpotqa",
    "msmarco",
    "nfcorpus",
    "nq",
    "quoraretrieval",
    "scidocs",
    "arguana",
    "scifact",
    "touche2020",
]

dataset_name_to_id = {
    "climatefever": "zeta-alpha-ai/NanoClimateFEVER",
    "dbpedia": "zeta-alpha-ai/NanoDBPedia",
    "fever": "zeta-alpha-ai/NanoFEVER",
    "fiqa2018": "zeta-alpha-ai/NanoFiQA2018",
    "hotpotqa": "zeta-alpha-ai/NanoHotpotQA",
    "msmarco": "zeta-alpha-ai/NanoMSMARCO",
    "nfcorpus": "zeta-alpha-ai/NanoNFCorpus",
    "nq": "zeta-alpha-ai/NanoNQ",
    "quoraretrieval": "zeta-alpha-ai/NanoQuoraRetrieval",
    "scidocs": "zeta-alpha-ai/NanoSCIDOCS",
    "arguana": "zeta-alpha-ai/NanoArguAna",
    "scifact": "zeta-alpha-ai/NanoSciFact",
    "touche2020": "zeta-alpha-ai/NanoTouche2020",
}

dataset_name_to_human_readable = {
    "climatefever": "ClimateFEVER",
    "dbpedia": "DBPedia",
    "fever": "FEVER",
    "fiqa2018": "FiQA2018",
    "hotpotqa": "HotpotQA",
    "msmarco": "MSMARCO",
    "nfcorpus": "NFCorpus",
    "nq": "NQ",
    "quoraretrieval": "QuoraRetrieval",
    "scidocs": "SCIDOCS",
    "arguana": "ArguAna",
    "scifact": "SciFact",
    "touche2020": "Touche2020",
}


class NanoBEIREvaluator(SentenceEvaluator):
    """
This class evaluates the performance of a SentenceTransformer Model on the NanoBEIR collection of Information Retrieval datasets.

The collection is a set of datasets based on the BEIR collection, but with a significantly smaller size, so it can
be used for quickly evaluating the retrieval performance of a model before committing to a full evaluation.
The datasets are available on Hugging Face in the `NanoBEIR collection <https://huggingface.co/collections/zeta-alpha-ai/nanobeir-66e1a0af21dfd93e620cd9f6>`_.
This evaluator will return the same metrics as the InformationRetrievalEvaluator (i.e., MRR, nDCG, Recall@k), for each dataset and on average.

Args:
    dataset_names (List[str]): The names of the datasets to evaluate on. Defaults to all datasets.
    mrr_at_k (List[int]): A list of integers representing the values of k for MRR calculation. Defaults to [10].
    ndcg_at_k (List[int]): A list of integers representing the values of k for NDCG calculation. Defaults to [10].
    accuracy_at_k (List[int]): A list of integers representing the values of k for accuracy calculation. Defaults to [1, 3, 5, 10].
    precision_recall_at_k (List[int]): A list of integers representing the values of k for precision and recall calculation. Defaults to [1, 3, 5, 10].
    map_at_k (List[int]): A list of integers representing the values of k for MAP calculation. Defaults to [100].
    show_progress_bar (bool): Whether to show a progress bar during evaluation. Defaults to False.
    batch_size (int): The batch size for evaluation. Defaults to 32.
    write_csv (bool): Whether to write the evaluation results to a CSV file. Defaults to True.
    truncate_dim (int, optional): The dimension to truncate the embeddings to. Defaults to None.
    score_functions (Dict[str, Callable[[Tensor, Tensor], Tensor]]): A dictionary mapping score function names to score functions. Defaults to None, in which case the model's ``similarity`` function is used.
    main_score_function (Union[str, SimilarityFunction], optional): The main score function to use for evaluation. Defaults to None.
    aggregate_fn (Callable[[list[float]], float]): The function to aggregate the scores. Defaults to np.mean.
    aggregate_key (str): The key to use for the aggregated score. Defaults to "mean".
    query_prompts (str | dict[str, str], optional): The prompts to add to the queries. If a string, will add the same prompt to all queries. If a dict, expects that all datasets in dataset_names are keys.
    corpus_prompts (str | dict[str, str], optional): The prompts to add to the corpus. If a string, will add the same prompt to all corpus passages. If a dict, expects that all datasets in dataset_names are keys.
    write_predictions (bool): Whether to write the predictions to a JSONL file. Defaults to False.
        This can be useful for downstream evaluation as it can be used as input to the :class:`~sentence_transformers.sparse_encoder.evaluation.ReciprocalRankFusionEvaluator` that accepts precomputed predictions.

Example:
    ::

        from sentence_transformers import SentenceTransformer
        from sentence_transformers.evaluation import NanoBEIREvaluator

        model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')

        datasets = ["QuoraRetrieval", "MSMARCO"]
        query_prompts = {
            "QuoraRetrieval": "Instruct: Given a question, retrieve questions that are semantically equivalent to the given question\nQuery: ",
            "MSMARCO": "Instruct: Given a web search query, retrieve relevant passages that answer the query\nQuery: "
        }

        evaluator = NanoBEIREvaluator(
            dataset_names=datasets,
            query_prompts=query_prompts,
        )

        results = evaluator(model)
        '''
        NanoBEIR Evaluation of the model on ['QuoraRetrieval', 'MSMARCO'] dataset:
        Evaluating NanoQuoraRetrieval
        Information Retrieval Evaluation of the model on the NanoQuoraRetrieval dataset:
        Queries: 50
        Corpus: 5046

        Score-Function: cosine
        Accuracy@1: 92.00%
        Accuracy@3: 98.00%
        Accuracy@5: 100.00%
        Accuracy@10: 100.00%
        Precision@1: 92.00%
        Precision@3: 40.67%
        Precision@5: 26.00%
        Precision@10: 14.00%
        Recall@1: 81.73%
        Recall@3: 94.20%
        Recall@5: 97.93%
        Recall@10: 100.00%
        MRR@10: 0.9540
        NDCG@10: 0.9597
        MAP@100: 0.9395

        Evaluating NanoMSMARCO
        Information Retrieval Evaluation of the model on the NanoMSMARCO dataset:
        Queries: 50
        Corpus: 5043

        Score-Function: cosine
        Accuracy@1: 40.00%
        Accuracy@3: 74.00%
        Accuracy@5: 78.00%
        Accuracy@10: 88.00%
        Precision@1: 40.00%
        Precision@3: 24.67%
        Precision@5: 15.60%
        Precision@10: 8.80%
        Recall@1: 40.00%
        Recall@3: 74.00%
        Recall@5: 78.00%
        Recall@10: 88.00%
        MRR@10: 0.5849
        NDCG@10: 0.6572
        MAP@100: 0.5892
        Average Queries: 50.0
        Average Corpus: 5044.5

        Aggregated for Score Function: cosine
        Accuracy@1: 66.00%
        Accuracy@3: 86.00%
        Accuracy@5: 89.00%
        Accuracy@10: 94.00%
        Precision@1: 66.00%
        Recall@1: 60.87%
        Precision@3: 32.67%
        Recall@3: 84.10%
        Precision@5: 20.80%
        Recall@5: 87.97%
        Precision@10: 11.40%
        Recall@10: 94.00%
        MRR@10: 0.7694
        NDCG@10: 0.8085
        '''
        print(evaluator.primary_metric)
        # => "NanoBEIR_mean_cosine_ndcg@10"
        print(results[evaluator.primary_metric])
        # => 0.8084508771660436
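        # The per-dataset metrics are returned alongside the aggregated ones, keyed by
        # the sub-evaluator name; the exact key shown here is illustrative:
        # print(results["NanoMSMARCO_cosine_ndcg@10"])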
    """

    information_retrieval_class = InformationRetrievalEvaluator

    def __init__(
        self,
        dataset_names: list[DatasetNameType] | None = None,
        mrr_at_k: list[int] = [10],
        ndcg_at_k: list[int] = [10],
        accuracy_at_k: list[int] = [1, 3, 5, 10],
        precision_recall_at_k: list[int] = [1, 3, 5, 10],
        map_at_k: list[int] = [100],
        show_progress_bar: bool = False,
        batch_size: int = 32,
        write_csv: bool = True,
        truncate_dim: int | None = None,
        score_functions: dict[str, Callable[[Tensor, Tensor], Tensor]] | None = None,
        main_score_function: str | SimilarityFunction | None = None,
        aggregate_fn: Callable[[list[float]], float] = np.mean,
        aggregate_key: str = "mean",
        query_prompts: str | dict[str, str] | None = None,
        corpus_prompts: str | dict[str, str] | None = None,
        write_predictions: bool = False,
    ):
        super().__init__()
        if dataset_names is None:
            dataset_names = list(dataset_name_to_id.keys())
        self.dataset_names = dataset_names
        self.aggregate_fn = aggregate_fn
        self.aggregate_key = aggregate_key
        self.truncate_dim = truncate_dim
        self.query_prompts = query_prompts
        self.corpus_prompts = corpus_prompts
        self.show_progress_bar = show_progress_bar
        self.write_csv = write_csv
        self.score_functions = score_functions
        self.main_score_function = main_score_function
        self.score_function_names = sorted(list(self.score_functions.keys())) if score_functions else []
        self.write_predictions = write_predictions

        self.name = f"NanoBEIR_{aggregate_key}"
        if self.truncate_dim:
            self.name += f"_{self.truncate_dim}"

        self.mrr_at_k = mrr_at_k
        self.ndcg_at_k = ndcg_at_k
        self.accuracy_at_k = accuracy_at_k
        self.precision_recall_at_k = precision_recall_at_k
        self.map_at_k = map_at_k

        self._validate_dataset_names()
        self._validate_prompts()

        ir_evaluator_kwargs = {
            "mrr_at_k": mrr_at_k,
            "ndcg_at_k": ndcg_at_k,
            "accuracy_at_k": accuracy_at_k,
            "precision_recall_at_k": precision_recall_at_k,
            "map_at_k": map_at_k,
            "show_progress_bar": show_progress_bar,
            "batch_size": batch_size,
            "write_csv": write_csv,
            "truncate_dim": truncate_dim,
            "score_functions": score_functions,
            "main_score_function": main_score_function,
            "write_predictions": write_predictions,
        }
        self.evaluators = [
            self._load_dataset(name, **ir_evaluator_kwargs)
            for name in tqdm(self.dataset_names, desc="Loading NanoBEIR datasets", leave=False)
        ]

        self.csv_file = f"NanoBEIR_evaluation_{aggregate_key}_results.csv"
        self.csv_headers = ["epoch", "steps"]
        self._append_csv_headers(self.score_function_names)

    def _append_csv_headers(self, score_function_names) -> None:
        for score_name in score_function_names:
            for k in self.accuracy_at_k:
                self.csv_headers.append(f"{score_name}-Accuracy@{k}")

            for k in self.precision_recall_at_k:
                self.csv_headers.append(f"{score_name}-Precision@{k}")
                self.csv_headers.append(f"{score_name}-Recall@{k}")

            for k in self.mrr_at_k:
                self.csv_headers.append(f"{score_name}-MRR@{k}")

            for k in self.ndcg_at_k:
                self.csv_headers.append(f"{score_name}-NDCG@{k}")

            for k in self.map_at_k:
                self.csv_headers.append(f"{score_name}-MAP@{k}")

    def __call__(
        self,
        model: SentenceTransformer,
        output_path: str | None = None,
        epoch: int = -1,
        steps: int = -1,
        *args,
        **kwargs,
    ) -> dict[str, float]:
        per_metric_results = {}
        per_dataset_results = {}
        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps"
        else:
            out_txt = ""
        if self.truncate_dim is not None:
            out_txt += f" (truncated to {self.truncate_dim})"

        logger.info(f"NanoBEIR Evaluation of the model on {self.dataset_names} dataset{out_txt}:")

        if self.score_functions is None:
            self.score_functions = {model.similarity_fn_name: model.similarity}
            self.score_function_names = [model.similarity_fn_name]
            self._append_csv_headers(self.score_function_names)

        # Sub-evaluator keys look like "NanoMSMARCO_cosine_ndcg@10" (or "NanoMSMARCO_128_..." when
        # truncating), so strip the dataset-specific prefix before aggregating across datasets.
        num_underscores_in_name = self.name.count("_")
        for evaluator in tqdm(self.evaluators, desc="Evaluating datasets", disable=not self.show_progress_bar):
            logger.info(f"Evaluating {evaluator.name}")
            evaluation = evaluator(model, output_path, epoch, steps)
            for full_key, metric_value in evaluation.items():
                splits = full_key.split("_", maxsplit=num_underscores_in_name)
                metric = splits[-1]
                if metric not in per_metric_results:
                    per_metric_results[metric] = []
                per_dataset_results[full_key] = metric_value
                per_metric_results[metric].append(metric_value)

        agg_results = {}
        for metric in per_metric_results:
            agg_results[metric] = self.aggregate_fn(per_metric_results[metric])

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                fOut = open(csv_path, mode="w", encoding="utf-8")
                fOut.write(",".join(self.csv_headers))
                fOut.write("\n")
            else:
                fOut = open(csv_path, mode="a", encoding="utf-8")

            output_data = [epoch, steps]
            for name in self.score_function_names:
                for k in self.accuracy_at_k:
                    output_data.append(agg_results[f"{name}_accuracy@{k}"])

                for k in self.precision_recall_at_k:
                    output_data.append(agg_results[f"{name}_precision@{k}"])
                    output_data.append(agg_results[f"{name}_recall@{k}"])

                for k in self.mrr_at_k:
                    output_data.append(agg_results[f"{name}_mrr@{k}"])

                for k in self.ndcg_at_k:
                    output_data.append(agg_results[f"{name}_ndcg@{k}"])

                for k in self.map_at_k:
                    output_data.append(agg_results[f"{name}_map@{k}"])

            fOut.write(",".join(map(str, output_data)))
            fOut.write("\n")
            fOut.close()

        if not self.primary_metric:
            if self.main_score_function is None:
                score_function = max(
                    [
                        (name, agg_results[f"{name}_ndcg@{max(self.ndcg_at_k)}"])
                        for name in self.score_function_names
                    ],
                    key=lambda x: x[1],
                )[0]
                self.primary_metric = f"{score_function}_ndcg@{max(self.ndcg_at_k)}"
            else:
                self.primary_metric = f"{self.main_score_function.value}_ndcg@{max(self.ndcg_at_k)}"

        avg_queries = np.mean([len(evaluator.queries) for evaluator in self.evaluators])
        avg_corpus = np.mean([len(evaluator.corpus) for evaluator in self.evaluators])
        logger.info(f"Average Queries: {avg_queries}")
        logger.info(f"Average Corpus: {avg_corpus}\n")

        for name in self.score_function_names:
            logger.info(f"Aggregated for Score Function: {name}")
            for k in self.accuracy_at_k:
                logger.info("Accuracy@{}: {:.2f}%".format(k, agg_results[f"{name}_accuracy@{k}"] * 100))

            for k in self.precision_recall_at_k:
                logger.info("Precision@{}: {:.2f}%".format(k, agg_results[f"{name}_precision@{k}"] * 100))
                logger.info("Recall@{}: {:.2f}%".format(k, agg_results[f"{name}_recall@{k}"] * 100))

            for k in self.mrr_at_k:
                logger.info("MRR@{}: {:.4f}".format(k, agg_results[f"{name}_mrr@{k}"]))

            for k in self.ndcg_at_k:
                logger.info("NDCG@{}: {:.4f}".format(k, agg_results[f"{name}_ndcg@{k}"]))

            for k in self.map_at_k:
                logger.info("MAP@{}: {:.4f}".format(k, agg_results[f"{name}_map@{k}"]))

        agg_results = self.prefix_name_to_metrics(agg_results, self.name)
        self.store_metrics_in_model_card_data(model, agg_results, epoch, steps)

        per_dataset_results.update(agg_results)

        return per_dataset_results

    def _get_human_readable_name(self, dataset_name: DatasetNameType) -> str:
        human_readable_name = f"Nano{dataset_name_to_human_readable[dataset_name.lower()]}"
        if self.truncate_dim is not None:
            human_readable_name += f"_{self.truncate_dim}"
        return human_readable_name

    def _load_dataset(self, dataset_name: DatasetNameType, **ir_evaluator_kwargs) -> InformationRetrievalEvaluator:
        if not is_datasets_available():
            raise ValueError(
                "datasets is not available. Please install it to use the NanoBEIREvaluator via `pip install datasets`."
            )
        from datasets import load_dataset

        dataset_path = dataset_name_to_id[dataset_name.lower()]
        corpus = load_dataset(dataset_path, "corpus", split="train")
        queries = load_dataset(dataset_path, "queries", split="train")
        qrels = load_dataset(dataset_path, "qrels", split="train")
        corpus_dict = {sample["_id"]: sample["text"] for sample in corpus if len(sample["text"]) > 0}
        queries_dict = {sample["_id"]: sample["text"] for sample in queries if len(sample["text"]) > 0}
        qrels_dict = {}
        for sample in qrels:
            if sample["query-id"] not in qrels_dict:
                qrels_dict[sample["query-id"]] = set()
            qrels_dict[sample["query-id"]].add(sample["corpus-id"])

        if self.query_prompts is not None:
            ir_evaluator_kwargs["query_prompt"] = self.query_prompts.get(dataset_name, None)
        if self.corpus_prompts is not None:
            ir_evaluator_kwargs["corpus_prompt"] = self.corpus_prompts.get(dataset_name, None)
        human_readable_name = self._get_human_readable_name(dataset_name)
        return self.information_retrieval_class(
            queries=queries_dict,
            corpus=corpus_dict,
            relevant_docs=qrels_dict,
            name=human_readable_name,
            **ir_evaluator_kwargs,
        )

    def _validate_dataset_names(self) -> None:
        if len(self.dataset_names) == 0:
            raise ValueError("dataset_names cannot be empty. Use None to evaluate on all datasets.")
        if missing_datasets := [
            dataset_name for dataset_name in self.dataset_names if dataset_name.lower() not in dataset_name_to_id
        ]:
            raise ValueError(
                f"Dataset(s) {missing_datasets} not found in the NanoBEIR collection. "
                f"Valid dataset names are: {list(dataset_name_to_id.keys())}"
            )

    def _validate_prompts(self) -> None:
        error_msg = ""
        if self.query_prompts is not None:
            if isinstance(self.query_prompts, str):
                self.query_prompts = {dataset_name: self.query_prompts for dataset_name in self.dataset_names}
            elif missing_query_prompts := [
                dataset_name for dataset_name in self.dataset_names if dataset_name not in self.query_prompts
            ]:
                error_msg += f"The following datasets are missing query prompts: {missing_query_prompts}\n"

        if self.corpus_prompts is not None:
            if isinstance(self.corpus_prompts, str):
                self.corpus_prompts = {dataset_name: self.corpus_prompts for dataset_name in self.dataset_names}
            elif missing_corpus_prompts := [
                dataset_name for dataset_name in self.dataset_names if dataset_name not in self.corpus_prompts
            ]:
                error_msg += f"The following datasets are missing corpus prompts: {missing_corpus_prompts}\n"

        if error_msg:
            raise ValueError(error_msg.strip())

    def store_metrics_in_model_card_data(self, *args, **kwargs) -> None:
        # Only store the aggregated metrics in the model card when more than one dataset was evaluated
        if len(self.dataset_names) > 1:
            super().store_metrics_in_model_card_data(*args, **kwargs)

    def get_config_dict(self) -> dict[str, Any]:
        config_dict = {"dataset_names": self.dataset_names}
        config_dict_candidate_keys = ["truncate_dim", "query_prompts", "corpus_prompts"]
        for key in config_dict_candidate_keys:
            if getattr(self, key) is not None:
                config_dict[key] = getattr(self, key)
        return config_dict