
from __future__ import annotations

from collections.abc import Iterable

import torch
from torch import Tensor, nn

from sentence_transformers import SentenceTransformer, util


class DistillKLDivLoss(nn.Module):
    def __init__(
        self, model: SentenceTransformer, similarity_fct=util.pairwise_dot_score, temperature: float = 1.0
    ) -> None:
        """
Compute the KL divergence loss between probability distributions derived from student and teacher models' similarity scores.
By default, similarity is calculated using the dot-product. This loss is designed for knowledge distillation
where a smaller student model learns from a more powerful teacher model.

The loss computes softmax probabilities from the teacher similarity scores and log-softmax probabilities
from the student model, then calculates the KL divergence between these distributions.
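
A minimal sketch of this distribution matching, written with plain PyTorch ops (the tensor
names and shapes below are illustrative only, not part of the public API)::

    import torch
    import torch.nn.functional as F

    temperature = 1.0
    student_scores = torch.randn(32, 3)  # query vs. (positive, negative_1, negative_2), scored by the student
    teacher_scores = torch.randn(32, 3)  # same layout, scored by the teacher

    student_log_probs = F.log_softmax(student_scores / temperature, dim=1)
    teacher_probs = F.softmax(teacher_scores / temperature, dim=1)
    # KL(teacher || student), rescaled by temperature**2 as is common in distillation setups
    loss = F.kl_div(student_log_probs, teacher_probs, reduction="batchmean") * temperature**2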

Args:
    model: SentenceTransformer model (student model)
    similarity_fct: Which similarity function to use for the student model
    temperature: Temperature parameter used to soften the probability distributions (a higher
        temperature yields softer distributions; see the short illustration after this list).
        A temperature of 1.0 does not scale the scores. Note: in the v5.0.1 release, the default temperature was changed from 2.0 to 1.0.
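
For intuition on the temperature, here is a purely illustrative example of how dividing the
scores by a larger temperature flattens the resulting softmax distribution::

    import torch

    scores = torch.tensor([[4.0, 1.0, 0.5]])
    torch.softmax(scores / 1.0, dim=1)  # sharp:  roughly [0.93, 0.05, 0.03]
    torch.softmax(scores / 4.0, dim=1)  # softer: roughly [0.53, 0.25, 0.22]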

References:
    - For more details, please refer to https://arxiv.org/abs/2010.11386

Requirements:
    1. (query, positive, negative_1, ..., negative_n) examples
    2. Labels containing teacher model's scores between query-positive and query-negative pairs

Inputs:
    +------------------------------------------------+------------------------------------------------------------+
    | Texts                                          | Labels                                                     |
    +================================================+============================================================+
    | (query, positive, negative)                    | [Teacher(query, positive), Teacher(query, negative)]       |
    +------------------------------------------------+------------------------------------------------------------+
    | (query, positive, negative_1, ..., negative_n) | [Teacher(query, positive), Teacher(query, negative_i)...]  |
    +------------------------------------------------+------------------------------------------------------------+

Relations:
    - Similar to :class:`~sentence_transformers.losses.MarginMSELoss` but uses KL divergence instead of MSE
    - More suited for distillation tasks where preserving ranking is important

Example:

    Using a teacher model to compute similarity scores for distillation:

    ::

        from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
        from datasets import Dataset
        import torch

        student_model = SentenceTransformer("microsoft/mpnet-base")
        teacher_model = SentenceTransformer("all-mpnet-base-v2")
        train_dataset = Dataset.from_dict({
            "query": ["It's nice weather outside today.", "He drove to work."],
            "positive": ["It's so sunny.", "He took the car to work."],
            "negative": ["It's very cold.", "She walked to the store."],
        })

        def compute_labels(batch):
            emb_queries = teacher_model.encode(batch["query"])
            emb_positives = teacher_model.encode(batch["positive"])
            emb_negatives = teacher_model.encode(batch["negative"])

            pos_scores = teacher_model.similarity_pairwise(emb_queries, emb_positives)
            neg_scores = teacher_model.similarity_pairwise(emb_queries, emb_negatives)

            # Stack the scores for positive and negative pairs
            return {
                "label": torch.stack([pos_scores, neg_scores], dim=1)
            }

        train_dataset = train_dataset.map(compute_labels, batched=True)
        loss = losses.DistillKLDivLoss(student_model)

        trainer = SentenceTransformerTrainer(model=student_model, train_dataset=train_dataset, loss=loss)
        trainer.train()

    With multiple negatives:

    ::

        from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, losses
        from datasets import Dataset
        import torch

        student_model = SentenceTransformer("microsoft/mpnet-base")
        teacher_model = SentenceTransformer("all-mpnet-base-v2")

        train_dataset = Dataset.from_dict(
            {
                "query": ["It's nice weather outside today.", "He drove to work."],
                "positive": ["It's so sunny.", "He took the car to work."],
                "negative1": ["It's very cold.", "She walked to the store."],
                "negative2": ["Its rainy", "She took the bus"],
            }
        )


        def compute_labels(batch):
            emb_queries = teacher_model.encode(batch["query"])
            emb_positives = teacher_model.encode(batch["positive"])
            emb_negatives1 = teacher_model.encode(batch["negative1"])
            emb_negatives2 = teacher_model.encode(batch["negative2"])

            pos_scores = teacher_model.similarity_pairwise(emb_queries, emb_positives)
            neg_scores1 = teacher_model.similarity_pairwise(emb_queries, emb_negatives1)
            neg_scores2 = teacher_model.similarity_pairwise(emb_queries, emb_negatives2)

            # Stack the scores for positive and multiple negative pairs
            return {
                "label": torch.stack([pos_scores, neg_scores1, neg_scores2], dim=1)
            }

        train_dataset = train_dataset.map(compute_labels, batched=True)
        loss = losses.DistillKLDivLoss(student_model)

        trainer = SentenceTransformerTrainer(model=student_model, train_dataset=train_dataset, loss=loss)
        trainer.train()
        """
        super().__init__()
        self.model = model
        self.similarity_fct = similarity_fct
        self.temperature = temperature
        self.loss_fct = nn.KLDivLoss(reduction="batchmean")

    def forward(self, sentence_features: Iterable[dict[str, Tensor]], labels: Tensor) -> Tensor:
        embeddings = [self.model(sentence_feature)["sentence_embedding"] for sentence_feature in sentence_features]
        return self.compute_loss_from_embeddings(embeddings, labels)

    def compute_loss_from_embeddings(self, embeddings: list[Tensor], labels: Tensor) -> Tensor:
        embeddings_query = embeddings[0]

        # Student similarity between the query and each remaining column (positive, negative_1, ..., negative_n)
        student_scores = torch.stack(
            [self.similarity_fct(embeddings_query, embeddings_other) for embeddings_other in embeddings[1:]], dim=1
        )

        # Temperature-scaled log-probabilities for the student, probabilities for the teacher
        student_scores = student_scores / self.temperature
        student_log_probs = torch.log_softmax(student_scores, dim=1)

        teacher_scores = labels / self.temperature
        teacher_probs = torch.softmax(teacher_scores, dim=1)

        # KL divergence between the two distributions, rescaled by temperature**2 so that the
        # gradient magnitude does not shrink when a larger temperature is used
        loss = self.loss_fct(student_log_probs, teacher_probs)
        loss = loss * (self.temperature**2)
        return loss

    @property
    def citation(self) -> str:
        return """
@misc{lin2020distillingdenserepresentationsranking,
      title={Distilling Dense Representations for Ranking using Tightly-Coupled Teachers},
      author={Sheng-Chieh Lin and Jheng-Hong Yang and Jimmy Lin},
      year={2020},
      eprint={2010.11386},
      archivePrefix={arXiv},
      primaryClass={cs.IR},
      url={https://arxiv.org/abs/2010.11386},
}
"""