
    Ch                         S r SSKJr  SSKrSSKrSSKrSSKJr  SSKJ	r	  SSK
Jr  \R                  " \5      r " S S\5      rg)	a@  
This file contains deprecated code that can only be used with the old `model.fit`-style Sentence Transformers v2.X training.
It exists for backwards compatibility with the `model.old_fit` method, but will be removed in a future version.

Nowadays, with Sentence Transformers v3+, it is recommended to use the `SentenceTransformerTrainer` class to train models.
See https://www.sbert.net/docs/sentence_transformer/training_overview.html for more information.

Instead, you should create a `datasets` `Dataset` for training: https://huggingface.co/docs/datasets/create_dataset
    )annotationsN)Dataset)SentenceTransformer)InputExamplec                      \ rS rSrSr  S       SS jjr S         SS jjr   S       SS jjrS rS r	S	 r
S
 rS rSrg)ParallelSentencesDataset   u`  
This dataset reader can be used to read-in parallel sentences, i.e., it reads in a file with tab-seperated sentences with the same
sentence in different languages. For example, the file can look like this (EN       DE      ES):
hello world     hallo welt  hola mundo
second sentence zweiter satz    segunda oración

The sentence in the first column will be mapped to a sentence embedding using the given the embedder. For example,
embedder is a mono-lingual sentence embedding method for English. The sentences in the other languages will also be
mapped to this English sentence embedding.

When getting a sample from the dataset, we get one sentence with the according sentence embedding for this sentence.

teacher_model can be any class that implement an encode function. The encode function gets a list of sentences and
returns a list of sentence embeddings
c                    Xl         X l        / U l        / U l        / U l        / U l        / U l        / U l        X0l        X@l	        0 U l
        SU l        g)a  
Parallel sentences dataset reader to train student model given a teacher model

Args:
    student_model (SentenceTransformer): The student sentence embedding model that should be trained.
    teacher_model (SentenceTransformer): The teacher model that provides the sentence embeddings for the first column in the dataset file.
    batch_size (int, optional): The batch size for training. Defaults to 8.
    use_embedding_cache (bool, optional): Whether to use an embedding cache. Defaults to True.
r   N)student_modelteacher_modeldatasetsdatasets_iteratordatasets_tokenizeddataset_indicescopy_dataset_indicescache
batch_sizeuse_embedding_cacheembedding_cachenum_sentences)selfr   r   r   r   s        o/var/www/html/shao/venv/lib/python3.13/site-packages/sentence_transformers/datasets/ParallelSentencesDataset.py__init__!ParallelSentencesDataset.__init__*   sY      +*!#"$!$&!
$#6 !    Nc           
        [         R                  SU-   5        / nUR                  S5      (       a  [        R                  " USSS9O	[	        USS9 nSnU H  nUR                  5       R                  S5      n	Ub0  US:  a*  [        U	 V
s/ sH  n
[        U
5      PM     sn
5      U:  a  MU  UR                  U	5        US	-  nUc  Mp  US:  d  Mx  Xs:  d  M    O   SSS5        U R                  XRX4S
9  gs  sn
f ! , (       d  f       N$= f)a~  
Reads in a tab-seperated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column

Args:
    filepath (str): Filepath to the file.
    weight (int, optional): If more than one dataset is loaded with load_data, specifies the frequency at which data should be sampled from this dataset. Defaults to 100.
    max_sentences (int, optional): Maximum number of lines to be read from the filepath. Defaults to None.
    max_sentence_length (int, optional): Skip the example if one of the sentences has more characters than max_sentence_length. Defaults to 128.

Returns:
    None
zLoad z.gzrtutf8)encodingr   	N   )weightmax_sentencesmax_sentence_length)loggerinfoendswithgzipopenstripsplitmaxlenappendadd_dataset)r   filepathr"   r#   r$   parallel_sentencesfIncountline	sentencessents              r   	load_data"ParallelSentencesDataset.load_dataG   s     	Gh&'   '' IIhv6h0147E JJL..t4	'3+a/9=94SY9=>ATT")))4
 ,1BuG] 1$ 	] 	 	
 >1 1s0   9C7
C2$C7C7C7C72C77
Dc                   0 nU H  nUb0  US:  a*  [        U Vs/ sH  n[        U5      PM     sn5      U:  a  M6  US   nX;  a  [        5       XX'   U H  nXX   R                  U5        M     Uc  Mm  US:  d  Mu  [        U5      U:  d  M    O   [        U5      S:X  a  g U =R                  [        U Vs/ sH  n[        XW   5      PM     sn5      -  sl        [        U R                  5      n	U R                  R                  [        UR                  5       5      5        U R                  R                  S5        U R                  R                  U	/U-  5        g s  snf s  snf Nr   )r,   r-   setaddr   sumr   r.   listitemsr   r   extend)
r   r1   r"   r#   r$   sentences_mapr5   r6   source_sentence
dataset_ids
             r   r/   $ParallelSentencesDataset.add_datasetq   s?    +I#/'!+y9ytTy9:=PP'lO314.!.2248 " (]Q->3}CUYfCf! ,$ }"c"V3}':#;"VWW'
T-"5"5"789%%a(##ZL6$9:- :" #Ws   E
4Ec           	        / n/ nU R                    H8  nU R                  U5      u  pEUR                  U5        UR                  U5        M:     U R                  U5      n[	        Xb5       H2  u  puU H'  nU R
                  R                  [        U/US95        M)     M4     [        R                  " U R
                  5        g )N)textslabel)	r   
next_entryr.   get_embeddingszipr   r   randomshuffle)	r   source_sentences_listtarget_sentences_listdata_idxsrc_sentencetrg_sentencessrc_embeddingssrc_embeddingtrg_sentences	            r   generate_data&ParallelSentencesDataset.generate_data   s     " ",,H*.//(*C'L!((6!((7 - ,,-BC,/,V(M -

!!,l^="YZ !. -W 	tzz"r   c                .   U R                   U   U R                  U      u  p#U R                  U==   S-  ss'   U R                  U   [        U R                   U   5      :  a2  SU R                  U'   [        R                  " U R                   U   5        X#4$ )Nr!   r   )r   r   r-   rK   rL   )r   rO   sourcetarget_sentencess       r   rH   #ParallelSentencesDataset.next_entry   s    #'==#:4;Q;QRZ;[#\ x(A-(!!(+s4==3J/KK/0D""8,NN4==23''r   c                   U R                   (       d%  U R                  R                  XR                  SSS9$ / nU H%  nX0R                  ;  d  M  UR                  U5        M'     [        U5      S:  aG  U R                  R                  X R                  SSS9n[        X$5       H  u  p5XPR                  U'   M     U Vs/ sH  o0R                  U   PM     sn$ s  snf )NFT)r   show_progress_barconvert_to_numpyr   )r   r   encoder   r   r.   r-   rJ   )r   r5   new_sentencesr6   new_embeddings	embeddings         r   rI   'ParallelSentencesDataset.get_embeddings   s    ''%%,,ooae -  
 D///$$T*  }!!//66//Uei 7 N $'}#E-6$$T* $F 8AAyt$$T*yAAAs   >Cc                    U R                   $ )N)r   )r   s    r   __len__ ParallelSentencesDataset.__len__   s    !!!r   c                    [        U R                  5      S:X  a  U R                  5         U R                  R                  5       $ r:   )r-   r   rU   pop)r   idxs     r   __getitem__$ParallelSentencesDataset.__getitem__   s/    tzz?a zz~~r   )r   r   r   r   r   r   r   r   r   r   r   r   )   T)r   r   r   r   r   intr   bool)d   N   )
r0   strr"   rl   r#   rl   r$   rl   returnNone)r1   zlist[list[str]]r"   rl   r#   rl   r$   rl   )__name__
__module____qualname____firstlineno____doc__r   r7   r/   rU   rH   rI   rd   ri   __static_attributes__ r   r   r   r      s    ( $(* + 	
 "< gj(
(
%((
?B(
`c(
	(
Z !#&";+"; "; 	";
 !";H#"(B*" r   r   )rw   
__future__r   r(   loggingrK   torch.utils.datar   sentence_transformersr   sentence_transformers.readersr   	getLoggerrs   r%   r   ry   r   r   <module>r      s?    #    $ 5 6			8	$s w s r   