o
    sh3                     @   sf   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZ d dl	m
Z
 eeZG dd deZdS )    N)List)Dataset)SentenceTransformer)InputExamplec                   @   s   e Zd ZdZ		d!dedededefdd	Z	d"dededededdf
ddZ		
		d"de
e
e  dededefddZdd Zdd Zdd Zdd Zdd  ZdS )#ParallelSentencesDatasetu  
    This dataset reader can be used to read-in parallel sentences, i.e., it reads in a file with tab-seperated sentences with the same
    sentence in different languages. For example, the file can look like this (EN	DE	ES):
    hello world     hallo welt  hola mundo
    second sentence zweiter satz    segunda oración

    The sentence in the first column will be mapped to a sentence embedding using the given the embedder. For example,
    embedder is a mono-lingual sentence embedding method for English. The sentences in the other languages will also be
    mapped to this English sentence embedding.

    When getting a sample from the dataset, we get one sentence with the according sentence embedding for this sentence.

    teacher_model can be any class that implement an encode function. The encode function gets a list of sentences and
    returns a list of sentence embeddings
       Tstudent_modelteacher_model
batch_sizeuse_embedding_cachec                 C   sL   || _ || _g | _g | _g | _g | _g | _g | _|| _|| _	i | _
d| _dS )a+  
        Parallel sentences dataset reader to train student model given a teacher model

        Args:
            student_model (SentenceTransformer): The student sentence embedding model that should be trained.
            teacher_model (SentenceTransformer): The teacher model that provides the sentence embeddings for the first column in the dataset file.
            batch_size (int, optional): The batch size for training. Defaults to 8.
            use_embedding_cache (bool, optional): Whether to use an embedding cache. Defaults to True.
        r   N)r   r	   datasetsdatasets_iteratordatasets_tokenizeddataset_indicescopy_dataset_indicescacher
   r   embedding_cachenum_sentences)selfr   r	   r
   r    r   u/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/sentence_transformers/datasets/ParallelSentencesDataset.py__init__   s   
z!ParallelSentencesDataset.__init__d   N   filepathweightmax_sentencesmax_sentence_lengthreturnc           
      C   s   t d|  g }|drtj|dddnt|ddA}d}|D ]4}| d}	|dur>|dkr>td	d
 |	D |kr>q!||	 |d7 }|durU|dkrU||krU nq!W d   n1 s`w   Y  | j	||||d dS )a  
        Reads in a tab-seperated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column

        Args:
            filepath (str): Filepath to the file.
            weight (int, optional): If more than one dataset is loaded with load_data, specifies the frequency at which data should be sampled from this dataset. Defaults to 100.
            max_sentences (int, optional): Maximum number of lines to be read from the filepath. Defaults to None.
            max_sentence_length (int, optional): Skip the example if one of the sentences has more characters than max_sentence_length. Defaults to 128.

        Returns:
            None
        zLoad z.gzrtutf8)encodingr   	Nc                 S      g | ]}t |qS r   len.0sentr   r   r   
<listcomp>X       z6ParallelSentencesDataset.load_data.<locals>.<listcomp>   )r   r   r   )
loggerinfoendswithgzipopenstripsplitmaxappendadd_dataset)
r   r   r   r   r   parallel_sentencesfIncountline	sentencesr   r   r   	load_data<   s,   

z"ParallelSentencesDataset.load_datar6   c           	         s   i  |D ]?}|d ur|dkrt dd |D |krq|d }| vr't  |< |D ]	} | | q)|d urC|dkrCt |krC nqt dkrLd S |  jt fdd D 7  _t| j}| jt 	  | j
d | j|g|  d S )Nr   c                 S   r#   r   r$   r&   r   r   r   r)   p   r*   z8ParallelSentencesDataset.add_dataset.<locals>.<listcomp>c                    s   g | ]}t  | qS r   r$   r&   sentences_mapr   r   r)      s    )r3   setaddr%   r   sumr   r4   listitemsr   r   extend)	r   r6   r   r   r   r:   source_sentencer(   
dataset_idr   r<   r   r5   d   s*   
 
z$ParallelSentencesDataset.add_datasetc           	      C   s   g }g }| j D ]}| |\}}|| || q| |}t||D ]\}}|D ]}| jt|g|d q+q%t| j d S )N)textslabel)	r   
next_entryr4   get_embeddingszipr   r   randomshuffle)	r   source_sentences_listtarget_sentences_listdata_idxsrc_sentencetrg_sentencessrc_embeddingssrc_embeddingtrg_sentencer   r   r   generate_data   s   


z&ParallelSentencesDataset.generate_datac                 C   sd   | j | | j|  \}}| j|  d7  < | j| t| j | kr.d| j|< t| j |  ||fS )Nr+   r   )r   r   r%   rK   rL   )r   rO   sourcetarget_sentencesr   r   r   rH      s   
z#ParallelSentencesDataset.next_entryc                    s    j s jj| jdddS g }|D ]}| jvr|| qt|dkr? jj| jddd}t||D ]	\}}| j|< q5 fdd|D S )NFT)r
   show_progress_barconvert_to_numpyr   c                    s   g | ]} j | qS r   )r   r&   r   r   r   r)      s    z;ParallelSentencesDataset.get_embeddings.<locals>.<listcomp>)r   r	   encoder
   r   r4   r%   rJ   )r   r:   new_sentencesr(   new_embeddings	embeddingr   rZ   r   rI      s    



z'ParallelSentencesDataset.get_embeddingsc                 C   s   | j S )N)r   rZ   r   r   r   __len__   s   z ParallelSentencesDataset.__len__c                 C   s    t | jdkr|   | j S )Nr   )r%   r   rU   pop)r   idxr   r   r   __getitem__   s   
z$ParallelSentencesDataset.__getitem__)r   T)r   Nr   )__name__
__module____qualname____doc__r   intboolr   strr;   r   r5   rU   rH   rI   r_   rb   r   r   r   r   r      sT    

+

$
r   )r/   loggingrK   typingr   torch.utils.datar   sentence_transformersr   sentence_transformers.readersr   	getLoggerrc   r,   r   r   r   r   r   <module>   s    
