o
    sh                     @   sP   d dl mZ d dlZd dlmZ d dlmZmZ d dl	m
Z
 G dd deZdS )    )ListN)Dataset)NLTK_IMPORT_ERRORis_nltk_available)InputExamplec                   @   sH   e Zd ZdZdd fdee fddZdd Zd	d
 Ze	dddZ
dS )DenoisingAutoEncoderDataseta  
    The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence]
    It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the
    sentence without noise.

    Args:
        sentences: A list of sentences
        noise_fn: A noise function: Given a string, it returns a string
            with noise, e.g. deleted words
    c                 C   s
   t | S N)r   delete)s r   x/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/sentence_transformers/datasets/DenoisingAutoEncoderDataset.py<lambda>   s   
 z$DenoisingAutoEncoderDataset.<lambda>	sentencesc                 C   s(   t  stt| jj|| _|| _d S r   )r   ImportErrorr   format	__class____name__r   noise_fn)selfr   r   r   r   r   __init__   s   
z$DenoisingAutoEncoderDataset.__init__c                 C   s   | j | }t| ||gdS )N)texts)r   r   r   )r   itemsentr   r   r   __getitem__   s   
z'DenoisingAutoEncoderDataset.__getitem__c                 C   s
   t | jS r   )lenr   )r   r   r   r   __len__!   s   
z#DenoisingAutoEncoderDataset.__len__333333?c                 C   sr   ddl m}m} || }t|}|dkr| S tj||k}t|dkr,d|tj|< | 	t
|| }|S )Nr   )TreebankWordDetokenizerword_tokenizeT)nltkr   r   r   nprandomrandsumchoice
detokenizearray)text	del_ratior   r   wordsnkeep_or_notwords_processedr   r   r   r	   %   s   z"DenoisingAutoEncoderDataset.deleteN)r   )r   
__module____qualname____doc__r   strr   r   r   staticmethodr	   r   r   r   r   r   
   s    r   )typingr   numpyr    torch.utils.datar   transformers.utils.import_utilsr   r   *sentence_transformers.readers.InputExampler   r   r   r   r   r   <module>   s    