o
    sh                     @   s   d dl Z d dlmZ d dlmZmZ d dlmZmZ d dl	Z	d dl
mZmZmZ d dlmZ e r7d dlmZ e eZG dd	 d	ZG d
d deeZG dd deeZG dd deeZG dd deeZG dd deeZdS )    N)defaultdict)
accumulatecycle)IteratorList)BatchSamplerConcatDatasetSubsetRandomSampler)is_datasets_available)Datasetc                       s4   e Zd ZdZd	 fddZdeddfddZ  ZS )
SetEpochMixinz
    Required for a BatchSampler as the Trainer will call set_epoch on the BatchSampler at the beginning of each epoch.
    The BatchSampler can then set the generator seed accordingly.
    returnNc                    s   t  j|i | d| _d S Nr   )super__init__epoch)selfargskwargs	__class__ [/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/sentence_transformers/sampler.pyr      s   
zSetEpochMixin.__init__r   c                 C   s
   || _ d S )N)r   )r   r   r   r   r   	set_epoch   s   
zSetEpochMixin.set_epoch)r   N)__name__
__module____qualname____doc__r   intr   __classcell__r   r   r   r   r      s    r   c                   @   s   e Zd ZdS )DefaultBatchSamplerN)r   r   r   r   r   r   r   r        s    r    c                       s^   e Zd Z			ddddededee dejd	ed
df fddZ	d
e
ee  fddZ  ZS )GroupByLabelBatchSamplerNr   datasetr   
batch_size	drop_lastvalid_label_columns	generatorseedr   c                    s   t  ||| || _|| _|| _|| _|| _| jd dkr"td|p%g D ]}||jv r3|d } n	q&td| d~t	t
}	t|D ]\}
}|	| |
 qE fdd|	 D | _d S )	N      zEThe batch size for `GroupByLabelBatchSampler` must be divisible by 2.labelz None of the valid_label_columns z are in the dataset.c                    s.   i | ]\}}t |d    r||d  qS )r(   Nlen).0r*   sample_indicesnum_samplesr   r   
<dictcomp>D   s    z5GroupByLabelBatchSampler.__init__.<locals>.<dictcomp>)r   r   r"   r#   r$   r&   r'   
ValueErrorcolumn_namesr   list	enumerateappenditemsgroups)r   r"   r#   r$   r%   r&   r'   column_namelabelsr8   
sample_idxr*   r   r/   r   r   %   s*   	

z!GroupByLabelBatchSampler.__init__c                 c   s    | j r| jr| j | j| j  t| j }g }tjt	| j| j dD ]-}|| }| j| }|
| t	|| jkrR|d | j V  || jd  }t	|| jks<q%| js]|r_|V  d S d S d S )Nr&   )r&   r'   manual_seedr   r4   r8   keystorchrandpermr,   extendr#   r$   )r   r:   partial_batch	label_idxr*   samplesr   r   r   __iter__J   s"   



z!GroupByLabelBatchSampler.__iter__)NNr   )r   r   r   r   boolr   strr?   	Generatorr   r   rE   r   r   r   r   r   r!   $   s(    %r!   c                       sl   e Zd Zg ddfdddededee dejd	ed
df fddZ	d
e
ee  fddZd
efddZ  ZS )NoDuplicatesBatchSamplerNr   r"   r   r#   r$   r%   r&   r'   r   c                    sX   t  ||| t|jt|dhB @  }r||}|| _|| _|| _|| _|| _	d S )Ndataset_name)
r   r   setr3   remove_columnsr"   r#   r$   r&   r'   )r   r"   r#   r$   r%   r&   r'   label_columnsr   r   r   r   ]   s   	

z!NoDuplicatesBatchSampler.__init__c                 c   s    | j r| jr| j | j| j  ttjt| j| j d	 }|r`t }g }|D ]&}t| j| 
 }||@ r9q)|| t|| jkrJ|V   n|| q)| jsV|V  |t|8 }|s"dS dS )a5  
        Iterate over the remaining non-yielded indices. For each index, check if the sample values are already in the
        batch. If not, add the sample values to the batch keep going until the batch is full. If the batch is full, yield
        the batch indices and continue with the next batch.
        r<   N)r&   r'   r=   r   rK   r?   r@   r,   r"   tolistvaluesr6   r#   updater$   )r   remaining_indicesbatch_valuesbatch_indicesindexsample_valuesr   r   r   rE   o   s(   
z!NoDuplicatesBatchSampler.__iter__c                 C   s0   | j rt| j| j S t| j| j d | j S )Nr)   )r$   r,   r"   r#   r   r   r   r   __len__   s   z NoDuplicatesBatchSampler.__len__)r   r   r   r   rF   r   rG   r?   rH   r   r   rE   rW   r   r   r   r   r   rI   \   s*     rI   c                
       \   e Zd Zdedee dejdeddf
 fddZ	de
ee  fd	d
ZdefddZ  ZS )RoundRobinBatchSamplerr"   batch_samplersr&   r'   r   Nc                    8   t  ||d j|d j || _|| _|| _|| _d S r   r   r   r#   r$   r"   rZ   r&   r'   r   r"   rZ   r&   r'   r   r   r   r      
   
zRoundRobinBatchSampler.__init__c              	   #   s    | j | j| j  dd | jjD }dgtt| }dd | jD }t	t
t|D ]!}||  z fddt|| D V  W q- tyN   Y  d S w d S )Nc                 S      g | ]}t |qS r   r+   r-   r"   r   r   r   
<listcomp>       z3RoundRobinBatchSampler.__iter__.<locals>.<listcomp>r   c                 S   r_   r   iterr-   samplerr   r   r   ra      rb   c                       g | ]}|  qS r   r   r-   idxsample_offsetr   r   ra      rb   )r&   r=   r'   r   r"   datasetsr4   r   rZ   r   ranger,   nextStopIteration)r   r0   sample_offsetsrZ   dataset_idxr   rj   r   rE      s    zRoundRobinBatchSampler.__iter__c                 C   s   t dd | jD t| j S )Nc                 S   r_   r   r+   re   r   r   r   ra      rb   z2RoundRobinBatchSampler.__len__.<locals>.<listcomp>)minrZ   r,   rV   r   r   r   rW      s   zRoundRobinBatchSampler.__len__r   r   r   r   r   r   r?   rH   r   r   r   rE   rW   r   r   r   r   r   rY      s    rY   c                
       rX   )ProportionalBatchSamplerr"   rZ   r&   r'   r   Nc                    r[   r   r\   r]   r   r   r   r      r^   z!ProportionalBatchSampler.__init__c                 #   s    | j | j| j  dd | jjD }dgtt| }dd | jD }dd t	|D }t
|| j d}dd | jD }|D ]}||   fddt|| D V  q?d S )	Nc                 S   r_   r   r+   r`   r   r   r   ra      rb   z5ProportionalBatchSampler.__iter__.<locals>.<listcomp>r   c                 S   r_   r   r+   re   r   r   r   ra      rb   c                 S   s"   g | ]\}}t |D ]}|q
qS r   )rm   )r-   ri   length_r   r   r   ra      s   " r<   c                 S   r_   r   rc   re   r   r   r   ra      rb   c                    rg   r   r   rh   rj   r   r   ra      rb   )r&   r=   r'   r   r"   rl   r4   r   rZ   r5   r	   rn   )r   r0   rp   num_batchesdataset_indicesdataset_idx_samplerrZ   rq   r   rj   r   rE      s   z!ProportionalBatchSampler.__iter__c                 C   s   t dd | jD S )Nc                 S   r_   r   r+   re   r   r   r   ra      rb   z4ProportionalBatchSampler.__len__.<locals>.<listcomp>)sumrZ   rV   r   r   r   rW      s   z ProportionalBatchSampler.__len__rs   r   r   r   r   rt      s    rt   )loggingcollectionsr   	itertoolsr   r   typingr   r   r?   torch.utils.datar   r   r	   sentence_transformers.utilr
   rl   r   	getLoggerr   loggerr   r    r!   rI   rY   rt   r   r   r   r   <module>   s     
8:"