o
    sh                     @   sF   d dl Z d dlZd dlZddlmZ G dd dZG dd deZdS )    N   )InputExamplec                   @   s6   e Zd ZdZddddejdddfdd	Zdd
dZdS )STSDataReadera1  Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx)

    Default values expects a tab separated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1
    r   r      	T   c
           
      C   s:   || _ || _|| _|| _|| _|| _|| _|| _|	| _d S )N)	dataset_folderscore_col_idx
s1_col_idx
s2_col_idx	delimiterquotingnormalize_scores	min_score	max_score
selfr   r
   r   r	   r   r   r   r   r    r   i/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/sentence_transformers/readers/STSDataReader.py__init__   s   
zSTSDataReader.__init__c              	   C   s  t j| j|}|drtj|dddnt|dda}tj|| j	| j
d}g }t|D ]?\}}t|| j }	| jrE|	| j | j| j  }	|| j }
|| j }|t|t| |
|g|	d |dkrkt||krk n	q,W d	   |S W d	   |S 1 sw   Y  |S )
zJfilename specified which data split to use (train.csv, dev.csv, test.csv).z.gzrtutf8)encodingzutf-8)r   r   )guidtextslabelr   N)ospathjoinr   endswithgzipopencsvreaderr   r   	enumeratefloatr	   r   r   r   r
   r   appendr   strlen)r   filenamemax_examplesfilepathfIndataexamplesidrowscores1s2r   r   r   get_examples$   s2   

 

zSTSDataReader.get_examplesN)r   )__name__
__module____qualname____doc__r"   
QUOTE_NONEr   r4   r   r   r   r   r      s    
r   c                       s4   e Zd ZdZddddejdddf fdd		Z  ZS )
STSBenchmarkDataReaderzReader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4.
    Scores are normalized from 0...5 to 0...1
    r         r   Tr   c
           
         s"   t  j|||||||||	d	 d S )N)	r   r
   r   r	   r   r   r   r   r   )superr   r   	__class__r   r   r   @   s   
zSTSBenchmarkDataReader.__init__)r5   r6   r7   r8   r"   r9   r   __classcell__r   r   r>   r   r:   ;   s    r:   )r"   r    r    r   r   r:   r   r   r   r   <module>   s    3