o
    sh`                     @   sp   d dl Z d dlZd dlZd dlmZmZmZ d dlZd dlmZm	Z	 ddl
mZ eeZG dd de	jZdS )    N)DictListLiteral)Tensornn   )WhitespaceTokenizerc                	       s   e Zd ZdZi ddfdee deeef dedef fdd	Z	d
eee
f fddZdee dee fddZdd Z	ddeee  dedeed ej
f fddZdd Zdd Zedd Z  ZS ) BoWzImplements a Bag-of-Words (BoW) model to derive sentence embeddings.

    A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab.
    r   Tvocabword_weightsunknown_word_weightcumulative_term_frequencyc                    s   t t|   tt|}g d| _|| _|| _|| _|| _	g | _
d}|D ]$}|}||v r2|| }n| |v r?||  }n|d7 }| j
| q%td|t|| t|t dd| _t|| _d S )N)r
   r   r   r   r   r   z>{} out of {} words without a weighting value. Set weight to {}F)
stop_wordsdo_lower_case)superr	   __init__listsetconfig_keysr
   r   r   r   weightslowerappendloggerinfoformatlenr   	tokenizersentence_embedding_dimension)selfr
   r   r   r   num_unknown_wordswordweight	__class__ ^/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/sentence_transformers/models/BoW.pyr      s0   


zBoW.__init__featuresc                 C   s   |S Nr$   )r   r&   r$   r$   r%   forward9   s   zBoW.forwardtextsreturnc                    s    fdd|D } |S )Nc                    s    g | ]}j j|fi  qS r$   )r   tokenize).0textkwargsr   r$   r%   
<listcomp>>   s     z BoW.tokenize.<locals>.<listcomp>)get_sentence_features)r   r)   r/   	tokenizedr$   r.   r%   r+   =   s   
zBoW.tokenizec                 C   s   | j S r'   )r   r   r$   r$   r%    get_sentence_embedding_dimensionA   s   z$BoW.get_sentence_embedding_dimensionr   tokenized_textspad_seq_lengthsentence_embeddingc                 C   sp   g }|D ],}t j|  t jd}|D ]}| jr#||  | j| 7  < q| j| ||< q|| qdt |iS )N)dtyper7   )torchzerosr4   float32r   r   r   stack)r   r5   r6   vectorstokensvectortokenr$   r$   r%   r1   D   s   zBoW.get_sentence_featuresc                    s    fdd j D S )Nc                    s   i | ]}| j | qS r$   )__dict__)r,   keyr3   r$   r%   
<dictcomp>U   s    z'BoW.get_config_dict.<locals>.<dictcomp>)r   r3   r$   r3   r%   get_config_dictT   s   zBoW.get_config_dictc                 C   sN   t tj|dd}tj|  |dd W d    d S 1 s w   Y  d S )Nconfig.jsonw   )indent)openospathjoinjsondumprD   )r   output_pathfOutr$   r$   r%   saveW   s   "zBoW.savec                 C   sJ   t tj| d}t|}W d    n1 sw   Y  tdi |S )NrE   r$   )rI   rJ   rK   rL   rM   loadr	   )
input_pathfInconfigr$   r$   r%   rR   [   s   zBoW.load)r   )__name__
__module____qualname____doc__r   strr   floatboolr   r   r(   intr+   r4   r   r9   r1   rD   rQ   staticmethodrR   __classcell__r$   r$   r"   r%   r	      s:    
%

r	   )rM   loggingrJ   typingr   r   r   r9   r   r   r   r   	getLoggerrV   r   Moduler	   r$   r$   r$   r%   <module>   s    
