
    Chs.                       S SK Jr  S SKrS SKrS SKrS SKrS SKJr  S SKJ	r	   S SKJ
r
  S SKrS SKrS SKJr  S SKJr  S SKJr  S S	KJr  S S
KJr  S SKJr  \R8                  " \5      r " S S\5      rg! \ a	    S SKJ
r
   NXf = f)    )annotationsN)Path)Any)Self)	save_file)	Tokenizer)nn)PreTrainedTokenizerFast)InputModule)get_device_namec                  (  ^  \ rS rSr  S       SU 4S jjjrSS jrSS jr\SS j5       rSS jr	SS.SS	 jjr
\     S             SS
 jj5       r\        S                     SS jj5       r\SS j5       rSrU =r$ )StaticEmbedding   c                  > [         TU ]  5         [        U[        5      (       a  UR                  nO [        U[
        5      (       d  [        S5      eUbY  [        U[        R                  5      (       a  [        R                  " U5      n[        R                  R                  USS9U l        O9Ub+  [        R                  " UR                  5       U5      U l        O[        S5      eU R                  R                   U l        U R                  R"                  U l        Xl        U R$                  R'                  5         UR)                  SS5      U l        g)a$  
Initializes the StaticEmbedding model given a tokenizer. The model is a simple embedding bag model that
takes the mean of trained per-token embeddings to compute text embeddings.

Args:
    tokenizer (Tokenizer | PreTrainedTokenizerFast): The tokenizer to be used. Must be a fast tokenizer
        from ``transformers`` or ``tokenizers``.
    embedding_weights (np.ndarray | torch.Tensor | None, optional): Pre-trained embedding weights.
        Defaults to None.
    embedding_dim (int | None, optional): Dimension of the embeddings. Required if embedding_weights
        is not provided. Defaults to None.

.. tip::

    Due to the extremely efficient nature of this module architecture, the overhead for moving inputs to the
    GPU can be larger than the actual computation time. Therefore, consider using a CPU device for inference
    and training.

Example::

    from sentence_transformers import SentenceTransformer
    from sentence_transformers.models import StaticEmbedding
    from tokenizers import Tokenizer

    # Pre-distilled embeddings:
    static_embedding = StaticEmbedding.from_model2vec("minishlab/potion-base-8M")
    # or distill your own embeddings:
    static_embedding = StaticEmbedding.from_distillation("BAAI/bge-base-en-v1.5", device="cuda")
    # or start with randomized embeddings:
    tokenizer = Tokenizer.from_pretrained("FacebookAI/xlm-roberta-base")
    static_embedding = StaticEmbedding(tokenizer, embedding_dim=512)

    model = SentenceTransformer(modules=[static_embedding])

    embeddings = model.encode(["What are Pandas?", "The giant panda, also known as the panda bear or simply the panda, is a bear native to south central China."])
    similarity = model.similarity(embeddings[0], embeddings[1])
    # tensor([[0.8093]]) (If you use potion-base-8M)
    # tensor([[0.6234]]) (If you use the distillation method)
    # tensor([[-0.0693]]) (For example, if you use randomized embeddings)

Raises:
    ValueError: If the tokenizer is not a fast tokenizer.
    ValueError: If neither `embedding_weights` nor `embedding_dim` is provided.
zThe tokenizer must be fast (i.e. Rust-backed) to use this class. Use Tokenizer.from_pretrained() from `tokenizers` to load a fast tokenizer.NF)freezez?Either `embedding_weights` or `embedding_dim` must be provided.
base_model)super__init__
isinstancer
   
_tokenizerr   
ValueErrornpndarraytorch
from_numpyr	   EmbeddingBagfrom_pretrained	embeddingget_vocab_sizenum_embeddingsembedding_dim	tokenizer
no_paddinggetr   )selfr"   embedding_weightsr!   kwargs	__class__s        d/var/www/html/shao/venv/lib/python3.13/site-packages/sentence_transformers/models/StaticEmbedding.pyr   StaticEmbedding.__init__   s   f 	i!899!,,IIy11^ 
 (+RZZ88$)$4$45F$G!__<<=NW\<]DN&__Y-E-E-GWDN^__"nn;;!^^99$-!!# !**\48    c                   U R                   R                  USS9nU Vs/ sH  oDR                  PM     nn[        R                  " [
        R                  " S/US S  Vs/ sH  n[        U5      PM     sn-   5      5      n[        R                  " U VVs/ sH  of H  oPM     M     snn[        R                  S9n	XS.$ s  snf s  snf s  snnf )NF)add_special_tokensr   )dtype)	input_idsoffsets)
r"   encode_batchidsr   r   r   cumsumlentensorlong)
r%   textsr'   	encodingsencodingencodings_ids	token_idsr1   token_idr0   s
             r)   tokenizeStaticEmbedding.tokenizem   s    NN//%/P	6?@i(i@""299aSTabeceTf3gTfyC	NTf3g-g#hiLLM!dMyZch(Zc(M!dlqlvlvw	&;;	 A3g!ds   B?CC	
c                <    U R                  US   US   5      US'   U$ )Nr0   r1   sentence_embedding)r   )r%   featuresr'   s      r)   forwardStaticEmbedding.forwardu   s(    )-8MxXaOb)c%&r+   c                "    [         R                  $ N)mathinfr%   s    r)   max_seq_lengthStaticEmbedding.max_seq_lengthy   s    xxr+   c                    U R                   $ rF   )r!   rI   s    r)    get_sentence_embedding_dimension0StaticEmbedding.get_sentence_embedding_dimension}   s    !!!r+   T)safe_serializationc               j   U(       a9  [        U R                  5       [        R                  R	                  US5      5        OC[
        R                  " U R                  5       [        R                  R	                  US5      5        U R                  R                  [        [        U5      S-  5      5        g )Nzmodel.safetensorszpytorch_model.bintokenizer.json)
save_safetensors_file
state_dictospathjoinr   saver"   strr   )r%   output_pathrO   argsr'   s        r)   rW   StaticEmbedding.save   sm    !$//"3RWW\\+Ob5cdJJt("'',,{DW*XYC[ 14D DEFr+   c                    UUUUUS.nU R                   " U4SS0UD6n	[        R                  " U	5      n
U R                  " SSU0UD6n US   n[        XS9$ ! [         a    US   n Nf = f)	N)	subfoldertokencache_folderrevisionlocal_files_onlyfilenamerQ   model_name_or_pathzembedding.weight
embeddings)r&    )load_file_pathr   	from_fileload_torch_weightsKeyErrorr   )clsrc   r]   r^   r_   r`   ra   r'   
hub_kwargstokenizer_pathr"   weightss               r)   loadStaticEmbedding.load   s     #(  0

 ++,>hIYh]gh''7	((]<N]R\]	,01G yDD  	,l+G	,s   	A A)(A)c
           	         SSK Jn  [        R                  " U5      n[        UR                  R                  5       5      S1-
  nUUUUU	UUUS.U
En
[        U
R                  5       5      U-
  =n(       ab  [        R                  SSR                  [        [        U5      5       S35        U
R                  5        VVs0 sH  u  nnX;   d  M  UU_M     n
nn[        5       nU" U40 U
D6n[        UR                   ["        R$                  5      (       a!  [&        R(                  " UR                   5      nOUR                   R*                  nUR,                  nU " UUUS	9$ ! [         a    [        S5      ef = fs  snnf )
a  
Creates a StaticEmbedding instance from a distillation process using the `model2vec` package.

Args:
    model_name (str): The name of the model to distill.
    vocabulary (list[str] | None, optional): A list of vocabulary words to use. Defaults to None.
    device (str): The device to run the distillation on (e.g., 'cpu', 'cuda'). If not specified,
        the strongest device is automatically detected. Defaults to None.
    pca_dims (int | None, optional): The number of dimensions for PCA reduction. Defaults to 256.
    apply_zipf (bool): Whether to apply Zipf's law during distillation. Defaults to True.
    sif_coefficient (float | None, optional): The coefficient for SIF weighting. Defaults to 1e-4.
    token_remove_pattern (str | None, optional): A regex pattern to remove tokens from the vocabulary.
        Defaults to r"\[unused\d+\]".
    quantize_to (str): The data type to quantize the weights to. Defaults to 'float32'.
    use_subword (bool): Whether to use subword tokenization. Defaults to True.

Returns:
    StaticEmbedding: An instance of StaticEmbedding initialized with the distilled model's
        tokenizer and embedding weights.

Raises:
    ImportError: If the `model2vec` package is not installed.
r   )distillz\To use this method, please install the `model2vec` package: `pip install model2vec[distill]`
model_name)
vocabularydevicepca_dims
apply_zipfuse_subwordquantize_tosif_coefficienttoken_remove_patternz1Your version of `model2vec` does not support the z, zh arguments for the `distill` method. Consider updating `model2vec` to take advantage of these arguments.r&   r   )model2vec.distillrq   ImportErrorinspect	signatureset
parameterskeysloggerwarningrV   maprepritemsr   r   r   r   r   r   r   weightr"   )rj   rr   rs   rt   ru   rv   ry   rz   rx   rw   r'   rq   distill_signaturedistill_kwargs	leftoverskeyvaluestatic_modelr&   r"   s                       r)   from_distillation!StaticEmbedding.from_distillation   s|   L	1 $--g6.99>>@A\NR$ $&&.$8

 

 FKKM*^;;9;NNCDIIcRVXaNbDcCd eV V 4:<<>[>ZS%SEZjc5j>F[ "z4V4l,,bjj99 % 0 01G1G H , 6 6 = =+55	90AjYYC  	n 	. \s   E >E7E7E4c                J    SSK Jn  UR                  U5      n[	        UR
                  [        R                  5      (       a!  [        R                  " UR
                  5      nOUR
                  R                  nUR                  nU " XTUS9$ ! [         a    [        S5      ef = f)a  
Create a StaticEmbedding instance from a model2vec model. This method loads a pre-trained model2vec model
and extracts the embedding weights and tokenizer to create a StaticEmbedding instance.

Args:
    model_id_or_path (str): The identifier or path to the pre-trained model2vec model.

Returns:
    StaticEmbedding: An instance of StaticEmbedding initialized with the tokenizer and embedding weights
         the model2vec model.

Raises:
    ImportError: If the `model2vec` package is not installed.
r   )StaticModelzSTo use this method, please install the `model2vec` package: `pip install model2vec`r{   )	model2vecr   r}   r   r   r   r   r   r   r   r   r"   )rj   model_id_or_pathr   r   r&   r"   s         r)   from_model2vecStaticEmbedding.from_model2vec   s    "	u- #223CDl,,bjj99 % 0 01G1G H , 6 6 = =+55	9N^__  	ustt	us   B B")r   r   r!   r    r"   )NN)r"   z#Tokenizer | PreTrainedTokenizerFastr&   z np.ndarray | torch.Tensor | Noner!   
int | NonereturnNone)r8   z	list[str]r   dict[str, torch.Tensor])rB   r   r   r   )r   int)rY   rX   rO   boolr   r   ) NNNF)rc   rX   r]   rX   r^   zbool | str | Noner_   
str | Noner`   r   ra   r   r   r   )NN   Tg-C6?z\[unused\d+\]float32T)rr   rX   rs   zlist[str] | Nonert   r   ru   r   rv   r   ry   zfloat | Nonerz   r   rx   rX   rw   r   r'   r   r   r   )r   rX   r   r   )__name__
__module____qualname____firstlineno__r   r>   rC   propertyrJ   rM   rW   classmethodrn   r   r   __static_attributes____classcell__)r(   s   @r)   r   r      s    ?C$(	N96N9 <N9 "	N9 
N9 N9`<  " HL G  #'#'#!&EE E !	E
 !E E E 
E E8  (,!"(,+;$ HZHZ %HZ 	HZ
 HZ HZ &HZ )HZ HZ HZ HZ 
HZ HZT ` `r+   r   ) 
__future__r   r~   loggingrG   rT   pathlibr   typingr   r   r}   typing_extensionsnumpyr   r   safetensors.torchr   rR   
tokenizersr   r	   transformersr
   (sentence_transformers.models.InputModuler   sentence_transformers.utilr   	getLoggerr   r   r   re   r+   r)   <module>r      sp    "    	  '   @    0 @ 6			8	$p`k p`!  '&'s   A4 4BB