
from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import PretrainedConfig, layer_type_validation
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ...utils.deprecation import deprecate_kwarg
from ..gemma.modeling_gemma import (
    GemmaAttention,
    GemmaForCausalLM,
    GemmaForSequenceClassification,
    GemmaForTokenClassification,
    GemmaMLP,
    GemmaModel,
    GemmaPreTrainedModel,
    GemmaRMSNorm,
    GemmaRotaryEmbedding,
    apply_rotary_pos_emb,
    repeat_kv,
)


logger = logging.get_logger(__name__)


class Gemma2Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate a Gemma2
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma2-7B.
    e.g. [google/gemma2-7b](https://huggingface.co/google/gemma2-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
    Args:
        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma2 model. Defines the number of different tokens that can be represented by the
            `input_ids` passed when calling [`Gemma2Model`].
        hidden_size (`int`, *optional*, defaults to 2304):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 9216):
            Dimension of the MLP representations.
        num_hidden_layers (`int`, *optional*, defaults to 26):
            Number of hidden layers in the Transformer decoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        num_key_value_heads (`int`, *optional*, defaults to 4):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by mean-pooling all the original heads within that group (see the sketch after this argument list). For more details, check out [this
            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
            `num_attention_heads`.
        head_dim (`int`, *optional*, defaults to 256):
            The attention head dimension.
        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
        max_position_embeddings (`int`, *optional*, defaults to 8192):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
        bos_token_id (`int`, *optional*, defaults to 2):
            Beginning of stream token id.
        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        query_pre_attn_scalar (`float`, *optional*, defaults to 256):
            Scaling factor used on the attention scores.
        sliding_window (`int`, *optional*, defaults to 4096):
            In Gemma2, every other layer uses sliding window attention. This is the size of the sliding window.
        layer_types (`list`, *optional*):
            Attention pattern for each layer.
        final_logit_softcapping (`float`, *optional*, defaults to 30.0):
            Scaling factor when applying tanh softcapping on the logits (the logits are divided by this value,
            passed through `tanh`, and multiplied by it again).
        attn_logit_softcapping (`float`, *optional*, defaults to 50.0):
            Scaling factor when applying tanh softcapping on the attention scores, using the same
            divide-tanh-multiply scheme as `final_logit_softcapping`.
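
    A minimal, hypothetical sketch of the mean-pooling conversion mentioned under `num_key_value_heads`;
    the tensor, its head-major layout, and the sizes below are illustrative assumptions, not values read
    from a real checkpoint:

    ```python
    >>> import torch
    >>> num_heads, num_kv_heads, head_dim, hidden_size = 8, 4, 256, 2304
    >>> # Hypothetical multi-head key projection weight of shape [num_heads * head_dim, hidden_size]
    >>> k_proj_weight = torch.randn(num_heads * head_dim, hidden_size)
    >>> grouped = k_proj_weight.view(num_kv_heads, num_heads // num_kv_heads, head_dim, hidden_size)
    >>> # Mean-pool the heads inside each group to obtain a GQA-sized projection
    >>> k_proj_gqa = grouped.mean(dim=1).reshape(num_kv_heads * head_dim, hidden_size)
    ```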

    ```python
    >>> from transformers import Gemma2Model, Gemma2Config
    >>> # Initializing a Gemma2 gemma2-7b style configuration
    >>> configuration = Gemma2Config()
    >>> # Initializing a model from the gemma2-7b style configuration
    >>> model = Gemma2Model(configuration)
    >>> # Accessing the model configuration
    >>> configuration = model.config
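    >>> # A hedged extra step (not part of the original example): shrinking the config and
    >>> # inspecting the alternating attention pattern described under `layer_types` above.
    >>> tiny_config = Gemma2Config(num_hidden_layers=4, sliding_window=1024)
    >>> tiny_config.layer_types
    ['sliding_attention', 'full_attention', 'sliding_attention', 'full_attention']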
    ```"""

    model_type = "gemma2"
    keys_to_ignore_at_inference = ["past_key_values"]
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=256000,
        hidden_size=2304,
        intermediate_size=9216,
        num_hidden_layers=26,
        num_attention_heads=8,
        num_key_value_heads=4,
        head_dim=256,
        hidden_activation="gelu_pytorch_tanh",
        max_position_embeddings=8192,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        eos_token_id=1,
        bos_token_id=2,
        tie_word_embeddings=True,
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
        query_pre_attn_scalar=256,
        sliding_window=4096,
        layer_types=None,
        final_logit_softcapping=30.0,
        attn_logit_softcapping=50.0,
        **kwargs,
    ):
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
        self.hidden_activation = hidden_activation
        self.query_pre_attn_scalar = query_pre_attn_scalar
        self.sliding_window = sliding_window
        self.final_logit_softcapping = final_logit_softcapping
        self.attn_logit_softcapping = attn_logit_softcapping
        self.layer_types = layer_types

        if self.layer_types is None:
            # Every other layer uses sliding-window attention, starting with layer 0.
            self.layer_types = [
                "sliding_attention" if bool((i + 1) % 2) else "full_attention" for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types, self.num_hidden_layers)


class Gemma2RMSNorm(GemmaRMSNorm):
    pass


class Gemma2MLP(GemmaMLP):
    def __init__(self, config):
        super().__init__(config)
        self.act_fn = ACT2FN[config.hidden_activation]


class Gemma2RotaryEmbedding(GemmaRotaryEmbedding):
    pass


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = module.head_dim**-0.5

    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class Gemma2Attention(GemmaAttention):
    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.attn_logit_softcapping = self.config.attn_logit_softcapping
        self.attention_dropout = self.config.attention_dropout
        self.is_causal = True
        self.scaling = config.query_pre_attn_scalar**-0.5
        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            softcap=self.attn_logit_softcapping,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Gemma2DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Gemma2Config, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.config = config
        self.attention_type = config.layer_types[layer_idx]
        self.self_attn = Gemma2Attention(config=config, layer_idx=layer_idx)
        self.mlp = Gemma2MLP(config)
        self.input_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


class Gemma2PreTrainedModel(GemmaPreTrainedModel):
    pass


class Gemma2Model(GemmaModel):
    def __init__(self, config: Gemma2Config):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [Gemma2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None and not self.training:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # `attention_mask` can already be a {layer_type: mask} mapping prepared by the caller
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        hidden_states = inputs_embeds

        # position embeddings shared across the decoder layers
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        # Gemma2 scales the embeddings by sqrt(hidden_size); the normalizer is created as a tensor in the
        # hidden states' dtype so the scaling matches the checkpoint's precision behavior.
        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=hidden_states.dtype)
        hidden_states = hidden_states * normalizer

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


class Gemma2ForCausalLM(GemmaForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        self.model = Gemma2Model(config)
        self.post_init()

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma2ForCausalLM

        >>> model = Gemma2ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute logits for the requested positions; `logits_to_keep` saves memory during generation.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if self.config.final_logit_softcapping is not None:
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class Gemma2ForSequenceClassification(GemmaForSequenceClassification):
    pass


class Gemma2ForTokenClassification(GemmaForTokenClassification):
    pass


__all__ = [
    "Gemma2Config",
    "Gemma2ForCausalLM",
    "Gemma2Model",
    "Gemma2PreTrainedModel",
    "Gemma2ForSequenceClassification",
    "Gemma2ForTokenClassification",
]