
    <h              	       h   S SK r S SKrS SKJr  S SKJrJrJr  S SKrS SK	J
r
  S SKrSSKJrJr  SSKJrJr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3  SSK4J5r5J6r6J7r7J8r8  SSK9J:r:  \'Rv                  " \<5      r= " S S\)\5      r> " S S\5      r? " S S\85      r@ " S S\55      rA " S S\
R                  5      rC " S S\-5      rD " S  S!\05      rE " S" S#\15      rF " S$ S%\+5      rG " S& S'\5      rHSrI " S( S)\/5      rJ " S* S+\.5      rK " S, S-\,5      rL " S. S/\
R                  5      rNS0\\R                     S1\\R                     S2\PS3\\   4S4 jrQ " S5 S6\75      rR " S7 S8\65      rS " S9 S:\J5      rT/ S;QrUg)<    N)Callable)AnyOptionalUnion   )CacheDynamicCache)PretrainedConfiglayer_type_validation)create_causal_maskcreate_masks_for_generate!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPast SequenceClassifierOutputWithPast)rope_config_validation)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging   )Gemma2Config)	Gemma2AttentionGemma2ForCausalLM	Gemma2MLPGemma2ModelGemma2PreTrainedModelGemma2RMSNormGemma2RotaryEmbeddingapply_rotary_pos_embeager_attention_forward)PaligemmaCausalLMOutputWithPast!PaliGemmaForConditionalGenerationPaliGemmaModelPaligemmaModelOutputWithPast)SiglipVisionConfigc                       \ rS rSrSrSr                          S	S jr\S 5       r\R                  S 5       rSr
g)
Gemma3TextConfig;   a  
This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Gemma3Text-7B.
e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
    vocab_size (`int`, *optional*, defaults to 262208):
        Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`Gemma3TextModel`]
    hidden_size (`int`, *optional*, defaults to 2304):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 9216):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 26):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*, defaults to 4):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `num_attention_heads`.
    head_dim (`int`, *optional*, defaults to 256):
        The attention head dimension.
    hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
        if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
    max_position_embeddings (`int`, *optional*, defaults to 131072):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 1):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 2):
        Beginning of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 1000000.0):
        The base period of the RoPE embeddings.
    attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    query_pre_attn_scalar (`float`, *optional*, defaults to 256):
        Scaling factor used on the attention scores
    sliding_window (`int`, *optional*, defaults to 4096):
        In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
    layer_types (`list`, *optional*):
        Attention pattern for each layer.
    final_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the logits.
    attn_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the attention scores.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
        and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    rope_local_base_freq (float, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings for local attention.

```python
>>> from transformers import Gemma3TextModel, Gemma3TextConfig
>>> # Initializing a Gemma3Text gemma3_text-7b style configuration
>>> configuration = Gemma3TextConfig()
>>> # Initializing a model from the gemma3_text-7b style configuration
>>> model = Gemma3TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
gemma3_textNc                    [         R                  " SUUUUS.UD6  Xl        Xl        X l        X0l        X@l        XPl        Xpl        X`l	        Xl
        Xl        Xl        UU l        UU l        UU l        Xl        UU l        UU l        UU l        UU l        UU l        UU l        UU l        [1        U 5        UR3                  SS5      U l        U R*                  cL  [7        U R                  5       Vs/ sH'  n[9        US-   U R4                  -  5      (       a  SOSPM)     snU l        [;        U R*                  5        g s  snf )N)pad_token_idbos_token_ideos_token_idtie_word_embeddingssliding_window_pattern      sliding_attentionfull_attention )r
   __init__
vocab_sizemax_position_embeddingshidden_sizeintermediate_sizenum_hidden_layersnum_attention_headshead_dimnum_key_value_headsinitializer_rangerms_norm_eps	use_cache
rope_thetaattention_biasattention_dropouthidden_activationquery_pre_attn_scalarsliding_windowfinal_logit_softcappingattn_logit_softcappinglayer_typesrope_local_base_freqrope_scalingr   get_sliding_window_patternrangeboolr   )selfr:   r<   r=   r>   r?   rA   r@   rH   r;   rB   rC   rD   r/   r1   r0   r2   rE   rF   rG   rI   rJ   rM   rK   rL   rO   rN   kwargsis                                a/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/gemma3/modular_gemma3.pyr9   Gemma3TextConfig.__init__   sO   < 	!! 	
%%% 3		

 	
 %'>$&!2!2#6  #6 !2("$,!2!2%:",'>$&<#&$8!(t$ (.zz2JA'N$# t556 6A (,QUd6R6R,R'S'S#Yii6 D 	d../	 s   0-D;c                 P    [         R                  " S[        5        U R                  $ )NzTThe `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.)warningswarnFutureWarningrQ   rT   s    rW   r3   'Gemma3TextConfig.sliding_window_pattern   s"    b	
 +++    c                     Xl         g N)rQ   rT   values     rW   r3   r^     s    ',$r_   )rQ   rF   rG   rL   rK   r@   rH   r<   rB   r=   rM   r;   r?   r>   rA   rI   rC   rN   rO   rE   rJ   rD   r:   )i@  i 	  i $              gelu_pytorch_tanhi   {Gz?ư>Tr   r5   r   Tg    .AF        rg   i   NNNNg     @)__name__
__module____qualname____firstlineno____doc__
model_typer9   propertyr3   setter__static_attributes__r8   r_   rW   r+   r+   ;   s    rh J - ' ! $#%7F0P , , ""- #-r_   r+   c                      ^  \ rS rSrSrSrSSSS.r\\S.r	       SS	\
\\\\\4   4      S
\
\\\\\4   4      S\S\S\S\S\4U 4S jjjrSrU =r$ )Gemma3Configi  a  
This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the PaliGemma-2B.

e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
        The config object of the text backbone.
    vision_config (`Union[AutoConfig, dict]`,  *optional*):
        Custom vision config or dict.
    mm_tokens_per_image (`int`, *optional*, defaults to 256):
        The number of tokens per image embedding.
    boi_token_index (`int`, *optional*, defaults to 255999):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_index (`int`, *optional*, defaults to 256000):
        The end-of-image token index to wrap the image prompt.
    image_token_index (`int`, *optional*, defaults to 262144):
        The image token index to encode the image prompt.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


Example:

```python
>>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

>>> # Initializing a Siglip-like vision config
>>> vision_config = SiglipVisionConfig()

>>> # Initializing a Gemma3 Text config
>>> text_config = Gemma3TextConfig()

>>> # Initializing a Gemma3 gemma-3-4b style configuration
>>> configuration = Gemma3Config(vision_config, text_config)

>>> # Initializing a model from the gemma-3-4b style configuration
>>> model = Gemma3TextConfig(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```gemma3image_token_indexboi_token_indexeoi_token_index)image_token_idboi_token_ideoi_token_id)text_configvision_configr~   r   mm_tokens_per_imagerB   c                   > Uc   [        5       n[        R                  S5        O [        U[        5      (       a  [        S0 UD6n[        U[        5      (       a  [        S0 UD6nO"Uc  [        5       n[        R                  S5        Xl        X l        X0l        X@l	        XPl
        X`l        Xpl        [        T	U ]8  " S0 UD6  g )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.r8   )r+   loggerinfo
isinstancedictr)   r~   r   r   ry   rz   rx   rB   superr9   )
rT   r~   r   r   ry   rz   rx   rB   rU   	__class__s
            rW   r9   Gemma3Config.__init__C  s     *,KKKZ[T***9[9KmT**.??M".0MKK`a&*#6 ..!2!2"6"r_   )ry   rz   rx   rB   r   r~   r   )NNrg   i i  i   ri   )rl   rm   rn   ro   rp   rq   attribute_mapr+   r)   sub_configsr   r   r   strr   intfloatr9   rt   __classcell__r   s   @rW   rv   rv     s    .` J-))M (+K JNMQ#&&&!(#'#e$4d38n$DEF#  &8$sCx.&H IJ# !	#
 # # # !# #r_   rv   c                       \ rS rSrSrg)Gemma3ModelOutputWithPastie  r8   Nrl   rm   rn   ro   rt   r8   r_   rW   r   r   e      r_   r   c                       \ rS rSrSrg)Gemma3CausalLMOutputWithPastii  r8   Nr   r8   r_   rW   r   r   i  r   r_   r   c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3TextScaledWordEmbeddingim  zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 p   > [         TU ]  XU5        U R                  S[        R                  " U5      SS9  g )Nr   F)
persistent)r   r9   register_buffertorchtensor)rT   r   r   r   r   r   s        rW   r9   &Gemma3TextScaledWordEmbedding.__init__r  s1    D]ELL,ERWXr_   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ ra   )r   forwardr   toweightdtype)rT   r   r   s     rW   r   %Gemma3TextScaledWordEmbedding.forwardv  s2    wy)D,<,<,?,?@Q@Q,RRRr_   r8   )g      ?)rl   rm   rn   ro   rp   r   r   r9   r   Tensorr   rt   r   r   s   @rW   r   r   m  sM    Ys Y3 YS Y_d Y YS S Sr_   r   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )	Gemma3MLPiz  configc                 $   > [         TU ]  U5        g ra   r   r9   rT   r   r   s     rW   r9   Gemma3MLP.__init__{       r_   r8   rl   rm   rn   ro   r+   r9   rt   r   r   s   @rW   r   r   z  s    !/ ! !r_   r   c                   8   ^  \ rS rSrSS\S\4U 4S jjjrSrU =r$ )Gemma3RMSNormi  dimepsc                 "   > [         TU ]  5         g ra   r   )rT   r   r   r   s      rW   r9   Gemma3RMSNorm.__init__  s    r_   r8   )rj   )	rl   rm   rn   ro   r   r   r9   rt   r   r   s   @rW   r   r     s    C e  r_   r   c                   4   ^  \ rS rSrSS\4U 4S jjjrSrU =r$ )Gemma3RotaryEmbeddingi  r   c                 $   > [         TU ]  U5        g ra   r   )rT   r   devicer   s      rW   r9   Gemma3RotaryEmbedding.__init__  r   r_   r8   ra   r   r   s   @rW   r   r     s    !/ ! !r_   r   c                   $  ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S\	\R                     S\	\
   S	\	\R                     S
\\   S\\R                  \	\R                     \	\\R                        4   4S jjrSrU =r$ )Gemma3Attentioni  r   	layer_idxc                 $  > UR                   U   S:H  U l        [        TU ]  5         U R                  (       a  UR                  OS U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l	        g )Nr6   )r   r   )
rM   
is_slidingr   r9   rJ   r   r@   rC   q_normk_normrT   r   r   r   s      rW   r9   Gemma3Attention.__init__  sl     ,,Y7;NN7;f33D#V=P=PQ#V=P=PQr_   hidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionrU   returnc                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nU R                  U	5      n	U R                  U
5      n
Uu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                   (       a  U R"                  OSU R$                  U R&                  S.UD6u  nnUR(                  " / UQSP76 R+                  5       nU R-                  U5      nUU4$ )Nr5   r   )sincosr   eagerrk   )dropoutscalingrJ   )shaper@   q_projview	transposek_projv_projr   r   r#   updater   r$   r   _attn_implementationr   trainingrG   r   rJ   reshape
contiguouso_proj)rT   r   r   r   r   r   rU   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsattention_interfaceattn_outputattn_weightss                     rW   r   Gemma3Attention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST{{<0[[,
&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
 /3mmD**LL..
%
 
%
!\ "));;;;FFHkk+.L((r_   )r   r   r   rJ   )NN)rl   rm   rn   ro   r+   r   r9   r   r   r   r   
LongTensorr   r   tupler   rt   r   r   s   @rW   r   r     s    R/ RC R +/59-)||-) #\\-) !.	-)
 !-) !!1!12-) -.-) 
u||Xell3XeELL>Q5RR	S-) -)r_   r   c                   h  ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\R                  S\R                  S\	\R                     S	\	\R                     S
\	\   S\	\   S\	\   S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )Gemma3DecoderLayeri  r   r   c                   > [         TU ]  5         Xl        UR                  U l        X l        UR
                  U   U l        [        XS9U l        [        U5      U l
        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        g )N)r   r   r   )r   r9   r   r<   r   rM   attention_typer   	self_attnr   mlpr   rC   input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormr   s      rW   r9   Gemma3DecoderLayer.__init__  s    !--"$00;(LV$,T-=-=6CVCVW(5d6F6FFL_L_(`%)6t7G7GVM`M`)a&*78H8HfNaNa*b'r_   r   position_embeddings_globalposition_embeddings_localr   position_idsr   output_attentionsrD   r   r   c
                 `   UnU R                  U5      nU R                  R                  (       a  UnOUnU R                  " SUUUUUUUU	S.U
D6u  pU R                  U5      nX-   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX-   nU4nU(       a  X4-  nU$ )N)r   r   r   r   r   r   rD   r   r8   )r   r   r   r   r   r   r   )rT   r   r   r   r   r   r   r   rD   r   rU   residualr   self_attn_weightsoutputss                  rW   r   Gemma3DecoderLayer.forward  s     !,,]; >>$$";"<+/>> 
,
' 3)%)/)
,
 
,
( 55mD 0 66}E/77F 0 "++Gr_   )
r   r   r<   r   r   r   r   r   r   r   )NNNFFN)rl   rm   rn   ro   r+   r   r9   r   r   r   r   r   rS   r   FloatTensorr   rt   r   r   s   @rW   r   r     s    c/ cC c$ 2637*.,1$)590||0 %*LL0 $)<<	0
 !.0 u//00 !0 $D>0 D>0 !!1!120 
u  (51B1BEDUDU1U+V"WW	X0 0r_   r   c                   &    \ rS rSrSr/ SQrS rSrg)Gemma3PreTrainedModeli   )r   SiglipVisionEmbeddingsSiglipEncoderLayer#SiglipMultiheadAttentionPoolingHeadc                     [         R                  " U5        [        U[        5      (       a%  UR                  R
                  R                  5         g g ra   )r    _init_weightsr   Gemma3MultiModalProjectormm_input_projection_weightdatazero_)rT   modules     rW   r	  #Gemma3PreTrainedModel._init_weights  s;    ++F3f788--2288: 9r_   r8   N)rl   rm   rn   ro   base_model_prefix_no_split_modulesr	  rt   r8   r_   rW   r  r    s    ;r_   r  c                   (  ^  \ rS rSr% \\S'   S\4U 4S jjr         SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S	\\   S
\\   S\\   S\\R                     S\\   S\4S jjrSrU =r$ )Gemma3TextModeli  r   c                 ,  > [         TU ]  U5        [        UR                  UR                  U R
                  U R                  R                  S-  S9U l        [        R                  " U5      nUR                  Ul        SS0Ul        [        US9U l        g )N      ?)r   	rope_typedefault)r   )r   r9   r   r:   r<   r   r   embed_tokenscopydeepcopyrN   rE   rO   r   rotary_emb_localr   s     rW   r9   Gemma3TextModel.__init__  s      :v1143C3CQUQ\Q\QhQhjmQm
 v&"77*I6 5V Dr_   r   r   r   past_key_valuesinputs_embedsrD   r   output_hidden_statesr   rU   r   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       a  Uc  U R                  (       d
  [        5       nU	cD  Ub  UR                  5       OSn[        R                  " UXR                  S   -   UR                  S9n	Uc  U	R!                  S5      n[#        U=n[$        5      (       d*  U R                   UUU	UUS.n['        S	0 UD6[)        S	0 UD6S.nUnU R+                  X5      nU R-                  X5      nU(       a  S	OS nU(       a  S	OS nU R.                  S U R                   R0                    HF  nU(       a  UU4-  nU" U4UUUUR2                     UUUUU	S
.U
D6nUS   nU(       d  M=  UUS   4-  nMH     U R5                  U5      nU(       a  UU4-  n[7        UUUUS9$ )N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r5   r   r   input_embedsr   r   r  r   r7   r6   r8   )r   r   r   r   r   r   rD   r   )last_hidden_stater  r   
attentions)r   r   r  rD   
ValueErrorgradient_checkpointingr   r   warning_oncer  r	   get_seq_lengthr   aranger   r   	unsqueezer   r   r   r   
rotary_embr  layersr>   r   normr   )rT   r   r   r   r  r  rD   r   r  r   rU   past_seen_tokenscausal_mask_mappingmask_kwargsr   r   r   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                        rW   r   Gemma3TextModel.forward)  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0*nO!CRC^==?de"\\  #6#6q#99$++N )33A6L ?-FF ++ -"0"0#2 ,K #5"C{"C%F%U%U# & &*__]%Q"$($9$9-$V! #7BD0d![[)H4;;+H+HIM#!m%55!)+E*C2=3O3OP)."3#- M *!,M  =#3"55) J, 		-0-!11&+++%	
 	
r_   )r  r  	NNNNNNNNN)rl   rm   rn   ro   r+   __annotations__r9   r   r   r   r   r   r  rS   r   r   r   r   rt   r   r   s   @rW   r  r    s   E/ E" 151537+/59$(,0/359i
E,,-i
 !.i
 u//0	i

 "%i
   1 12i
 D>i
 $D>i
 'tni
 !!1!12i
 +,i
 
!i
 i
r_   r  c                   @   ^  \ rS rSr% \\S'   SrS\4U 4S jjrSrU =r	$ )Gemma3ForCausalLMi  r   language_modelc                 D   > [         TU ]  U5        [        U5      U l        g ra   )r   r9   r  modelr   s     rW   r9   Gemma3ForCausalLM.__init__  s     $V,
r_   )r?  )
rl   rm   rn   ro   r+   r:  r  r9   rt   r   r   s   @rW   r<  r<    s!    (-/ - -r_   r<  c                   R   ^  \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )r
  i  r   c                   > [         TU ]  5         [        R                  " [        R
                  " UR                  R                  UR                  R                  5      5      U l	        [        UR                  R                  UR                  R                  S9U l        [        UR                  R                  UR                  R                  -  5      U l        [        UR"                  S-  5      U l        U R                   U R$                  -  U l        [        R(                  " U R&                  U R&                  S9U l        g )Nr   r  )kernel_sizestride)r   r9   nn	Parameterr   zerosr   r<   r~   r  r   layer_norm_epsmm_soft_emb_normr   
image_size
patch_sizepatches_per_imager   tokens_per_siderC  	AvgPool2davg_poolr   s     rW   r9   "Gemma3MultiModalProjector.__init__  s    *,,,KK,,88&:L:L:X:XY+
' !.  ,,&2F2F2U2U!
 "%V%9%9%D%DH\H\HgHg%g!h"6#=#=s#BC11T5I5II1A1A$JZJZ[r_   vision_outputsc                    UR                   u  p#nUR                  SS5      nUR                  X$U R                  U R                  5      nUR	                  5       nU R                  U5      nUR                  S5      nUR                  SS5      nU R                  U5      n[        R                  " XpR                  5      nUR                  U5      $ )Nr5   r   )r   r   r   rL  r   rO  flattenrI  r   matmulr  type_as)	rT   rQ  
batch_size_
seq_lengthreshaped_vision_outputspooled_vision_outputsnormed_vision_outputsprojected_vision_outputss	            rW   r   !Gemma3MultiModalProjector.forward  s    $2$8$8!
z"0":":1a"@"9"A"AD$:$:D<R<R#
 #:"D"D"F $.E F 5 = =a @ 5 ? ?1 E $ 5 56K L#(<<0EGfGf#g '//??r_   )rO  rC  r  rI  rL  rM  )rl   rm   rn   ro   rv   r9   r   r   r   rt   r   r   s   @rW   r
  r
    s)    \| \ @ell @ @r_   r
  token_type_idsimage_group_idstokens_per_imager   c           
      `   ^ ^ T c  gS[         S[         S[         S[         S[        4
UU 4S jjnU$ )z
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
N	batch_idxhead_idxq_idxkv_idxr   c                 D  > [         R                  " UT
R                  S   :  US5      nT
X4   n[         R                  " UT
R                  S   :  US5      nT	X4   n[         R                  " UT	R                  S   :  US5      nT
X4   S:H  US:H  -  nT	X4   U:H  nXx-  $ )Nr5   r   r   )r   wherer   )rb  rc  rd  re  safe_idxtoken_type_ids_at_kv_idximage_group_ids_at_kv_idxis_image_blocksame_image_blockr_  r^  s            rW   
inner_mask0token_type_ids_mask_function.<locals>.inner_mask  s     ;;v(<(<Q(??K#1)2E#F #(;;v8L8LQ8O/OQikl#m $3I4G$H!$)KK9N9Nq9Q0QSlnp$q!()9:a?D\`aDab*9+;<@YY 00r_   )r   rS   )r^  r_  r`  rm  s   ``  rW   token_type_ids_mask_functionro    sC     1c 1S 1 1c 1d 1 1" r_   c            !          \ rS rSrSrS\R                  S\R                  4S jrS r\	\
             SS\R                  S\R                  S	\\R                     S
\\R                     S\\\\R                     \4      S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       5       rSrg)Gemma3Modeli  Fpixel_valuesr   c                 Z    U R                  US9R                  nU R                  U5      nU$ )a]  
Projects the last hidden state from the vision model into language model space.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
)rr  )vision_towerr&  multi_modal_projector)rT   rr  rQ  image_featuress       rW   get_image_featuresGemma3Model.get_image_features  s3     ***EWW33NCr_   c                     [        S5      eNzWe don't want to inherit itAttributeErrorrT   super_kwargss     rW   _update_causal_maskGemma3Model._update_causal_mask      :;;r_   Nr   r   r   r  r^  r   r  labelsrD   r   r  return_dictc                 @   US L US L-  (       a  [        S5      eUb  UOU R                  R                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUbR  U R                  R
                  U R                  :  a.  XR                  R
                  :H  nUR                  5       nSUU'   OUnUc  U R                  5       " U5      nUcE  Ub  UR                  5       OSn[        R                  " UUUR                  S   -   UR                  S9nUbY  U R                  U5      nUR                  UR                  UR                   5      nU R#                  XUS9nUR%                  UU5      n['        U=n[(        5      (       Gd(  U R                  R+                  5       UUUUUS.nUb  UR                  S   S:w  a  US:H  R                  UR                  5      nU[,        R.                  R1                  USSS9S S 2S S	24   ) -  n[        R2                  " UR5                  5       SS
9S-
  n[        R6                  " UU[        R8                  " US	5      5      n[;        UR                  UR                  5      UU R                  R<                  5      US'   [?        S0 UD6[A        S0 UD6S.nU RB                  " SUUUUU
UUSUS.	UD6n[E        URF                  U
(       a  URH                  OS URJ                  URL                  Ub  WS9$ S S9$ )Nr!  r   r5   r"  )r  rv  r#  r5   r   rc   r   r   or_mask_functionr%  T)	r   r   r  r  rD   r   r  r  r   )r&  r  r   r'  image_hidden_statesr8   )'r(  r   r   r  use_return_dictr{   r:   cloneget_input_embeddingsr+  r   r,  r   r   rw  r   r   get_placeholder_maskmasked_scatterr   r   get_text_configrE  
functionalpadcumsumr   rg  	full_likero  r   r   r   r=  r   r&  r  r   r'  )rT   r   rr  r   r   r  r^  r   r  r  rD   r   r  r  	lm_kwargsspecial_image_maskllm_input_idsr1  rv  r2  r3  is_imagenew_image_startr_  r   s                            rW   r   Gemma3Model.forward  sI   & -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  T[[%?%?4??%R!*kk.H.H!H%OO-M01M,-%M  557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN
 #!44\BN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-FF ++557 -"0"0#2 ,K )m.A.A!.D.I
 +a/33N4I4IJ"*bmm.?.?&XY.?.Z[\^a_a^a[a.b-b"b"',,/B/B/D!"Lq"P"'++hYgikIl"m2N"%%n&;&;<ot{{OnOn3./ #5"C{"C%F%U%U#
 %% 
.%+'/!5)
 
 )%777@G33d!//))2>2J
 	

 QU
 	
r_   r8   )NNNNNNNNNNNNN)rl   rm   rn   ro   accepts_loss_kwargsr   r   rw  r  r   r   r   r  r   r   listr   rS   r   r   r   rt   r8   r_   rW   rq  rq    s   u||  <  '+*.1537KO595959-1$(,0/3&*e
##e
 ''e
 !.	e

 u//0e
 "%U->->(?(F"GHe
 !!1!12e
 !!1!12e
   1 12e
 ))*e
 D>e
 $D>e
 'tne
 d^e
  
u//	0!e
  e
r_   rq  c            "         ^  \ rS rSr\              SS\R                  S\R                  S\\R                     S\\R                     S\\
\\R                     \4      S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\
\\R                  4   S\
\\4   4S jj5       r          SU 4S jjrS r\ SS\S\R                  S\\R                     S\R                  S\\   S\\R                     S\\R                     S\4S jj5       rSrU =r$ )Gemma3ForConditionalGenerationib  r   rr  r   r   r  r^  r   r  r  rD   r   r  r  logits_to_keepr   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  " SUUUUUUUU
U	UUUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU	GbQ  UR                  5       nUSSS2SS24   nU	SSS24   nUb  USS2UR                  S   * S24   R                  UR                  5      nUUR                  UR                  5      S:g     R                  5       nUUR                  UR                  5      S:g     R                  5       nO UR                  5       nUR                  5       n[        R                  " 5       nUR!                  SU R                   R"                  R$                  5      nUR!                  S5      R                  UR                  5      nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ ['        UUUR(                  UR*                  UR,                  UR.                  S9$ )	a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenize=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
N)r   rr  r^  r   r   r  r  rD   r  r   r  r  r   r   .r   r5   )losslogitsr  r   r'  r  r8   )r   r   r  r  r?  r   r   slicelm_headr   r   r   r   r   rE  CrossEntropyLossr   r~   r:   r   r  r   r'  r  )rT   r   rr  r   r   r  r^  r   r  r  rD   r   r  r  r  r  r   r   slice_indicesr  r  shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelsoutputs                               rW   r   &Gemma3ForConditionalGeneration.forwardc  s~   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
%))%+'/!5#)
 
"  
8B>SV8W8W~ot4]kmA}a,?@A\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5DY,F'+'7D7V#CVC+#33!//)) ' ; ;
 	
r_   c                 V   > [         TU ]  " U4UUUUUU	U
US.UD6nUS   S:X  a  XmS'   U$ )N)r  r  r   r   r   rD   r  r^  r   rr  )r   prepare_inputs_for_generation)rT   r   r  r  r   r   rr  r   r^  rD   r  r  rU   model_inputsr   s                 rW   r  <Gemma3ForConditionalGeneration.prepare_inputs_for_generation  s[      w<
+')%)))
 
 !!+7(r_   c                     [        S5      erz  r{  r}  s     rW   5_prepare_4d_causal_attention_mask_with_cache_positionTGemma3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  r  r_   r   r$  c                    U R                  5       UUUUUS.nUb  UR                  S   S:w  a  US:H  R                  UR                  5      n	U	[        R
                  R                  U	SSS9S S 2S S24   ) -  n
[        R                  " U
R                  5       SS9S-
  n[        R                  " X[        R                  " US5      5      n[        UR                  UR                  5      XR                  5      US'   [        S	0 UD6$ )
Nr#  r5   r  r   r  r   r  r  r8   )r  r   r   r   rE  r  r  r   r  r   rg  r  ro  r   r   )r   r$  r   r   r  r   r^  rU   r3  r  r  r_  s               rW   r   8Gemma3ForConditionalGeneration.create_masks_for_generate	  s
    ,,.(,,.(
 %,*<*<Q*?1*D
 '!+//0E0EFH&"--*;*;HfTU*;*VWXZ][]Z]W]*^)^^O#ll?+>+>+@aH1LO#kk(U__UcegEhiO.J!!."7"78/KeKe/K*+ )7;77r_   r8   )NNNNNNNNNNNNNr   )
NNNNNNNTNNra   )rl   rm   rn   ro   r   r   r   r  r   r   r   r  r   rS   r   r   r   r   r  r  staticmethodr
   r   r   rt   r   r   s   @rW   r  r  b  s-    '+*.1537KO595959-1$(,0/3&*34|
##|
 ''|
 !.	|

 u//0|
 "%U->->(?(F"GH|
 !!1!12|
 !!1!12|
   1 12|
 ))*|
 D>|
 $D>|
 'tn|
 d^|
 c5<</0|
" 
u22	3#|
 |
B "H<  26!8 !8ll!8 !.!8 	!8
 "%!8 u||,!8 !.!8 
!8 !8r_   r  c                   V  ^  \ rS rSrU 4S jrS rS r\\         SS\	R                  S\\	R                     S\\	R                     S\\	R                     S	\\   S
\\	R                     S\\	R                     S\\	R                     S\\   S\\   S\4S jj5       5       rSrU =r$ )Gemma3ForSequenceClassificationi.  c                    > [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  R                  U R                  SS9U l	        U R                  5         g )NF)bias)r   r9   
num_labelsrq  r?  rE  Linearr~   r<   score	post_initr   s     rW   r9   (Gemma3ForSequenceClassification.__init__/  sZ      ++ (
YYv11==tUZ[
 	r_   c                 6    U R                   R                  5       $ ra   )r?  r  r]   s    rW   r  4Gemma3ForSequenceClassification.get_input_embeddings8  s    zz..00r_   c                 :    U R                   R                  U5        g ra   )r?  set_input_embeddingsrb   s     rW   r  4Gemma3ForSequenceClassification.set_input_embeddings;  s    

''.r_   r   rr  r   r   r  r  r^  r  rD   rU   r   c
                    U R                   " U4UUUUUUU	S.U
D6nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                  R
                  R                  c  US:w  a  [        S5      eU R                  R
                  R                  c  SnOUb  XR                  R
                  R                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                   R"                   S35        U[        R                  " XR                  S	9U4   nSnUb  U R%                  XUU R                  S
9n['        UUUR(                  UR*                  UR,                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
)r   rr  r   r  r  r^  rD   Nr   r5   z=Cannot handle batch sizes > 1 if no padding token is defined.r   )r   r   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r"  )r  r  pooled_logitsr   )r  r  r  r   r'  )r?  r&  r  r   r   r~   r/   r(  r   r   r   int32r,  argmaxr   r*  r   rl   loss_functionr   r  r   r'  )rT   r   rr  r   r   r  r  r^  r  rD   rU   transformer_outputsr   r  rV  last_non_pad_tokennon_pad_masktoken_indicesr  r  s                       rW   r   'Gemma3ForSequenceClassification.forward>  s   , #jj

)%%+')

 

 ,==M* "+J&,,Q/J;;""//7J!O\]];;""//7!#"%)@)@)M)MMQQRXR_R_afalalmL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaab%%VR_hlhshs%tD/ /??-;;*55
 	
r_   )r?  r  r  r9  )rl   rm   rn   ro   r9   r  r  r   r   r   r   r   r  r   r   rS   r   r   r   r   rt   r   r   s   @rW   r  r  .  s   1/  '+481537+/5959-1$(C
##C
 u001C
 !.	C

 u//0C
 "%C
   1 12C
 !!1!12C
 ))*C
 D>C
 +,C
 
*C
  C
r_   r  )rv   r+   r  r  r<  r  rq  r  )Vr  rZ   collections.abcr   typingr   r   r   r   torch.nnrE  torch.utils.checkpointcache_utilsr   r	   configuration_utilsr
   r   masking_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   gemma2.configuration_gemma2r   gemma2.modeling_gemma2r   r   r   r   r    r!   r"   r#   r$   paligemma.modeling_paligemmar%   r&   r'   r(   siglipr)   
get_loggerrl   r   r+   rv   r   r   	Embeddingr   r   r   r   r   r   GEMMA3_START_DOCSTRINGr  r  r<  Moduler
  r   r   ro  rq  r  r  __all__r8   r_   rW   <module>r     s      $ ' '    . J m m B 9 Y 9 5 & R R 6
 
 
  ( 
		H	%I-|%5 I-X[## [#|	 < 		#B 	
SBLL 
S!	 !
M 
!1 !7)o 7)t>3 >B  ;1 ;{
k {
|-) -!@		 !@HU\\*ell+  h	B|
. |
~I8%F I8XU
&; U
p	r_   