ó
    <±hÍÌ  ã            	       óh  • S SK r S SKrS SKJr  S SKJrJrJr  S SKrS SK	J
r
  S SKrSSKJrJr  SSKJrJr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJr  SSKJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2J3r3  SSK4J5r5J6r6J7r7J8r8  SSK9J:r:  \'Rv                  " \<5      r= " S S\)\5      r> " S S\5      r? " S S\85      r@ " S S\55      rA " S S\
R„                  5      rC " S S\-5      rD " S  S!\05      rE " S" S#\15      rF " S$ S%\+5      rG " S& S'\5      rHSrI " S( S)\/5      rJ " S* S+\.5      rK " S, S-\,5      rL " S. S/\
Rš                  5      rNS0\\Rž                     S1\\Rž                     S2\PS3\\   4S4 jrQ " S5 S6\75      rR " S7 S8\65      rS " S9 S:\J5      rT/ S;QrUg)<é    N)ÚCallable)ÚAnyÚOptionalÚUnioné   )ÚCacheÚDynamicCache)ÚPretrainedConfigÚlayer_type_validation)Úcreate_causal_maskÚcreate_masks_for_generateÚ!create_sliding_window_causal_mask)ÚFlashAttentionKwargs)ÚGradientCheckpointingLayer)ÚBaseModelOutputWithPastÚ SequenceClassifierOutputWithPast)Úrope_config_validation)ÚALL_ATTENTION_FUNCTIONS)ÚUnpack)ÚTransformersKwargsÚauto_docstringÚcan_return_tupleÚloggingé   )ÚGemma2Config)	ÚGemma2AttentionÚGemma2ForCausalLMÚ	Gemma2MLPÚGemma2ModelÚGemma2PreTrainedModelÚGemma2RMSNormÚGemma2RotaryEmbeddingÚapply_rotary_pos_embÚeager_attention_forward)ÚPaligemmaCausalLMOutputWithPastÚ!PaliGemmaForConditionalGenerationÚPaliGemmaModelÚPaligemmaModelOutputWithPast)ÚSiglipVisionConfigc                   óŽ   • \ rS rSrSrSr                          S	S jr\S 5       r\R                  S 5       rSr
g)
ÚGemma3TextConfigé;   aŠ  
This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the Gemma3Text-7B.
e.g. [google/gemma3_text-7b](https://huggingface.co/google/gemma3_text-7b)
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
    vocab_size (`int`, *optional*, defaults to 262208):
        Vocabulary size of the Gemma3Text model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`Gemma3TextModel`]
    hidden_size (`int`, *optional*, defaults to 2304):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 9216):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 26):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*, defaults to 4):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
        `num_attention_heads`.
    head_dim (`int`, *optional*, defaults to 256):
        The attention head dimension.
    hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
        if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
    max_position_embeddings (`int`, *optional*, defaults to 131072):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*, defaults to 0):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 1):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 2):
        Beginning of stream token id.
    tie_word_embeddings (`bool`, *optional*, defaults to `True`):
        Whether to tie weight embeddings
    rope_theta (`float`, *optional*, defaults to 1000000.0):
        The base period of the RoPE embeddings.
    attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    query_pre_attn_scalar (`float`, *optional*, defaults to 256):
        Scaling factor used on the attention scores
    sliding_window (`int`, *optional*, defaults to 4096):
        In Gemma3Text, every other layer uses sliding window attention. This is the size of the sliding window.
    layer_types (`list`, *optional*):
        Attention pattern for each layer.
    final_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the logits.
    attn_logit_softcapping (`float`, *optional*):
        Scaling factor when applying tanh softcapping on the attention scores.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings used in global attention. NOTE: if you apply new rope type
        and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`list[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    rope_local_base_freq (float, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings for local attention.

```python
>>> from transformers import Gemma3TextModel, Gemma3TextConfig
>>> # Initializing a Gemma3Text gemma3_text-7b style configuration
>>> configuration = Gemma3TextConfig()
>>> # Initializing a model from the gemma3_text-7b style configuration
>>> model = Gemma3TextModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```
Úgemma3_textNc                 ó€  • [         R                  " SUUUUS.UD6  Xl        Xl        X l        X0l        X@l        XPl        Xpl        X`l	        X l
        X°l        XÀl        UU l        UU l        UU l        X€l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        [1        U 5        UR3                  SS5      U l        U R*                  cL  [7        U R                  5       Vs/ sH'  n[9        US-   U R4                  -  5      (       a  SOSPM)     snU l        [;        U R*                  5        g s  snf )N)Úpad_token_idÚbos_token_idÚeos_token_idÚtie_word_embeddingsÚsliding_window_patterné   é   Úsliding_attentionÚfull_attention© )r
   Ú__init__Ú
vocab_sizeÚmax_position_embeddingsÚhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚnum_attention_headsÚhead_dimÚnum_key_value_headsÚinitializer_rangeÚrms_norm_epsÚ	use_cacheÚ
rope_thetaÚattention_biasÚattention_dropoutÚhidden_activationÚquery_pre_attn_scalarÚsliding_windowÚfinal_logit_softcappingÚattn_logit_softcappingÚlayer_typesÚrope_local_base_freqÚrope_scalingr   ÚgetÚ_sliding_window_patternÚrangeÚboolr   )Úselfr:   r<   r=   r>   r?   rA   r@   rH   r;   rB   rC   rD   r/   r1   r0   r2   rE   rF   rG   rI   rJ   rM   rK   rL   rO   rN   ÚkwargsÚis                                Úa/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/gemma3/modular_gemma3.pyr9   ÚGemma3TextConfig.__init__²   sO  € ô< 	×!Ò!ð 	
Ø%Ø%Ø%Ø 3ñ		
ð
 ò	
ð %ŒØ'>Ô$Ø&ÔØ!2ÔØ!2ÔØ#6Ô Ø ŒØ#6Ô Ø!2ÔØ(ÔØ"ŒØ$ˆŒØ,ˆÔØ!2ˆÔØ!2ÔØ%:ˆÔ"Ø,ˆÔØ'>ˆÔ$Ø&<ˆÔ#Ø&ˆÔà$8ˆÔ!Ø(ˆÔÜ˜tÔ$ð (.§z¡zÐ2JÈAÓ'NˆÔ$à×ÑÑ#ô ˜t×5Ñ5Ô6ó á6Aô (,¨Q°©U°d×6RÑ6RÑ,R×'SÑ'SÑ#ÐYiÒiÙ6ñ ˆDÔô 	˜d×.Ñ.Õ/ùò	 s   Ã0-D;c                 óP   • [         R                  " S[        5        U R                  $ )NzTThe `sliding_window_pattern` attribute is deprecated and will be removed in v4.55.0.)ÚwarningsÚwarnÚFutureWarningrQ   ©rT   s    rW   r3   Ú'Gemma3TextConfig.sliding_window_patternú   s"   € äŠØbÜô	
ð ×+Ñ+Ð+ó    c                 ó   • Xl         g ©N)rQ   ©rT   Úvalues     rW   r3   r^     s   € à',Õ$r_   )rQ   rF   rG   rL   rK   r@   rH   r<   rB   r=   rM   r;   r?   r>   rA   rI   rC   rN   rO   rE   rJ   rD   r:   )i@  i 	  i $  é   é   é   é   Úgelu_pytorch_tanhi   ç{®Gáz”?çíµ ÷Æ°>Tr   r5   r   Tg    €„.AFç        rg   i   NNNNg     ˆÃ@)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__Ú
model_typer9   Úpropertyr3   ÚsetterÚ__static_attributes__r8   r_   rW   r+   r+   ;   s•   † ñrðh €Jð ØØØØØØØ-Ø 'ØØØØØØØ ØØØØ!ØØØ $Ø#ØØ%ô7F0ðP ñ,ó ð,ð ×"Ñ"ñ-ó #ó-r_   r+   c                   ó¬   ^ • \ rS rSrSrSrSSSS.r\\S.r	       SS	\
\\\\\4   4      S
\
\\\\\4   4      S\S\S\S\S\4U 4S jjjrSrU =r$ )ÚGemma3Configi  a  
This is the configuration class to store the configuration of a [`Gemma3ForConditionalGeneration`]. It is used to instantiate an
Gemma3ForConditionalGeneration according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the PaliGemma-2B.

e.g. [google/gemma-3-4b](https://huggingface.co/google/gemma-3-4b)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`Union[Gemma3TextConfig, dict]`, *optional*):
        The config object of the text backbone.
    vision_config (`Union[AutoConfig, dict]`,  *optional*):
        Custom vision config or dict.
    mm_tokens_per_image (`int`, *optional*, defaults to 256):
        The number of tokens per image embedding.
    boi_token_index (`int`, *optional*, defaults to 255999):
        The begin-of-image token index to wrap the image prompt.
    eoi_token_index (`int`, *optional*, defaults to 256000):
        The end-of-image token index to wrap the image prompt.
    image_token_index (`int`, *optional*, defaults to 262144):
        The image token index to encode the image prompt.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.


Example:

```python
>>> from transformers import Gemma3ForConditionalGeneration, Gemma3Config, SiglipVisionConfig, Gemma3TextConfig

>>> # Initializing a Siglip-like vision config
>>> vision_config = SiglipVisionConfig()

>>> # Initializing a Gemma3 Text config
>>> text_config = Gemma3TextConfig()

>>> # Initializing a Gemma3 gemma-3-4b style configuration
>>> configuration = Gemma3Config(vision_config, text_config)

>>> # Initializing a model from the gemma-3-4b style configuration
>>> model = Gemma3TextConfig(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```Úgemma3Úimage_token_indexÚboi_token_indexÚeoi_token_index)Úimage_token_idÚboi_token_idÚeoi_token_id)Útext_configÚvision_configr~   r   Úmm_tokens_per_imagerB   c                 ó†  >• Uc   [        5       n[        R                  S5        O [        U[        5      (       a  [        S0 UD6n[        U[        5      (       a  [        S0 UD6nO"Uc  [        5       n[        R                  S5        Xl        X l        X0l        X@l	        XPl
        X`l        Xpl        [        T	U ]8  " S0 UD6  g )Nz@text_config is None, using default Gemma3TextConfig text config.zFvision_config is None, using default SiglipVisionConfig vision config.r8   )r+   ÚloggerÚinfoÚ
isinstanceÚdictr)   r~   r   r€   ry   rz   rx   rB   Úsuperr9   )
rT   r~   r   r€   ry   rz   rx   rB   rU   Ú	__class__s
            €rW   r9   ÚGemma3Config.__init__C  s©   ø€ ð ÑÜ*Ó,ˆKÜK‰KÐZÕ[Ü˜¤T×*Ñ*Ü*Ñ9¨[Ñ9ˆKäm¤T×*Ñ*Ü.Ñ?°Ñ?‰MØÑ"Ü.Ó0ˆMÜK‰KÐ`Ôaà&ÔØ*ÔØ#6Ô Ø.ÔØ.ÔØ!2ÔØ!2Ôä‰ÒÑ"˜6Ó"r_   )ry   rz   rx   rB   r€   r~   r   )NNrg   iÿç i è i   ri   )rl   rm   rn   ro   rp   rq   Úattribute_mapr+   r)   Úsub_configsr   r   r…   Ústrr   ÚintÚfloatr9   rt   Ú__classcell__©r‡   s   @rW   rv   rv     sÆ   ø† ñ.ð` €Jà-Ø)Ø)ñ€Mð (Ø+ñ€Kð JNØMQØ#&Ø&Ø&Ø!(Ø#'ñ#à˜eÐ$4°d¸3À¸8±nÐ$DÑEÑFð#ð   Ð&8¸$¸sÀC¸x¹.Ð&HÑ IÑJð#ð !ð	#ð
 ð#ð ð#ð ð#ð !÷#ö #r_   rv   c                   ó   • \ rS rSrSrg)ÚGemma3ModelOutputWithPastie  r8   N©rl   rm   rn   ro   rt   r8   r_   rW   r‘   r‘   e  ó   † Úr_   r‘   c                   ó   • \ rS rSrSrg)ÚGemma3CausalLMOutputWithPastii  r8   Nr’   r8   r_   rW   r•   r•   i  r“   r_   r•   c            	       ól   ^ • \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )ÚGemma3TextScaledWordEmbeddingim  zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
Únum_embeddingsÚembedding_dimÚpadding_idxÚembed_scalec                 óp   >• [         TU ]  XU5        U R                  S[        R                  " U5      SS9  g )Nr›   F)Ú
persistent)r†   r9   Úregister_bufferÚtorchÚtensor)rT   r˜   r™   rš   r›   r‡   s        €rW   r9   Ú&Gemma3TextScaledWordEmbedding.__init__r  s1   ø€ Ü‰Ñ˜¸ÔDØ×Ñ˜]¬E¯LªL¸Ó,EÐRWÐÒXr_   Ú	input_idsc                 ó‚   >• [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ ra   )r†   Úforwardr›   ÚtoÚweightÚdtype)rT   r¢   r‡   s     €rW   r¤   Ú%Gemma3TextScaledWordEmbedding.forwardv  s2   ø€ Ü‰w‰˜yÓ)¨D×,<Ñ,<×,?Ñ,?ÀÇÁ×@QÑ@QÓ,RÑRÐRr_   r8   )g      ð?)rl   rm   rn   ro   rp   rŒ   r   r9   rŸ   ÚTensorr¤   rt   rŽ   r   s   @rW   r—   r—   m  sM   ø† ññY sð Y¸3ð YÈSð YÐ_d÷ Yð YðS §¡÷ Sõ Sr_   r—   c                   ó0   ^ • \ rS rSrS\4U 4S jjrSrU =r$ )Ú	Gemma3MLPiz  Úconfigc                 ó$   >• [         TU ]  U5        g ra   ©r†   r9   ©rT   r¬   r‡   s     €rW   r9   ÚGemma3MLP.__init__{  ó   ø€ Ü‰Ñ˜Õ r_   r8   ©rl   rm   rn   ro   r+   r9   rt   rŽ   r   s   @rW   r«   r«   z  s   ø† ð!Ð/÷ !õ !r_   r«   c                   ó8   ^ • \ rS rSrSS\S\4U 4S jjjrSrU =r$ )ÚGemma3RMSNormi  ÚdimÚepsc                 ó"   >• [         TU ]  5         g ra   r®   )rT   rµ   r¶   r‡   s      €rW   r9   ÚGemma3RMSNorm.__init__€  s   ø€ Ü‰ÑÕr_   r8   )rj   )	rl   rm   rn   ro   rŒ   r   r9   rt   rŽ   r   s   @rW   r´   r´     s   ø† ñ˜Cð  e÷ ö r_   r´   c                   ó4   ^ • \ rS rSrSS\4U 4S jjjrSrU =r$ )ÚGemma3RotaryEmbeddingi„  r¬   c                 ó$   >• [         TU ]  U5        g ra   r®   )rT   r¬   Údevicer‡   s      €rW   r9   ÚGemma3RotaryEmbedding.__init__…  r±   r_   r8   ra   r²   r   s   @rW   rº   rº   „  s   ø† ñ!Ð/÷ !ö !r_   rº   c                   ó$  ^ • \ rS rSrS\S\4U 4S jjr  SS\R                  S\R                  S\	\R                     S\	\
   S	\	\R                     S
\\   S\\R                  \	\R                     \	\\R                        4   4S jjrSrU =r$ )ÚGemma3AttentioniŠ  r¬   Ú	layer_idxc                 ó$  >• UR                   U   S:H  U l        [        TU ]  5         U R                  (       a  UR                  OS U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l	        g )Nr6   )rµ   r¶   )
rM   Ú
is_slidingr†   r9   rJ   r´   r@   rC   Úq_normÚk_norm©rT   r¬   rÀ   r‡   s      €rW   r9   ÚGemma3Attention.__init__‹  sl   ø€ Ø ×,Ñ,¨YÑ7Ð;NÑNˆŒä‰ÑÔØ7;··˜f×3Ò3ÈDˆÔä#¨¯©¸V×=PÑ=PÑQˆŒÜ#¨¯©¸V×=PÑ=PÑQˆr_   Úhidden_statesÚposition_embeddingsÚattention_maskÚpast_key_valueÚcache_positionrU   Úreturnc                 óŽ  • UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nU R                  U	5      n	U R                  U
5      n
Uu  pÍ[        XšXÍ5      u  pšUb$  XÜUS.nUR                  X«U R                  U5      u  p«[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                   (       a  U R"                  OSU R$                  U R&                  S.UD6u  nnUR(                  " / UQSP76 R+                  5       nU R-                  U5      nUU4$ )Néÿÿÿÿr5   r   )ÚsinÚcosrË   Úeagerrk   )ÚdropoutÚscalingrJ   )Úshaper@   Úq_projÚviewÚ	transposeÚk_projÚv_projrÃ   rÄ   r#   ÚupdaterÀ   r$   r¬   Ú_attn_implementationr   ÚtrainingrG   rÓ   rJ   ÚreshapeÚ
contiguousÚo_proj)rT   rÇ   rÈ   rÉ   rÊ   rË   rU   Úinput_shapeÚhidden_shapeÚquery_statesÚ
key_statesÚvalue_statesrÐ   rÏ   Úcache_kwargsÚattention_interfaceÚattn_outputÚattn_weightss                     rW   r¤   ÚGemma3Attention.forward”  s¾  € ð $×)Ñ)¨#¨2Ð.ˆØ8˜Ð8 bÐ8¨$¯-©-Ñ8ˆà—{‘{ =Ó1×6Ñ6°|ÓD×NÑNÈqÐRSÓTˆØ—[‘[ Ó/×4Ñ4°\ÓB×LÑLÈQÐPQÓRˆ
Ø—{‘{ =Ó1×6Ñ6°|ÓD×NÑNÈqÐRSÓTˆà—{‘{ <Ó0ˆØ—[‘[ Ó,ˆ
à&‰ˆÜ#7¸ÐRUÓ#[Ñ ˆàÑ%à#&ÀnÑUˆLØ'5×'<Ñ'<¸ZÐW[×WeÑWeÐgsÓ'tÑ$ˆJä(?ÐØ;‰;×+Ñ+¨wÓ6Ü"9¸$¿+¹+×:ZÑ:ZÑ"[Ðá$7ØØØØØð
%
ð /3¯m¯mD×*Ò*ÀØ—L‘LØ×.Ñ.ñ
%
ð ñ
%
Ñ!ˆ\ð "×)Ò)Ð;¨;Ð;¸Ò;×FÑFÓHˆØ—k‘k +Ó.ˆØ˜LÐ(Ð(r_   )rÂ   rÄ   rÃ   rJ   )NN)rl   rm   rn   ro   r+   rŒ   r9   rŸ   r©   r   r   Ú
LongTensorr   r   Útupler¤   rt   rŽ   r   s   @rW   r¿   r¿   Š  s½   ø† ðRÐ/ð R¸C÷ Rð +/Ø59ñ-)à—|‘|ð-)ð #Ÿ\™\ð-)ð ! §¡Ñ.ð	-)ð
 ! ™ð-)ð ! ×!1Ñ!1Ñ2ð-)ð Ð-Ñ.ð-)ð 
ˆu|‰|˜X e§l¡lÑ3°X¸eÀEÇLÁLÑ>QÑ5RÐRÑ	S÷-)ó -)r_   r¿   c                   óh  ^ • \ rS rSrS\S\4U 4S jjr      SS\R                  S\R                  S\R                  S\	\R                     S	\	\R                     S
\	\   S\	\   S\	\   S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )ÚGemma3DecoderLayeriÄ  r¬   rÀ   c                 óØ  >• [         TU ]  5         Xl        UR                  U l        X l        UR
                  U   U l        [        XS9U l        [        U5      U l
        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        g )N)r¬   rÀ   ©r¶   )r†   r9   r¬   r<   rÀ   rM   Úattention_typer¿   Ú	self_attnr«   Úmlpr´   rC   Úinput_layernormÚpost_attention_layernormÚpre_feedforward_layernormÚpost_feedforward_layernormrÅ   s      €rW   r9   ÚGemma3DecoderLayer.__init__Å  s½   ø€ Ü‰ÑÔØŒØ!×-Ñ-ˆÔØ"ŒØ$×0Ñ0°Ñ;ˆÔÜ(°ÑLˆŒÜ˜VÓ$ˆŒÜ,¨T×-=Ñ-=À6×CVÑCVÑWˆÔÜ(5°d×6FÑ6FÈF×L_ÑL_Ñ(`ˆÔ%Ü)6°t×7GÑ7GÈV×M`ÑM`Ñ)aˆÔ&Ü*7¸×8HÑ8HÈf×NaÑNaÑ*bˆÕ'r_   rÇ   Úposition_embeddings_globalÚposition_embeddings_localrÉ   Úposition_idsrÊ   Úoutput_attentionsrD   rË   rÌ   c
                 ó`  • UnU R                  U5      nU R                  R                  (       a  UnOUnU R                  " SUUUUUUUU	S.U
D6u  pU R                  U5      nX±-   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX±-   nU4nU(       a  Xí4-  nU$ )N)rÇ   rÈ   rÉ   rú   rÊ   rû   rD   rË   r8   )ró   rñ   rÂ   rô   rõ   rò   rö   )rT   rÇ   rø   rù   rÉ   rú   rÊ   rû   rD   rË   rU   ÚresidualrÈ   Úself_attn_weightsÚoutputss                  rW   r¤   ÚGemma3DecoderLayer.forwardÒ  sÜ   € ð !ˆà×,Ñ,¨]Ó;ˆð >‰>×$×$Ø";Ñà"<Ðà+/¯>ª>ð 
,
Ø'Ø 3Ø)Ø%Ø)Ø/ØØ)ñ
,
ð ñ
,
Ñ(ˆð ×5Ñ5°mÓDˆØ Ñ0ˆà ˆØ×6Ñ6°}ÓEˆØŸ™ Ó/ˆØ×7Ñ7¸ÓFˆØ Ñ0ˆà Ð"ˆæØÐ+Ñ+ˆGàˆr_   )
rð   r¬   r<   ró   rÀ   rò   rô   rö   rõ   rñ   )NNNFFN)rl   rm   rn   ro   r+   rŒ   r9   rŸ   r©   r   rê   r   rS   rë   ÚFloatTensorr¤   rt   rŽ   r   s   @rW   rí   rí   Ä  sü   ø† ðcÐ/ð c¸C÷ cð$ 26Ø37Ø*.Ø,1Ø$)Ø59ñ0à—|‘|ð0ð %*§L¡Lð0ð $)§<¡<ð	0ð
 ! §¡Ñ.ð0ð ˜u×/Ñ/Ñ0ð0ð ! ™ð0ð $ D™>ð0ð ˜D‘>ð0ð ! ×!1Ñ!1Ñ2ð0ð 
ˆu× Ñ  (¨5°×1BÑ1BÀE×DUÑDUÐ1UÑ+VÑ"WÐWÑ	X÷0ó 0r_   rí   c                   ó&   • \ rS rSrSr/ SQrS rSrg)ÚGemma3PreTrainedModeli  Ú )rí   ÚSiglipVisionEmbeddingsÚSiglipEncoderLayerÚ#SiglipMultiheadAttentionPoolingHeadc                 ó¤   • [         R                  " U5        [        U[        5      (       a%  UR                  R
                  R                  5         g g ra   )r    Ú_init_weightsr„   ÚGemma3MultiModalProjectorÚmm_input_projection_weightÚdataÚzero_)rT   Úmodules     rW   r	  Ú#Gemma3PreTrainedModel._init_weights  s;   € Ü×+Ò+¨FÔ3ÜfÔ7×8Ñ8Ø×-Ñ-×2Ñ2×8Ñ8Õ:ð 9r_   r8   N)rl   rm   rn   ro   Úbase_model_prefixÚ_no_split_modulesr	  rt   r8   r_   rW   r  r    s   † ØÐòÐõ;r_   r  c                   ó(  ^ • \ rS rSr% \\S'   S\4U 4S jjr         SS\\R                     S\\R                     S\\R                     S\\   S\\R                     S	\\   S
\\   S\\   S\\R                     S\\   S\4S jjrSrU =r$ )ÚGemma3TextModeli  r¬   c                 ó,  >• [         TU ]  U5        [        UR                  UR                  U R
                  U R                  R                  S-  S9U l        [        R                  " U5      nUR                  Ul        SS0Ul        [        US9U l        g )Nç      à?)r›   Ú	rope_typeÚdefault)r¬   )r†   r9   r—   r:   r<   rš   r¬   Úembed_tokensÚcopyÚdeepcopyrN   rE   rO   rº   Úrotary_emb_localr¯   s     €rW   r9   ÚGemma3TextModel.__init__  s…   ø€ Ü‰Ñ˜Ô ô :Ø×Ñ˜v×1Ñ1°4×3CÑ3CÐQU×Q\ÑQ\×QhÑQhÐjmÑQmñ
ˆÔô —’˜vÓ&ˆØ"×7Ñ7ˆÔØ*¨IÐ6ˆÔÜ 5¸VÑ DˆÕr_   r¢   rÉ   rú   Úpast_key_valuesÚinputs_embedsrD   rû   Úoutput_hidden_statesrË   rU   rÌ   c
                 ó¢  • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nU(       a  Uc  U R                  (       d
  [        5       nU	cD  Ub  UR                  5       OSn[        R                  " UXµR                  S   -   UR                  S9n	Uc  U	R!                  S5      n[#        U=n[$        5      (       d*  U R                   UUU	UUS.n['        S	0 UD6[)        S	0 UD6S.nUnU R+                  Xã5      nU R-                  Xã5      nU(       a  S	OS nU(       a  S	OS nU R.                  S U R                   R0                    HF  nU(       a  UU4-  nU" U4UUUUR2                     UUUUU	S
.U
D6nUS   nU(       d  M=  UUS   4-  nMH     U R5                  U5      nU(       a  UU4-  n[7        UUUUS9$ )Nú:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r5   ©r¼   ©r¬   Úinput_embedsrÉ   rË   r  rú   ©r7   r6   r8   )rø   rù   rÉ   rú   rÊ   rû   rD   rË   )Úlast_hidden_stater  rÇ   Ú
attentions)r¬   rû   r  rD   Ú
ValueErrorÚgradient_checkpointingrÜ   r‚   Úwarning_oncer  r	   Úget_seq_lengthrŸ   ÚarangerÔ   r¼   Ú	unsqueezer„   r…   r   r   Ú
rotary_embr  Úlayersr>   rð   Únormr   )rT   r¢   rÉ   rú   r  r  rD   rû   r  rË   rU   Úpast_seen_tokensÚcausal_mask_mappingÚmask_kwargsrÇ   rø   rù   Úall_hidden_statesÚall_self_attnsÚdecoder_layerÚlayer_outputss                        rW   r¤   ÚGemma3TextModel.forward)  s…  € ð 2CÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð "+Ñ!6‘I¸D¿K¹K×<QÑ<Qˆ	à˜Ð -°tÐ";×<ÜÐYÓZÐZà×&×&¨4¯=¯=¾YÜ×ÑØjôð ˆIàÑ Ø ×-Ñ-¨iÓ8ˆMæ˜Ñ0¸¿¿Ü*›nˆOàÑ!ØCRÑC^˜×=Ñ=Ô?ÐdeÐÜ"Ÿ\š\Ø Ø ×#6Ñ#6°qÑ#9Ñ9Ø$×+Ñ+ñˆNð ÑØ)×3Ñ3°AÓ6ˆLô °Ð?Ð-Ä×FÑFð Ÿ+™+Ø -Ø"0Ø"0Ø#2Ø ,ñˆKô #5Ñ"C°{Ñ"CÜ%FÑ%UÈÑ%Uñ#Ðð &ˆð &*§_¡_°]Ó%QÐ"Ø$(×$9Ñ$9¸-Ó$VÐ!ö #7™B¸DÐÞ0™°dˆà!Ÿ[™[Ð)H¨4¯;©;×+HÑ+HÓIˆMÞ#Ø! mÐ%5Ñ5Ð!á)Øðà+EØ*CØ2°=×3OÑ3OÑPØ)Ø.Ø"3Ø#Ø-ñð ñˆMð *¨!Ñ,ˆMç Ð Ø =°Ñ#3Ð"5Ñ5’ñ) Jð, Ÿ	™	 -Ó0ˆæØ -Ð!1Ñ1Ðä&Ø+Ø+Ø+Ø%ñ	
ð 	
r_   )r  r  ©	NNNNNNNNN)rl   rm   rn   ro   r+   Ú__annotations__r9   r   rŸ   rê   r©   r   r  rS   r   r   r   r¤   rt   rŽ   r   s   @rW   r  r    s  ø‡ ØÓðEÐ/÷ Eð" 15Ø15Ø37Ø+/Ø59Ø$(Ø,0Ø/3Ø59ñi
à˜E×,Ñ,Ñ-ði
ð ! §¡Ñ.ði
ð ˜u×/Ñ/Ñ0ð	i
ð
 " %™ði
ð   × 1Ñ 1Ñ2ði
ð ˜D‘>ði
ð $ D™>ði
ð ' t™nði
ð ! ×!1Ñ!1Ñ2ði
ð Ð+Ñ,ði
ð 
!÷i
ó i
r_   r  c                   ó@   ^ • \ rS rSr% \\S'   SrS\4U 4S jjrSrU =r	$ )ÚGemma3ForCausalLMi•  r¬   Úlanguage_modelc                 óD   >• [         TU ]  U5        [        U5      U l        g ra   )r†   r9   r  Úmodelr¯   s     €rW   r9   ÚGemma3ForCausalLM.__init__™  s   ø€ Ü‰Ñ˜Ô Ü$ VÓ,ˆ
r_   )r?  )
rl   rm   rn   ro   r+   r:  r  r9   rt   rŽ   r   s   @rW   r<  r<  •  s!   ø‡ ØÓØ(Ðð-Ð/÷ -õ -r_   r<  c                   óR   ^ • \ rS rSrS\4U 4S jjrS\R                  4S jrSr	U =r
$ )r
  iž  r¬   c                 óˆ  >• [         TU ]  5         [        R                  " [        R
                  " UR                  R                  UR                  R                  5      5      U l	        [        UR                  R                  UR                  R                  S9U l        [        UR                  R                  UR                  R                  -  5      U l        [        UR"                  S-  5      U l        U R                   U R$                  -  U l        [        R(                  " U R&                  U R&                  S9U l        g )Nrï   r  )Úkernel_sizeÚstride)r†   r9   ÚnnÚ	ParameterrŸ   Úzerosr   r<   r~   r  r´   Úlayer_norm_epsÚmm_soft_emb_normrŒ   Ú
image_sizeÚ
patch_sizeÚpatches_per_imager€   Útokens_per_siderC  Ú	AvgPool2dÚavg_poolr¯   s     €rW   r9   Ú"Gemma3MultiModalProjector.__init__Ÿ  sì   ø€ Ü‰ÑÔä*,¯,ª,ÜKŠK˜×,Ñ,×8Ñ8¸&×:LÑ:L×:XÑ:XÓYó+
ˆÔ'ô !.Ø× Ñ ×,Ñ,°&×2FÑ2F×2UÑ2Uñ!
ˆÔô "% V×%9Ñ%9×%DÑ%DÈ×H\ÑH\×HgÑHgÑ%gÓ!hˆÔÜ" 6×#=Ñ#=¸sÑ#BÓCˆÔØ×1Ñ1°T×5IÑ5IÑIˆÔÜŸš°×1AÑ1AÈ$×JZÑJZÑ[ˆr_   Úvision_outputsc                 óž  • UR                   u  p#nUR                  SS5      nUR                  X$U R                  U R                  5      nUR	                  5       nU R                  U5      nUR                  S5      nUR                  SS5      nU R                  U5      n[        R                  " XpR                  5      nUR                  U5      $ )Nr5   r   )rÔ   r×   rÝ   rL  rÞ   rO  ÚflattenrI  rŸ   Úmatmulr  Útype_as)	rT   rQ  Ú
batch_sizeÚ_Ú
seq_lengthÚreshaped_vision_outputsÚpooled_vision_outputsÚnormed_vision_outputsÚprojected_vision_outputss	            rW   r¤   Ú!Gemma3MultiModalProjector.forward¯  sÇ   € Ø$2×$8Ñ$8Ñ!ˆ
zà"0×":Ñ":¸1¸aÓ"@ÐØ"9×"AÑ"AØ D×$:Ñ$:¸D×<RÑ<Ró#
Ðð #:×"DÑ"DÓ"FÐà $§¡Ð.EÓ FÐØ 5× =Ñ =¸aÓ @ÐØ 5× ?Ñ ?ÀÀ1Ó EÐà $× 5Ñ 5Ð6KÓ LÐä#(§<¢<Ð0E×GfÑGfÓ#gÐ Ø'×/Ñ/°Ó?Ð?r_   )rO  rC  r  rI  rL  rM  )rl   rm   rn   ro   rv   r9   rŸ   r©   r¤   rt   rŽ   r   s   @rW   r
  r
  ž  s)   ø† ð\˜|÷ \ð @ e§l¡l÷ @ò @r_   r
  Útoken_type_idsÚimage_group_idsÚtokens_per_imagerÌ   c           
      ó`   ^ ^• T c  gS[         S[         S[         S[         S[        4
UU 4S jjnU$ )z‰
This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
not start and end indices.
NÚ	batch_idxÚhead_idxÚq_idxÚkv_idxrÌ   c                 óD  >• [         R                  " UT
R                  S   :  US5      nT
X4   n[         R                  " UT
R                  S   :  US5      nT	X4   n[         R                  " UT	R                  S   :  US5      nT
X4   S:H  US:H  -  nT	X4   U:H  nXx-  $ )Nr5   r   rÎ   )rŸ   ÚwhererÔ   )rb  rc  rd  re  Úsafe_idxÚtoken_type_ids_at_kv_idxÚimage_group_ids_at_kv_idxÚis_image_blockÚsame_image_blockr_  r^  s            €€rW   Ú
inner_maskÚ0token_type_ids_mask_function.<locals>.inner_maskÏ  sÊ   ø€ ô —;’;˜v¨×(<Ñ(<¸QÑ(?Ñ?ÀÈÓKˆØ#1°)Ð2EÑ#FÐ Ü#(§;¢;¨v¸×8LÑ8LÈQÑ8OÑ/OÐQiÐklÓ#mÐ à$3°IÐ4GÑ$HÐ!Ü$)§K¢K°¸×9NÑ9NÈqÑ9QÑ0QÐSlÐnpÓ$qÐ!à(¨Ð)9Ñ:¸aÑ?ÐD\Ð`aÑDaÑbˆØ*¨9Ð+;Ñ<Ð@YÑYÐð Ñ0Ð0r_   )rŒ   rS   )r^  r_  r`  rm  s   ``  rW   Útoken_type_ids_mask_functionro  Â  sC   ù€ ð ÑØð1œcð 1¬Sð 1¼ð 1Äcð 1Ìd÷ 1ð 1ð" Ðr_   c            !       óÜ  • \ rS rSrSrS\R                  S\R                  4S jrS r\	\
             SS\R                  S\R                  S	\\R                     S
\\R                     S\\\\R                     \4      S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\\4   4S jj5       5       rSrg)ÚGemma3Modeliã  FÚpixel_valuesrÌ   c                 óZ   • U R                  US9R                  nU R                  U5      nU$ )a]  
Projects the last hidden state from the vision model into language model space.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
)rr  )Úvision_towerr&  Úmulti_modal_projector)rT   rr  rQ  Úimage_featuress       rW   Úget_image_featuresÚGemma3Model.get_image_featuresç  s3   € ð ×*Ñ*¸Ð*ÐE×WÑWˆØ×3Ñ3°NÓCˆØÐr_   c                 ó   • [        S5      e©NzWe don't want to inherit it©ÚAttributeError©rT   Úsuper_kwargss     rW   Ú_update_causal_maskÚGemma3Model._update_causal_maskõ  ó   € ÜÐ:Ó;Ð;r_   Nr¢   rÉ   rú   r  r^  rË   r  ÚlabelsrD   rû   r  Úreturn_dictc                 ó@  • US L US L-  (       a  [        S5      eUb  UOU R                  R                  nUb  UOU R                  R                  nUb  UOU R                  R                  nUbR  U R                  R
                  U R                  :¼  a.  XR                  R
                  :H  nUR                  5       nSUU'   OUnUc  U R                  5       " U5      nUcE  Ub  UR                  5       OSn[        R                  " UUUR                  S   -   UR                  S9nUbY  U R                  U5      nUR                  UR                  UR                   5      nU R#                  XUS9nUR%                  UU5      n['        U=n[(        5      (       Gd(  U R                  R+                  5       UUUUUS.nUbí  UR                  S   S:w  aÚ  US:H  R                  UR                  5      nU[,        R.                  R1                  USSS9S S 2S S	24   ) -  n[        R2                  " UR5                  5       SS
9S-
  n[        R6                  " UU[        R8                  " US	5      5      n[;        UR                  UR                  5      UU R                  R<                  5      US'   [?        S0 UD6[A        S0 UD6S.nU RB                  " SUUUUU
UUSUS.	UD6n[E        URF                  U
(       a  URH                  OS URJ                  URL                  Ub  WS9$ S S9$ )Nr!  r   r5   r"  )r  rv  r#  ©r5   r   ©rc   rÎ   ©rµ   Úor_mask_functionr%  T)	rÉ   rú   r  r  rD   rû   r  rƒ  rË   )r&  r  rÇ   r'  Úimage_hidden_statesr8   )'r(  r¬   rû   r  Úuse_return_dictr{   r:   ÚcloneÚget_input_embeddingsr+  rŸ   r,  rÔ   r¼   rw  r¥   r§   Úget_placeholder_maskÚmasked_scatterr„   r…   Úget_text_configrE  Ú
functionalÚpadÚcumsumrŒ   rg  Ú	full_likero  r€   r   r   r=  r‘   r&  r  rÇ   r'  )rT   r¢   rr  rÉ   rú   r  r^  rË   r  r‚  rD   rû   r  rƒ  Ú	lm_kwargsÚspecial_image_maskÚllm_input_idsr1  rv  r2  r3  Úis_imageÚnew_image_startr_  rÿ   s                            rW   r¤   ÚGemma3Model.forwardø  sI  € ð& ˜Ð -°tÐ";×<ÜÐYÓZÐZà1BÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆð Ñ  T§[¡[×%?Ñ%?À4Ç?Á?Ó%RØ!*¯k©k×.HÑ.HÑ!HÐØ%ŸO™OÓ-ˆMØ01ˆMÐ,Ò-à%ˆMàÑ Ø ×5Ñ5Ô7¸ÓFˆMàÑ!ØCRÑC^˜×=Ñ=Ô?ÐdeÐÜ"Ÿ\š\Ø Ð"2°]×5HÑ5HÈÑ5KÑ"KÐTa×ThÑThñˆNð
 Ñ#Ø!×4Ñ4°\ÓBˆNØ+×.Ñ.¨}×/CÑ/CÀ]×EXÑEXÓYˆNØ!%×!:Ñ!:ØÀ~ð ";ð "Ðð *×8Ñ8Ð9KÈ^Ó\ˆMô °Ð?Ð-Ä×FÒFð Ÿ+™+×5Ñ5Ó7Ø -Ø"0Ø"0Ø#2Ø ,ñˆKð Ñ)¨m×.AÑ.AÀ!Ñ.DÈÓ.Ið
 +¨aÑ/×3Ñ3°N×4IÑ4IÓJØ"*¬b¯m©m×.?Ñ.?ÀÈ&ÐXYÐ.?Ð.ZÒ[\Ð^aÐ_aÐ^aÐ[aÑ.bÐ-bÑ"bÜ"'§,¢,¨×/BÑ/BÓ/DÈ!Ñ"LÈqÑ"PÜ"'§+¢+¨h¸ÌÏÊÐYgÐikÓIlÓ"mÜ2NØ"×%Ñ% n×&;Ñ&;Ó<¸oÈtÏ{É{×OnÑOnó3Ð.Ñ/ô #5Ñ"C°{Ñ"CÜ%FÑ%UÈÑ%Uñ#Ðð
 ×%Ò%ð 
Ø.Ø%Ø+Ø'ØØ/Ø!5ØØ)ñ
ð ñ
ˆô )Ø%×7Ñ7Þ7@˜G×3Ò3ÀdØ!×/Ñ/Ø×)Ñ)Ø2>Ñ2J ñ
ð 	
ð
 QUñ
ð 	
r_   r8   )NNNNNNNNNNNNN)rl   rm   rn   ro   Úaccepts_loss_kwargsrŸ   r©   rw  r  r   r   rê   r  r   r   Úlistr   rS   rë   r‘   r¤   rt   r8   r_   rW   rq  rq  ã  sˆ  † àÐð¨u¯|©|ð ÀÇÁô ò<ð Øð '+Ø*.Ø15Ø37ØKOØ59Ø59Ø59Ø-1Ø$(Ø,0Ø/3Ø&*ñe
à×#Ñ#ðe
ð ×'Ñ'ðe
ð ! §¡Ñ.ð	e
ð
 ˜u×/Ñ/Ñ0ðe
ð " %¨¨U×->Ñ->Ñ(?ÀÐ(FÑ"GÑHðe
ð ! ×!1Ñ!1Ñ2ðe
ð ! ×!1Ñ!1Ñ2ðe
ð   × 1Ñ 1Ñ2ðe
ð ˜×)Ñ)Ñ*ðe
ð ˜D‘>ðe
ð $ D™>ðe
ð ' t™nðe
ð ˜d‘^ðe
ð  
ˆuÐ/Ð/Ñ	0ô!e
ó ó óe
r_   rq  c            "       óš  ^ • \ rS rSr\              SS\R                  S\R                  S\\R                     S\\R                     S\\
\\R                     \4      S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\   S\\   S\
\\R                  4   S\
\\4   4S jj5       r          SU 4S jjrS r\ SS\S\R                  S\\R                     S\R                  S\\   S\\R                     S\\R                     S\4S jj5       rSrU =r$ )ÚGemma3ForConditionalGenerationib  r¢   rr  rÉ   rú   r  r^  rË   r  r‚  rD   rû   r  rƒ  Úlogits_to_keeprÌ   c                 ó¼  • Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R                  " SUUUUUUUU
U	UUUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU	GbQ  UR                  5       nUSSS2SS24   nU	SSS24   nUb‘  USS2UR                  S   * S24   R                  UR                  5      nUUR                  UR                  5      S:g     R                  5       nUUR                  UR                  5      S:g     R                  5       nO UR                  5       nUR                  5       n[        R                  " 5       nUR!                  SU R                   R"                  R$                  5      nUR!                  S5      R                  UR                  5      nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ ['        UUUR(                  UR*                  UR,                  UR.                  S9$ )	a±  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenize=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
N)r¢   rr  r^  rÉ   rú   r  r  rD   r‚  rû   r  rƒ  rË   r   .rÎ   r5   )ÚlossÚlogitsr  rÇ   r'  r‰  r8   )r¬   rû   r  rŠ  r?  r„   rŒ   ÚsliceÚlm_headr   rÔ   r¥   r¼   rÞ   rE  ÚCrossEntropyLossrÖ   r~   r:   r•   r  rÇ   r'  r‰  )rT   r¢   rr  rÉ   rú   r  r^  rË   r  r‚  rD   rû   r  rƒ  rž  r”  rÿ   rÇ   Úslice_indicesr¡  r   Úshift_logitsÚshift_labelsÚshift_attention_maskÚloss_fctÚflat_logitsÚflat_labelsÚoutputs                               rW   r¤   Ú&Gemma3ForConditionalGeneration.forwardc  s~  € ð@ 2CÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆà—*’*ð 
ØØ%Ø)Ø)Ø%Ø+Ø'ØØØ/Ø!5Ø#Ø)ñ
ð ñ
ˆð"   ™
ˆä8BÀ>ÔSV×8WÑ8Wœ˜~˜o¨tÔ4Ð]kˆØ—‘˜mªA¨}ºaÐ,?Ñ@ÓAˆàˆØÒà—\‘\“^ˆFØ! # s¨ sªA +Ñ.ˆLØ! # q¡r '™?ˆLØÑ)ð (6²a¸,×:LÑ:LÈQÑ:OÐ9OÑ9QÐ6QÑ'R×'UÑ'UÐV\×VcÑVcÓ'dÐ$Ø+Ð,@×,CÑ,CÀFÇMÁMÓ,RÐVWÑ,WÑX×cÑcÓeØ+Ð,@×,CÑ,CÀL×DWÑDWÓ,XÐ\]Ñ,]Ñ^×iÑiÓk‘à+×6Ñ6Ó8Ø+×6Ñ6Ó8ä×*Ò*Ó,ˆHà&×+Ñ+¨B°·±×0GÑ0G×0RÑ0RÓSˆKØ&×+Ñ+¨BÓ/×2Ñ2°<×3FÑ3FÓGˆKÙ˜K¨Ó5ˆDæØY ¨¨ Ñ,ˆFØ'+Ñ'7D7˜VÑ#ÐC¸VÐCä+ØØØ#×3Ñ3Ø!×/Ñ/Ø×)Ñ)Ø '× ;Ñ ;ñ
ð 	
r_   c                 óV   >• [         TU ]  " U4UUUUUU	U
US.UD6nUS   S:X  a  XmS'   U$ )N)r  r  rÉ   rú   rË   rD   rž  r^  r   rr  )r†   Úprepare_inputs_for_generation)rT   r¢   r  r  rË   rú   rr  rÉ   r^  rD   rž  r‚  rU   Úmodel_inputsr‡   s                 €rW   r¯  Ú<Gemma3ForConditionalGeneration.prepare_inputs_for_generationâ  s[   ø€ ô  ‘wÒ<Øð
à+Ø'Ø)Ø%Ø)ØØ)Ø)ñ
ð ñ
ˆð ˜!Ñ Ó!Ø+7˜Ñ(àÐr_   c                 ó   • [        S5      erz  r{  r}  s     rW   Ú5_prepare_4d_causal_attention_mask_with_cache_positionÚTGemma3ForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  r  r_   r¬   r$  c                 ó  • U R                  5       UUUUUS.nUbá  UR                  S   S:w  aÎ  US:H  R                  UR                  5      n	U	[        R
                  R                  U	SSS9S S 2S S24   ) -  n
[        R                  " U
R                  5       SS9S-
  n[        R                  " X›[        R                  " US5      5      n[        UR                  UR                  5      X°R                  5      US'   [        S	0 UD6$ )
Nr#  r5   r…  r   r†  rÎ   r‡  rˆ  r8   )r  rÔ   r¥   r¼   rE  r  r‘  rŸ   r’  rŒ   rg  r“  ro  r€   r   )r¬   r$  rÉ   rË   r  rú   r^  rU   r3  r—  r˜  r_  s               rW   r   Ú8Gemma3ForConditionalGeneration.create_masks_for_generate	  s
  € ð ×,Ñ,Ó.Ø(Ø,Ø,Ø.Ø(ñ
ˆð Ñ%¨,×*<Ñ*<¸QÑ*?À1Ó*Dð
 '¨!Ñ+×/Ñ/°×0EÑ0EÓFˆHØ&¬"¯-©-×*;Ñ*;¸HÀfÐTUÐ*;Ð*VÒWXÐZ]Ð[]ÐZ]ÐW]Ñ*^Ð)^Ñ^ˆOÜ#Ÿlšl¨?×+>Ñ+>Ó+@ÀaÑHÈ1ÑLˆOÜ#Ÿkšk¨(ÄUÇ_Â_ÐUcÐegÓEhÓiˆOÜ.JØ×!Ñ! .×"7Ñ"7Ó8¸/×KeÑKeó/ˆKÐ*Ñ+ô )Ñ7¨;Ñ7Ð7r_   r8   )NNNNNNNNNNNNNr   )
NNNNNNNTNNra   )rl   rm   rn   ro   r   rŸ   rê   r  r   r©   r   r›  r   rS   rŒ   rë   r•   r¤   r¯  r³  Ústaticmethodr
   r…   r   rt   rŽ   r   s   @rW   r  r  b  s-  ø† Øð '+Ø*.Ø15Ø37ØKOØ59Ø59Ø59Ø-1Ø$(Ø,0Ø/3Ø&*Ø34ñ|
à×#Ñ#ð|
ð ×'Ñ'ð|
ð ! §¡Ñ.ð	|
ð
 ˜u×/Ñ/Ñ0ð|
ð " %¨¨U×->Ñ->Ñ(?ÀÐ(FÑ"GÑHð|
ð ! ×!1Ñ!1Ñ2ð|
ð ! ×!1Ñ!1Ñ2ð|
ð   × 1Ñ 1Ñ2ð|
ð ˜×)Ñ)Ñ*ð|
ð ˜D‘>ð|
ð $ D™>ð|
ð ' t™nð|
ð ˜d‘^ð|
ð ˜c 5§<¡<Ð/Ñ0ð|
ð" 
ˆuÐ2Ð2Ñ	3ô#|
ó ð|
ðB ØØØØØØØØØ÷"òH<ð ð 26ñ!8Ø ð!8à—l‘lð!8ð ! §¡Ñ.ð!8ð Ÿ™ð	!8ð
 " %™ð!8ð ˜uŸ|™|Ñ,ð!8ð ! §¡Ñ.ð!8ð 
ô!8ó ö!8r_   r  c                   óV  ^ • \ rS rSrU 4S jrS rS r\\         SS\	R                  S\\	R                     S\\	R                     S\\	R                     S	\\   S
\\	R                     S\\	R                     S\\	R                     S\\   S\\   S\4S jj5       5       rSrU =r$ )ÚGemma3ForSequenceClassificationi.  c                 óø   >• [         TU ]  U5        UR                  U l        [        U5      U l        [
        R                  " UR                  R                  U R                  SS9U l	        U R                  5         g )NF)Úbias)r†   r9   Ú
num_labelsrq  r?  rE  ÚLinearr~   r<   ÚscoreÚ	post_initr¯   s     €rW   r9   Ú(Gemma3ForSequenceClassification.__init__/  sZ   ø€ Ü‰Ñ˜Ô Ø ×+Ñ+ˆŒÜ  Ó(ˆŒ
Ü—Y’Y˜v×1Ñ1×=Ñ=¸t¿¹ÐUZÑ[ˆŒ
ð 	‰Õr_   c                 ó6   • U R                   R                  5       $ ra   )r?  rŒ  r]   s    rW   rŒ  Ú4Gemma3ForSequenceClassification.get_input_embeddings8  s   € Øz‰z×.Ñ.Ó0Ð0r_   c                 ó:   • U R                   R                  U5        g ra   )r?  Úset_input_embeddingsrb   s     rW   rÄ  Ú4Gemma3ForSequenceClassification.set_input_embeddings;  s   € Ø
‰
×'Ñ'¨Õ.r_   r¢   rr  rÉ   rú   r  r  r^  r‚  rD   rU   rÌ   c
                 óÔ  • U R                   " U4UUUUUUU	S.U
D6nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                  R
                  R                  c  US:w  a  [        S5      eU R                  R
                  R                  c  SnOËUbš  XR                  R
                  R                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                   R"                   S35        U[        R                  " XíR                  S	9U4   nSnUb  U R%                  XØUU R                  S
9n['        UUUR(                  UR*                  UR,                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
)rÉ   rr  rú   r  r  r^  rD   Nr   r5   z=Cannot handle batch sizes > 1 if no padding token is defined.rÎ   )r¼   r§   zŠ will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r"  )r¡  r‚  Úpooled_logitsr¬   )r   r¡  r  rÇ   r'  )r?  r&  r¾  rÔ   r¬   r~   r/   r(  r¥   r¼   rŸ   Úint32r,  Úargmaxr‚   r*  r‡   rl   Úloss_functionr   r  rÇ   r'  )rT   r¢   rr  rÉ   rú   r  r  r^  r‚  rD   rU   Útransformer_outputsrÇ   r¡  rV  Úlast_non_pad_tokenÚnon_pad_maskÚtoken_indicesrÇ  r   s                       rW   r¤   Ú'Gemma3ForSequenceClassification.forward>  sÙ  € ð, #ŸjšjØð

à)Ø%Ø%Ø+Ø'Ø)Øñ

ð ñ

Ðð ,×=Ñ=ˆØ—‘˜MÓ*ˆàÑ Ø"Ÿ™¨Ñ+‰Jà&×,Ñ,¨QÑ/ˆJà;‰;×"Ñ"×/Ñ/Ñ7¸JÈ!»OÜÐ\Ó]Ð]Ø;‰;×"Ñ"×/Ñ/Ñ7Ø!#ÑØÑ"à%¯©×)@Ñ)@×)MÑ)MÑM×QÑQÐRX×R_ÑR_Ôaf×alÑalÓmˆLÜ!ŸLšL¨¯©¸Ñ)<ÀVÇ]Á]ÔZ_×ZeÑZeÑfˆMØ"/°,Ñ">×!FÑ!FÀrÓ!JÑà!#ÐÜ×ÑØ—>‘>×*Ñ*Ð+ð ,Zð Zôð
 œuŸ|š|¨J¿}¹}ÑMÐOaÐaÑbˆàˆØÑØ×%Ñ%¨VÐR_Ðhl×hsÑhsÐ%ÐtˆDä/ØØ Ø/×?Ñ?Ø-×;Ñ;Ø*×5Ñ5ñ
ð 	
r_   )r?  r¼  r¾  r9  )rl   rm   rn   ro   r9   rŒ  rÄ  r   r   rŸ   rê   r   r  r©   r   rS   r   r   r   r¤   rt   rŽ   r   s   @rW   r¹  r¹  .  s  ø† õò1ò/ð Øð '+Ø48Ø15Ø37Ø+/Ø59Ø59Ø-1Ø$(ñC
à×#Ñ#ðC
ð ˜u×0Ñ0Ñ1ðC
ð ! §¡Ñ.ð	C
ð
 ˜u×/Ñ/Ñ0ðC
ð " %™ðC
ð   × 1Ñ 1Ñ2ðC
ð ! ×!1Ñ!1Ñ2ðC
ð ˜×)Ñ)Ñ*ðC
ð ˜D‘>ðC
ð Ð+Ñ,ðC
ð 
*ôC
ó ó öC
r_   r¹  )rv   r+   r  r  r<  r  rq  r¹  )Vr  rZ   Úcollections.abcr   Útypingr   r   r   rŸ   Útorch.nnrE  Útorch.utils.checkpointÚcache_utilsr   r	   Úconfiguration_utilsr
   r   Úmasking_utilsr   r   r   Úmodeling_flash_attention_utilsr   Úmodeling_layersr   Úmodeling_outputsr   r   Úmodeling_rope_utilsr   Úmodeling_utilsr   Úprocessing_utilsr   Úutilsr   r   r   r   Úgemma2.configuration_gemma2r   Úgemma2.modeling_gemma2r   r   r   r   r    r!   r"   r#   r$   Úpaligemma.modeling_paligemmar%   r&   r'   r(   Úsiglipr)   Ú
get_loggerrl   r‚   r+   rv   r‘   r•   Ú	Embeddingr—   r«   r´   rº   r¿   rí   ÚGEMMA3_START_DOCSTRINGr  r  r<  ÚModuler
  r©   rŒ   ro  rq  r  r¹  Ú__all__r8   r_   rW   Ú<module>rç     s¿  ðó  Û Ý $ß 'Ñ 'ã Ý Û ç .ß Jß mÑ mÝ BÝ 9ß YÝ 9Ý 5Ý &ß RÓ RÝ 6÷
÷ 
õ 
÷ó õ (ð 
×	Ò	˜HÓ	%€ôI-|Ð%5ô I-ôX[#Ð#ô [#ô|	Ð <ô 	ô	Ð#Bô 	ô
S B§L¡Lô 
Sô!	ô !ô
Mô ô
!Ð1ô !ô7)oô 7)ôt>Ð3ô >ðB Ð ô;Ð1ô ;ô{
kô {
ô|-Ð)ô -ô!@ §	¡	ô !@ðHØ˜UŸ\™\Ñ*ðà˜eŸl™lÑ+ðð ðð ˆhÑô	ôB|
.ô |
ô~I8Ð%Fô I8ôXU
Ð&;ô U
òp	r_   