
import copy
from collections.abc import Callable
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import PretrainedConfig
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask, create_masks_for_generate, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import check_model_inputs
from ..auto import AutoModel
from .configuration_gemma3 import Gemma3Config, Gemma3TextConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Gemma3 outputs, with hidden states and attentions.
    """
)
class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
    r"""
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_states )__name__
__module____qualname____firstlineno____doc__r)   r   torchFloatTensor__annotations____static_attributes__r*       b/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/gemma3/modeling_gemma3.pyr'   r'   2   s    
 8<%"3"34;r4   r'   zR
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    c                   &   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\\R                     \4      \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)Gemma3CausalLMOutputWithPastH   a-  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr)   r*   )r+   r,   r-   r.   r/   r9   r   r0   r1   r2   r:   r;   r   listr	   r<   tupler=   r)   r3   r*   r4   r5   r7   r7   H   s      )-D(5$$
%,*.FHU&&'.GKOXeD):):$;U$BCDK8<M8E%"3"345<59Ju001297;%"3"34;r4   r7   c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3TextScaledWordEmbeddingg   zT


class Gemma3TextScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)


class Gemma3MLP(nn.Module):
    def __init__(self, config: Gemma3TextConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_activation]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Gemma3RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float())
        # Gemma applies the (1 + weight) scaling in float32 before casting back to the input dtype
        output = output * (1.0 + self.weight.float())
        return output.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


class Gemma3RotaryEmbedding(nn.Module):
    def __init__(self, config: Gemma3TextConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = module.head_dim**-0.5

    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class Gemma3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
        super().__init__()
        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = config.query_pre_attn_scalar**-0.5
        self.attention_dropout = self.config.attention_dropout
        self.is_causal = True

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.attn_logit_softcapping = self.config.attn_logit_softcapping
        self.sliding_window = config.sliding_window if self.is_sliding else None

        self.q_norm = Gemma3RMSNorm(dim=self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Gemma3RMSNorm(dim=self.head_dim, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Gemma3DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx
        self.attention_type = config.layer_types[layer_idx]
        self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Gemma3MLP(config)
        self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings_global: torch.Tensor,
        position_embeddings_local: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # sliding-window layers use the local RoPE, full-attention layers the global one
        if self.self_attn.is_sliding:
            position_embeddings = position_embeddings_local
        else:
            position_embeddings = position_embeddings_global

        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class Gemma3PreTrainedModel(PreTrainedModel):
    config: Gemma3Config
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Gemma3DecoderLayer",
        "SiglipVisionEmbeddings",
        "SiglipEncoderLayer",
        "SiglipMultiheadAttentionPoolingHead",
    ]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": Gemma3DecoderLayer,
        "attentions": Gemma3Attention,
    }

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, Gemma3MultiModalProjector):
            module.mm_input_projection_weight.data.zero_()


@auto_docstring
class Gemma3TextModel(Gemma3PreTrainedModel):
    config: Gemma3TextConfig

    def __init__(self, config: Gemma3TextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = Gemma3TextScaledWordEmbedding(
            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5
        )
        self.layers = nn.ModuleList(
            [Gemma3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Gemma3RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # the sliding-window layers use a second rotary embedding built from the local base frequency
        config = copy.deepcopy(config)
        config.rope_theta = config.rope_local_base_freq
        config.rope_scaling = {"rope_type": "default"}
        self.rotary_emb_local = Gemma3RotaryEmbedding(config=config)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        # it may already have been prepared by e.g. `generate`
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        hidden_states = inputs_embeds

        # position embeddings shared across the decoder layers (global and local RoPE)
        position_embeddings_global = self.rotary_emb(hidden_states, position_ids)
        position_embeddings_local = self.rotary_emb_local(hidden_states, position_ids)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                position_embeddings_global=position_embeddings_global,
                position_embeddings_local=position_embeddings_local,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **flash_attn_kwargs,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


@auto_docstring
class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    config: Gemma3TextConfig
    base_model_prefix = "language_model"

    def __init__(self, config: Gemma3TextConfig):
        super().__init__(config)
        self.model = Gemma3TextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
Example:

```python
>>> from transformers import AutoTokenizer, Gemma3ForCausalLM

>>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
        ```"""

        if self.training and self.config._attn_implementation != "eager":
            logger.warning_once(
                "It is strongly recommended to train Gemma3 models with the `eager` attention implementation "
                f"instead of `{self.config._attn_implementation}`. Use `eager` with "
                "`AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`."
            )
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if self.config.final_logit_softcapping is not None:
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class Gemma3MultiModalProjector(nn.Module):
    def __init__(self, config: Gemma3Config):
        super().__init__()
        self.mm_input_projection_weight = nn.Parameter(
            torch.zeros(config.vision_config.hidden_size, config.text_config.hidden_size)
        )

        self.mm_soft_emb_norm = Gemma3RMSNorm(
            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
        )

        self.patches_per_image = int(config.vision_config.image_size // config.vision_config.patch_size)
        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
        self.kernel_size = self.patches_per_image // self.tokens_per_side
        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size)

    def forward(self, vision_outputs: torch.Tensor):
        batch_size, _, seq_length = vision_outputs.shape

        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
            batch_size, seq_length, self.patches_per_image, self.patches_per_image
        )
        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()

        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)

        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)

        projected_vision_outputs = torch.matmul(normed_vision_outputs, self.mm_input_projection_weight)
        return projected_vision_outputs.type_as(vision_outputs)


def token_type_ids_mask_function(
    token_type_ids: Optional[torch.Tensor],
    image_group_ids: Optional[torch.Tensor],
    tokens_per_image: int,
) -> Optional[Callable]:
    """
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    """
    if token_type_ids is None:
        return None

    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
        # the cache may extend beyond the input length, so guard the index before gathering
        safe_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
        token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_idx]
        token_type_ids_at_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], token_type_ids_at_kv_idx, 0)

        image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_idx]
        image_group_ids_at_kv_idx = torch.where(kv_idx < image_group_ids.shape[1], image_group_ids_at_kv_idx, -1)

        is_image_block = (token_type_ids[batch_idx, q_idx] == 1) & (token_type_ids_at_kv_idx == 1)
        same_image_block = image_group_ids[batch_idx, q_idx] == image_group_ids_at_kv_idx

        # attention is bidirectional only within tokens of the same image block
        return is_image_block & same_image_block

    return inner_mask


@auto_docstring(
    custom_intro="""
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.
    """
)
class Gemma3Model(Gemma3PreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
    accepts_loss_kwargs = False

    def __init__(self, config: Gemma3Config):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config=config.vision_config)
        self.multi_modal_projector = Gemma3MultiModalProjector(config)
        self.vocab_size = config.text_config.vocab_size

        language_model = AutoModel.from_config(config=config.text_config)
        self.language_model = language_model

        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
                The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_outputs = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        image_features = self.multi_modal_projector(vision_outputs)
        return image_features

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains the multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder
        token count is equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **lm_kwargs,
    ) -> Union[tuple, Gemma3ModelOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
        ```"""
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # replace the image token id with 0 if it is out of the text vocabulary, to avoid index errors
        if input_ids is not None and self.config.image_token_id >= self.vocab_size:
            special_image_mask = input_ids == self.config.image_token_id
            llm_input_ids = input_ids.clone()
            llm_input_ids[special_image_mask] = 0
        else:
            llm_input_ids = input_ids

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(llm_input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        # merge text and images
        image_features = None
        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values)
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        # it may already have been prepared by e.g. `generate`
        if not isinstance(causal_mask_mapping := attention_mask, dict):
            mask_kwargs = {
                "config": self.config.get_text_config(),
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            if token_type_ids is not None and inputs_embeds.shape[1] != 1:
                # an additional `or` mask makes image tokens of the same image attend to each other bidirectionally
                is_image = (token_type_ids == 1).to(cache_position.device)
                new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
                image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
                image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1))
                mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
                    token_type_ids.to(cache_position.device), image_group_ids, self.config.mm_tokens_per_image
                )

            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        outputs = self.language_model(
            attention_mask=causal_mask_mapping,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **lm_kwargs,
        )

        return Gemma3ModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values if use_cache else None,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )


@auto_docstring(
    custom_intro="""
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.
    """
)
class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: Gemma3Config):
        super().__init__(config)
        self.model = Gemma3Model(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(self, pixel_values):
        return self.model.get_image_features(pixel_values)

    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[tuple, Gemma3CausalLMOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenize=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            labels=labels,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            # Upcast to float to avoid potential precision issues when computing the loss
            logits = logits.float()
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            if attention_mask is not None:
                # use the 2D input attention mask to select valid positions; crop it in case it is longer
                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
            else:
                shift_logits = shift_logits.contiguous()
                shift_labels = shift_labels.contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()

            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
            flat_labels = shift_labels.view(-1).to(shift_logits.device)
            loss = loss_fct(flat_logits, flat_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return Gemma3CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        pixel_values=None,
        attention_mask=None,
        token_type_ids=None,
        use_cache=True,
        logits_to_keep=None,
        labels=None,
        **kwargs,
    ):
        # Overwritten -- custom `pixel_values` handling
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            cache_position=cache_position,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # During cached decoding the input ids no longer contain image tokens, so pixel values are only
        # needed on the first (prefill) step.
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

    @staticmethod
    def create_masks_for_generate(
        config: PretrainedConfig,
        input_embeds: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        cache_position: torch.Tensor,
        past_key_values: Optional[Cache],
        position_ids: Optional[torch.Tensor],
        token_type_ids: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> dict:
        mask_kwargs = {
            "config": config.get_text_config(),
            "input_embeds": input_embeds,
            "attention_mask": attention_mask,
            "cache_position": cache_position,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
        }
        # add the token type ids mask for generation as well
        if token_type_ids is not None and input_embeds.shape[1] != 1:
            is_image = (token_type_ids == 1).to(cache_position.device)
            new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
            image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
            image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1))
            mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
                token_type_ids.to(cache_position.device), image_group_ids, config.mm_tokens_per_image
            )

        return create_masks_for_generate(**mask_kwargs, **kwargs)


class Gemma3ForSequenceClassification(Gemma3PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Gemma3Model(config)
        self.score = nn.Linear(config.text_config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        transformer_outputs: Gemma3ModelOutputWithPast = self.model(
            input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not a pad token
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = [
    "Gemma3PreTrainedModel",
    "Gemma3TextModel",
    "Gemma3ForCausalLM",
    "Gemma3ForConditionalGeneration",
    "Gemma3Model",
    "Gemma3ForSequenceClassification",
]