
import copy
from collections.abc import Callable
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.nn as nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import PretrainedConfig
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask, create_masks_for_generate, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from ..auto import AutoModel
from .configuration_gemma3 import Gemma3Config, Gemma3TextConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Gemma3 outputs, with hidden states and attentions.
    """
)
class Gemma3ModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for Gemma3 causal language model (or autoregressive) outputs.
    """
)
class Gemma3CausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Cache] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


class Gemma3TextScaledWordEmbedding(nn.Embedding):
    """
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    """

    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: float = 1.0):
        super().__init__(num_embeddings, embedding_dim, padding_idx)
        self.register_buffer("embed_scale", torch.tensor(embed_scale), persistent=False)

    def forward(self, input_ids: torch.Tensor):
        return super().forward(input_ids) * self.embed_scale.to(self.weight.dtype)


class Gemma3MLP(nn.Module):
    def __init__(self, config: Gemma3TextConfig):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_activation]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class Gemma3RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.zeros(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        output = self._norm(x.float())
        # Gemma scales with (1 + weight) in float32 before casting back to the input dtype
        output = output * (1.0 + self.weight.float())
        return output.type_as(x)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.eps}"


class Gemma3RotaryEmbedding(nn.Module):
    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: Gemma3TextConfig, device=None):
        super().__init__()
        # BC: "rope_type" was originally "type"
        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
        else:
            self.rope_type = "default"
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config
        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]

        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.original_inv_freq = self.inv_freq

    @torch.no_grad()
    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
    def forward(self, x, position_ids):
        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
        position_ids_expanded = position_ids[:, None, :].float()

        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
        with torch.autocast(device_type=device_type, enabled=False):  # force float32
            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
            emb = torch.cat((freqs, freqs), dim=-1)
            cos = emb.cos() * self.attention_scaling
            sin = emb.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


def rotate_half(x):
    """Rotates half the hidden dims of the input."""
    x1 = x[..., : x.shape[-1] // 2]
    x2 = x[..., x.shape[-1] // 2 :]
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    """
    cos = cos.unsqueeze(unsqueeze_dim)
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    softcap: Optional[float] = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor]:
    if scaling is None:
        scaling = module.head_dim**-0.5

    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling

    if softcap is not None:
        attn_weights = attn_weights / softcap
        attn_weights = torch.tanh(attn_weights)
        attn_weights = attn_weights * softcap
    if attention_mask is not None:  # no matter the length, we just slice it
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    # upcast attention to fp32
    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()
    return attn_output, attn_weights


class Gemma3Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
        super().__init__()
        self.is_sliding = config.layer_types[layer_idx] == "sliding_attention"
        self.config = config
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.scaling = config.query_pre_attn_scalar**-0.5
        self.attention_dropout = self.config.attention_dropout
        self.is_causal = not self.config.use_bidirectional_attention

        self.q_proj = nn.Linear(
            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
        )
        self.k_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.v_proj = nn.Linear(
            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
        )
        self.o_proj = nn.Linear(
            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
        )
        self.attn_logit_softcapping = self.config.attn_logit_softcapping
        self.sliding_window = config.sliding_window if self.is_sliding else None

        self.q_norm = Gemma3RMSNorm(dim=self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Gemma3RMSNorm(dim=self.head_dim, eps=config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        past_key_values: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        query_states = self.q_norm(query_states)
        key_states = self.k_norm(key_states)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_values is not None:
            # sin and cos are specific to RoPE models; cache_position needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=self.attention_dropout if self.training else 0.0,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Gemma3DecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Gemma3TextConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.layer_idx = layer_idx
        self.attention_type = config.layer_types[layer_idx]
        self.self_attn = Gemma3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Gemma3MLP(config)
        self.input_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.pre_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)
        self.post_feedforward_layernorm = Gemma3RMSNorm(self.hidden_size, eps=config.rms_norm_eps)

    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings_global: torch.Tensor,
        position_embeddings_local: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)

        # apply global RoPE to non-sliding layers only
        if self.self_attn.is_sliding:
            position_embeddings = position_embeddings_local
        else:
            position_embeddings = position_embeddings_global

        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.pre_feedforward_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = self.post_feedforward_layernorm(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)

        return outputs


@auto_docstring
class Gemma3PreTrainedModel(PreTrainedModel):
    config: Gemma3Config
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Gemma3DecoderLayer",
        "SiglipVisionEmbeddings",
        "SiglipEncoderLayer",
        "SiglipMultiheadAttentionPoolingHead",
    ]
    _skip_keys_device_placement = ["past_key_values"]
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _supports_attention_backend = True
    _can_record_outputs = {
        "hidden_states": Gemma3DecoderLayer,
        "attentions": Gemma3Attention,
    }

    def _init_weights(self, module):
        super()._init_weights(module)
        if isinstance(module, Gemma3MultiModalProjector):
            module.mm_input_projection_weight.data.zero_()
        elif "RMSNorm" in module.__class__.__name__:
            module.weight.data.zero_()


def _bidirectional_window_overlay(sliding_window: int) -> Callable[[int, int, int, int], bool]:
    """
    Enables a bidirectional mask within the sliding window.
    """

    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
        """A token can attend to any other token if their absolute distance is within
        the (exclusive) sliding window size (distance < sliding_window)."""
        return abs(q_idx - kv_idx) < sliding_window

    return inner_mask


@auto_docstring
class Gemma3TextModel(Gemma3PreTrainedModel):
    config: Gemma3TextConfig

    def __init__(self, config: Gemma3TextConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        # Gemma3 downcasts the below to bfloat16, causing sqrt(3072) to become 55.5 instead of 55.4256
        self.embed_tokens = Gemma3TextScaledWordEmbedding(
            config.vocab_size, config.hidden_size, self.padding_idx, embed_scale=self.config.hidden_size**0.5
        )
        self.layers = nn.ModuleList(
            [Gemma3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = Gemma3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.rotary_emb = Gemma3RotaryEmbedding(config=config)
        self.gradient_checkpointing = False

        # Sliding-window layers use a local RoPE built from `rope_local_base_freq` with default scaling
        config = copy.deepcopy(config)
        config.rope_theta = config.rope_local_base_freq
        config.rope_scaling = {"rope_type": "default"}
        self.rotary_emb_local = Gemma3RotaryEmbedding(config=config)

        # Initialize weights and apply final processing
        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPast:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None and not self.training:
            past_key_values = DynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config,
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            sliding_mask_kwargs = mask_kwargs.copy()

            if self.config.use_bidirectional_attention:
                mask_kwargs["or_mask_function"] = lambda *args: torch.tensor(True, dtype=torch.bool)
                sliding_mask_kwargs["or_mask_function"] = _bidirectional_window_overlay(self.config.sliding_window)

            # Create the masks
            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**sliding_mask_kwargs),
            }

        hidden_states = inputs_embeds

        # create position embeddings to be shared across the decoder layers
        position_embeddings_global = self.rotary_emb(hidden_states, position_ids)
        position_embeddings_local = self.rotary_emb_local(hidden_states, position_ids)

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = decoder_layer(
                hidden_states,
                position_embeddings_global=position_embeddings_global,
                position_embeddings_local=position_embeddings_local,
                attention_mask=causal_mask_mapping[decoder_layer.attention_type],
                position_ids=position_ids,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )


@auto_docstring
class Gemma3ForCausalLM(Gemma3PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    config: Gemma3TextConfig
    base_model_prefix = "language_model"

    def __init__(self, config: Gemma3TextConfig):
        super().__init__(config)
        self.model = Gemma3TextModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import AutoTokenizer, Gemma3ForCausalLM

        >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

        >>> prompt = "What is your favorite condiment?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is your favorite condiment?"
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        outputs: BaseModelOutputWithPast = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs.last_hidden_state
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if self.config.final_logit_softcapping is not None:
            logits = logits / self.config.final_logit_softcapping
            logits = torch.tanh(logits)
            logits = logits * self.config.final_logit_softcapping

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class Gemma3MultiModalProjector(nn.Module):
    def __init__(self, config: Gemma3Config):
        super().__init__()
        self.mm_input_projection_weight = nn.Parameter(
            torch.zeros(config.vision_config.hidden_size, config.text_config.hidden_size)
        )
        self.mm_soft_emb_norm = Gemma3RMSNorm(
            config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps
        )
        self.patches_per_image = int(config.vision_config.image_size // config.vision_config.patch_size)
        self.tokens_per_side = int(config.mm_tokens_per_image**0.5)
        self.kernel_size = self.patches_per_image // self.tokens_per_side
        self.avg_pool = nn.AvgPool2d(kernel_size=self.kernel_size, stride=self.kernel_size)

    def forward(self, vision_outputs: torch.Tensor):
        batch_size, _, seq_length = vision_outputs.shape

        reshaped_vision_outputs = vision_outputs.transpose(1, 2)
        reshaped_vision_outputs = reshaped_vision_outputs.reshape(
            batch_size, seq_length, self.patches_per_image, self.patches_per_image
        )
        reshaped_vision_outputs = reshaped_vision_outputs.contiguous()

        pooled_vision_outputs = self.avg_pool(reshaped_vision_outputs)
        pooled_vision_outputs = pooled_vision_outputs.flatten(2)
        pooled_vision_outputs = pooled_vision_outputs.transpose(1, 2)

        normed_vision_outputs = self.mm_soft_emb_norm(pooled_vision_outputs)
        projected_vision_outputs = torch.matmul(normed_vision_outputs, self.mm_input_projection_weight)
        return projected_vision_outputs.type_as(vision_outputs)


def token_type_ids_mask_function(
    token_type_ids: Optional[torch.Tensor],
    image_group_ids: Optional[torch.Tensor],
    tokens_per_image: int,
) -> Optional[Callable]:
    """
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    """
    # Do not return an additional mask in this case
    if token_type_ids is None:
        return None

    def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool:
        # Guard against out-of-range kv indices before indexing into the token type ids
        safe_idx = torch.where(kv_idx < token_type_ids.shape[1], kv_idx, 0)
        token_type_ids_at_kv_idx = token_type_ids[batch_idx, safe_idx]
        token_type_ids_at_kv_idx = torch.where(kv_idx < token_type_ids.shape[1], token_type_ids_at_kv_idx, 0)

        image_group_ids_at_kv_idx = image_group_ids[batch_idx, safe_idx]
        image_group_ids_at_kv_idx = torch.where(kv_idx < image_group_ids.shape[1], image_group_ids_at_kv_idx, -1)

        is_image_block = (token_type_ids[batch_idx, q_idx] == 1) & (token_type_ids_at_kv_idx == 1)
        same_image_block = image_group_ids[batch_idx, q_idx] == image_group_ids_at_kv_idx

        # This is bidirectional attention whenever we are dealing with image tokens of the same image
        return is_image_block & same_image_block

    return inner_mask


@auto_docstring(
    custom_intro="""
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.
    """
)
class Gemma3Model(Gemma3PreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
    accepts_loss_kwargs = False

    def __init__(self, config: Gemma3Config):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config=config.vision_config)
        self.multi_modal_projector = Gemma3MultiModalProjector(config)
        self.vocab_size = config.text_config.vocab_size

        language_model = AutoModel.from_config(config=config.text_config)
        self.language_model = language_model

        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model
    def get_image_features(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """
        Projects the last hidden state from the vision model into language model space.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        """
        vision_outputs = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        image_features = self.multi_modal_projector(vision_outputs)
        return image_features

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **lm_kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Gemma3ModelOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma32-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/gemma32-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```"""
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Replace the image token id with PAD if it is out of the text vocabulary, to avoid index errors
        if input_ids is not None and self.config.image_token_id >= self.vocab_size:
            special_image_mask = input_ids == self.config.image_token_id
            llm_input_ids = input_ids.clone()
            llm_input_ids[special_image_mask] = 0
        else:
            llm_input_ids = input_ids

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(llm_input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        # Merge text and images
        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values)
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        if not isinstance(causal_mask_mapping := attention_mask, dict):
            # Prepare mask arguments
            mask_kwargs = {
                "config": self.config.get_text_config(),
                "input_embeds": inputs_embeds,
                "attention_mask": attention_mask,
                "cache_position": cache_position,
                "past_key_values": past_key_values,
                "position_ids": position_ids,
            }
            # Tokens of the same image attend to each other bidirectionally; this only matters while prefilling
            is_prefill = past_key_values is None or past_key_values.get_seq_length() == 0
            if token_type_ids is not None and is_prefill:
                is_image = (token_type_ids == 1).to(cache_position.device)
                new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
                image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
                image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1))
                mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
                    token_type_ids.to(cache_position.device), image_group_ids, self.config.mm_tokens_per_image
                )

            causal_mask_mapping = {
                "full_attention": create_causal_mask(**mask_kwargs),
                "sliding_attention": create_sliding_window_causal_mask(**mask_kwargs),
            }

        outputs = self.language_model(
            attention_mask=causal_mask_mapping,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        return Gemma3ModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values if use_cache else None,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )


@auto_docstring(
    custom_intro="""
    The Base Gemma3 model which consists of a vision backbone and a language model without language modeling head.
    """
)
class Gemma3ForConditionalGeneration(Gemma3PreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]
    accepts_loss_kwargs = False

    def __init__(self, config: Gemma3Config):
        super().__init__(config)
        self.model = Gemma3Model(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(self, pixel_values):
        return self.model.get_image_features(pixel_values)

    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector
    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, Gemma3CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

        >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
        >>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

        >>> messages = [
        ...     {
        ...         "role": "system",
        ...         "content": [
        ...             {"type": "text", "text": "You are a helpful assistant."}
        ...         ]
        ...     },
        ...     {
        ...         "role": "user", "content": [
        ...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
        ...             {"type": "text", "text": "Where is the cat standing?"},
        ...         ]
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(
        ...     messages,
        ...     tokenize=True,
        ...     return_dict=True,
        ...     return_tensors="pt",
        ...     add_generation_prompt=True
        ... )
        >>> # Generate
        >>> generate_ids = model.generate(**inputs)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            cache_position=cache_position,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            labels=labels,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            # Upcast to float if we need to compute the loss to avoid potential precision issues
            logits = logits.float()
            shift_logits = logits[..., :-1, :]
            shift_labels = labels[..., 1:]
            if attention_mask is not None:
                # we use the input attention mask to shift the logits and labels, because it is 2D
                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
                shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
                shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
            else:
                shift_logits = shift_logits.contiguous()
                shift_labels = shift_labels.contiguous()
            # Flatten the tokens
            loss_fct = nn.CrossEntropyLoss()

            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
            flat_labels = shift_labels.view(-1).to(logits.device)
            loss = loss_fct(flat_logits, flat_labels)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return Gemma3CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        pixel_values=None,
        attention_mask=None,
        token_type_ids=None,
        use_cache=True,
        logits_to_keep=None,
        labels=None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            cache_position=cache_position,
            use_cache=use_cache,
            logits_to_keep=logits_to_keep,
            token_type_ids=token_type_ids,
            **kwargs,
        )

        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special
        # image tokens anymore. Otherwise pixel values should be passed to the model.
        if cache_position[0] == 0:
            model_inputs["pixel_values"] = pixel_values

        return model_inputs

    @staticmethod
    def create_masks_for_generate(
        config: PretrainedConfig,
        input_embeds: torch.Tensor,
        attention_mask: Optional[torch.Tensor],
        cache_position: torch.Tensor,
        past_key_values: Optional[Cache],
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> dict:
        # Prepare mask arguments
        mask_kwargs = {
            "config": config.get_text_config(),
            "input_embeds": input_embeds,
            "attention_mask": attention_mask,
            "cache_position": cache_position,
            "past_key_values": past_key_values,
            "position_ids": position_ids,
        }
        # Add the token type ids mask for generate-from-image (only relevant at prefill, i.e. seq_len > 1)
        if token_type_ids is not None and input_embeds.shape[1] != 1:
            is_image = (token_type_ids == 1).to(cache_position.device)
            new_image_start = is_image & ~nn.functional.pad(is_image, (1, 0), value=0)[:, :-1]
            image_group_ids = torch.cumsum(new_image_start.int(), dim=1) - 1
            image_group_ids = torch.where(is_image, image_group_ids, torch.full_like(token_type_ids, -1))
            mask_kwargs["or_mask_function"] = token_type_ids_mask_function(
                token_type_ids.to(cache_position.device), image_group_ids, config.mm_tokens_per_image
            )

        return create_masks_for_generate(**mask_kwargs)


class Gemma3ForSequenceClassification(Gemma3PreTrainedModel):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
    }

    def __init__(self, config: Gemma3Config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = Gemma3Model(config)
        self.score = nn.Linear(config.text_config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> SequenceClassifierOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        transformer_outputs: Gemma3ModelOutputWithPast = self.model(
            input_ids,
            attention_mask=attention_mask,
            pixel_values=pixel_values,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            token_type_ids=token_type_ids,
            use_cache=use_cache,
            **kwargs,
        )
        hidden_states = transformer_outputs.last_hidden_state
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config)

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )
class Gemma3TextForSequenceClassification(GenericForSequenceClassification, Gemma3PreTrainedModel):
    """
    Gemma3TextForSequenceClassification is a text-only sequence classification model that works with Gemma3TextConfig.
    It uses the generic sequence classification implementation for efficiency and consistency.
    """

    config: Gemma3TextConfig


__all__ = [
    "Gemma3PreTrainedModel",
    "Gemma3TextModel",
    "Gemma3ForCausalLM",
    "Gemma3ForConditionalGeneration",
    "Gemma3Model",
    "Gemma3ForSequenceClassification",
    "Gemma3TextForSequenceClassification",
]