
    <h(                        S SK r S SKrS SKJrJr  S SKJr  S SKJrJ	r	  S SK
r
S SKJr  S SKJs  Jr  SSKJr  SSKJrJrJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJ r J!r!  SSK"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-J.r.J/r/  SSK0J1r1  SSK2J3r3J4r4J5r5J6r6  \/Rn                  " \85      r9\\-" SS9 " S S\ 5      5       5       r:\\-" SS9 " S S\+5      5       5       r; " S S\Rx                  5      r= " S S\Rx                  5      r> " S  S!\Rx                  5      r? " S" S#\Rx                  5      r@ " S$ S%\Rx                  5      rA " S& S'\Rx                  5      rB " S( S)\Rx                  5      rC " S* S+\Rx                  5      rD " S, S-\Rx                  5      rE " S. S/\Rx                  5      rF " S0 S1\'5      rG " S2 S3\R                  5      rI " S4 S5\Rx                  5      rJ " S6 S7\Rx                  5      rK " S8 S9\Rx                  5      rL " S: S;\Rx                  5      rMS< rNS=\
R                  S>\PS?\
R                  4S@ jrQ   SeSA\Rx                  SB\
R                  SC\
R                  SD\
R                  SE\\
R                     SF\RSG\\R   SH\\R   S?\S\
R                  \
R                  4   4SI jjrT  SfSJ\
R                  SK\
R                  SL\
R                  SM\\
R                     SN\P4
SO jjrU " SP SQ\Rx                  5      rV " SR SS\5      rW\- " ST SU\'5      5       rX\-" SVS9 " SW SX\X5      5       rY\-" SYS9 " SZ S[\X\5      5       rZ " S\ S]\Rx                  5      r[\-" S^S9 " S_ S`\X5      5       r\\-" SaS9 " Sb Sc\X\5      5       r]/ SdQr^g)g    N)CallableSequence)	dataclass)OptionalUnion   )ACT2FN)CacheDynamicCacheSlidingWindowLayer)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging   )	AutoModel   )Gemma3nAudioConfigGemma3nConfigGemma3nTextConfigGemma3nVisionConfigzL
    Base class for Gemma3n outputs, with hidden states and attentions.
    )custom_introc                   j    \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Srg)Gemma3nModelOutputWithPast2   a  
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
Nimage_hidden_statesaudio_hidden_states )__name__
__module____qualname____firstlineno____doc__r)   r   torchFloatTensor__annotations__r*   __static_attributes__r+       d/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/gemma3n/modeling_gemma3n.pyr'   r'   2   s5     8<%"3"34;7;%"3"34;r5   r'   zS
    Base class for Gemma3n causal language model (or autoregressive) outputs.
    c                   N   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\\R                     \4      \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   Sr\\R                     \	S
'   Srg)Gemma3nCausalLMOutputWithPastM   aI  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
audio_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    audio_hidden_states of the model produced by the audio encoder and after projecting the last hidden state.
Nlosslogitspast_key_valueshidden_states
attentionsr)   r*   r+   )r,   r-   r.   r/   r0   r:   r   r1   r2   r3   r;   r<   r   listr
   r=   tupler>   r)   r*   r4   r+   r5   r6   r8   r8   M   s    & )-D(5$$
%,*.FHU&&'.GKOXeD):):$;U$BCDK8<M8E%"3"345<59Ju001297;%"3"34;7;%"3"34;r5   r8   c                      ^  \ rS rSrSS\S\S\4U 4S jjjrS rS\	R                  S\	R                  4S	 jrS
 rSrU =r$ )Gemma3nRMSNormq   dimeps
with_scalec                   > [         TU ]  5         X l        X0l        U R                  (       a0  [        R
                  " [        R                  " U5      5      U l        g U R                  S[        R                  " S5      SS9  g )Nweight      ?F
persistent)super__init__rE   rF   nn	Parameterr1   onesrH   register_buffertensor)selfrD   rE   rF   	__class__s       r6   rM   Gemma3nRMSNorm.__init__r   sU    $??,,uzz#7DK  5<<+< Or5   c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )Nr   T)keepdim)r1   sqrtpowmeanrE   )rS   xs     r6   _normGemma3nRMSNorm._norm|   s4    5::aeeAhmmBm=HIIIr5   r\   returnc                     U R                  UR                  5       5      U R                  R                  5       -  nUR                  U5      $ N)r]   floatrH   type_as)rS   r\   outputs      r6   forwardGemma3nRMSNorm.forward   s9     AGGI&):):)<<~~a  r5   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r@   rH   shaperE   rS   s    r6   
extra_reprGemma3nRMSNorm.extra_repr   s'    ))*+6$((<<r5   )rE   rH   rF   )gư>T)r,   r-   r.   r/   intrb   boolrM   r]   r1   Tensorre   rj   r4   __classcell__rT   s   @r6   rB   rB   q   sR    PC Pe P P PJ! !%,, != =r5   rB   c                   &  ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S\R                  S	\
S
\
S\
S\
S\
S\
S\R                  4S jrS\R                  S\R                  S\R                  4S jrSrU =r$ )%Gemma3nAudioRelativePositionEmbedding   configc                 L  > [         TU ]  5         Xl        U R                  R                  U l        U R                  R
                  U l        U R                  U R                  -  U l        [        SU R                  R                  S-
  5      U l
        U R                  R                  U l        [        R                  " U R                  U R                  U R                  -  SS9U l        SnSnU R                  S-  n[         R"                  " [%        U5      [%        U5      -  5      [        US-
  S5      -  nU[&        R(                  " [&        R*                  " U5      U* -  5      -  nU R-                  SUR%                  5       R/                  S5      R/                  S5      SS	9  g )
Nr   r    FbiasrI   g     @r   inv_timescalesrJ   )rL   rM   rt   conf_num_attention_heads	num_headshidden_sizechannelshead_dimmaxconf_attention_context_leftmax_backwardconf_attention_context_rightmax_forwardrN   Linearpos_projmathlogrb   r1   exparangerQ   	unsqueeze)rS   rt   min_timescalemax_timescalenum_timescaleslog_timescale_incrementrx   rT   s          r6   rM   .Gemma3nAudioRelativePositionEmbedding.__init__   sJ   ==//74;;#J#JQ#NO;;CC		$--$--1OV[\!+"&((5+?%BV+V"WZ]^lop^prsZt"t&5<<3OSjRj3j)kk  ",,Q/99!< 	 	
r5   positiondtyper_   c                 H   UR                  5       R                  S5      nXR                  R                  UR                  [
        R                  S9-  n[
        R                  " [
        R                  " U5      [
        R                  " U5      /SS9nUR                  U5      $ )NrW   )devicer   rD   )rb   r   rx   tor   r1   float32catsincostype)rS   r   r   scaled_timetiming_signals        r6   _get_timing_signal_1d_pos?Gemma3nAudioRelativePositionEmbedding._get_timing_signal_1d_pos   s{    >>#--b1!4!4!7!7xV[VcVc!7!dd		599[#9599[;Q"RXZ[!!%((r5   term_bd_before_shift
batch_sizerz   num_query_blocksquery_block_sizekey_context_sizemax_span_plus_1c                     US-   U-
  nSU4n	[         R                  R                  X5      n
U
R                  UUUXVS-   -  45      nUSS2SS2SS2SXV-  24   nUR                  UUUUU45      nU$ )a"  Performs the relative shift.

Args:
  term_bd_before_shift: Tensor of shape [B, N, U, W, F_span]. batch_size
    (B), num_heads (N), num_query_blocks (U), query_block_size (W),
    key_context_size (C = W+L+R), max_span_plus_1 (F_span = L+R+1).

Returns:
  Tensor of shape [B, N, U, W, C].
r    r   N)rN   
functionalpadreshape)rS   r   r   rz   r   r   r   r   pad_amount_last_dimpadding_tupleterm_bd_paddedterm_bd_reshapedterm_bd_slicedterm_bd_shifteds                 r6   _relative_shift5Gemma3nAudioRelativePositionEmbedding._relative_shift   s    4  0!3F /0**+?O
 *11  q$89	
 *!Q3X5E5X3X*XY )00   
 r5   querieskeysc           	      >   UR                   u  p4pVnUR                   u    p  n[        R                  " U R                  U R                  * S-
  SUR
                  S9R                  S5      n
U
R                   S   nU R                  XR                  S9nU R                  U5      nUR                  SXR                  U R                  5      R                  S5      nUR                  SSSSS5      nUR                  SSSSS5      n[        R                  " UU5      nUR                  SSSSS5      nUR                  SSS5      nUR                  X6XE-  U5      n[        R                  " UU5      nUR                  UUUUU5      nU R!                  UUUUUU	U5      nUU-   $ )	Nr    rW   r   r   r   r   r      )rh   r1   r   r   r   r   r   r   r   r   r   rz   r}   squeezepermutematmulr   )rS   r   r   r   r   r   rz   r}   _r   pos_indicesr   sin_emb_timing_signalprojected_sin_embsin_emb	queries_pkeys_p_tterm_ac
q_permuted
s_permuted
q_reshapedterm_bd_unshifed_matmulterm_bd_unshifedr   s                           r6   re   -Gemma3nAudioRelativePositionEmbedding.forward   s    OVmmK
&68'+zz$11 ll4#4#4t7G7G6G!6KRX_XfXfgqq
 &++A. $ > >}} !? !

 !MM*?@#++APTP]P]^ff
 OOAq!Q2	<<1aA.,,y(3 __Q1a3
 __Q1-
  ''
?O?bdlm

 #(,,z:"F 3::
 ..
 ((r5   )r|   rt   r}   r   r   rz   r   )r,   r-   r.   r/   r!   rM   r1   rn   r   r   rl   r   re   r4   ro   rp   s   @r6   rr   rr      s    
1 
.)%,, )u{{ )W\WcWc );#ll; ; 	;
 ; ; ; ; 
;zL)u|| L)5<< L)ELL L) L)r5   rr   c                   8  ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	S	\R                  S\R                  4S
 jr
S	\R                  S\R                  4S jrS	\R                  S\R                  S\R                  4S jrSrU =r$ )Gemma3nAudioAttentioni6  rt   c                   > [         TU ]  5         Xl        U R                  R                  U l        U R                  R
                  U l        U R
                  U R                  -  U l        U R                  R                  U l        U R                  R                  U l
        [        SU R                  R                  S-
  5      U l        U R                  R                  U l        U R                  U R                  -   U R                  -   U l        [#        U5      U l        [&        R(                  " [*        R,                  " U R                  45      5      U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        [&        R0                  " U R
                  U R                  U R                  -  SS9U l        U R                  S-  nS[*        R&                  R8                  R;                  [*        R<                  " S5      5      -  nU R?                  SX#-  RA                  5       RC                  5       SS	9  [*        RD                  " [*        RF                  " U R                   U R                  4[*        RH                  S
9SS9RJ                  n[*        RD                  " [*        RF                  " U R                  U R                   4[*        RH                  S
9U R                  U R                  -   S9n[*        RF                  " U R                  U R                   4[*        RH                  S
9nXd-  U-  nU R?                  SUSS	9  U R?                  S[*        R<                  " U R                  5      RM                  5       SS	9  g )Nr   r    Frv         rI           q_scalerJ   r   )diagonallocal_causal_valid_masksoftcap)'rL   rM   rt   ry   rz   r{   r}   conf_attention_chunk_size
chunk_sizer   max_future_horizonr~   r   max_past_horizonconf_attention_logit_capattention_logits_soft_capcontext_sizerr   relative_position_embeddingrN   rO   r1   zerosper_dim_scaler   q_projk_projv_projr   softplusrR   rQ   clonedetachtrilrP   rm   Trb   )rS   rt   r   r_softplus_0lower_causal_maskupper_causal_maskr   rT   s          r6   rM   Gemma3nAudioAttention.__init__7  s   ==;;22((DNN:++??"&++"J"J #At{{'N'NQR'R S)-)M)M& OOd.C.CCdF]F]]+PQW+X(\\%++t}}6F*GHii 0 0$..4==2PW\]ii 0 0$..4==2PW\]ii 0 0$..4==2PW\]--%UXX0099%,,s:KLLY)?(F(F(H(O(O(Q^cd!JJJJ))4??;5::N
 ! 	 "JJJJ):):;5::N**T-D-DD
 #(**doot?P?P-QY^YcYc"d"9"MPa"a68O\abLL778>>@ 	 	
r5   r\   pad_left	pad_rightr_   c                     UR                   tpEnUR                  XB/UQ75      nUR                  XC/UQ75      n[        R                  " XqU/SS9nU$ )Nr    r   )rh   	new_zerosr1   r   )	rS   r\   r   r   batchr   
tail_shapeleftrights	            r6   	_pad_dim1Gemma3nAudioAttention._pad_dim1b  sV     !:{{E9j9:U;
;<IIt&A.r5   r=   c                 "   UR                   nUSS u  p4X@R                  -   S-
  U R                  -  nXPR                  -  U-
  =nS:  a  U R                  USU5      nX5U R                  4USS -   nUR                  U5      R	                  5       nU$ )a  Turns a sequence to non overlapping blocks.

Args:
    hidden_states: a tensor of [batch, time, ...].

Returns:
    A tensor of [batch, num_blocks, block_size, ...], with necessary
    paddings,
    where output[:, i, ...] are x[:, i*block_size:(i+1)*block_size, ...].
Nr   r    r   )rh   r   r   r   
contiguous)rS   r=   rh   bt
num_blockspadding_lenpermute_dimss           r6   _convert_to_block'Gemma3nAudioAttention._convert_to_blocki  s     ##Ray//)A-$//A
%7!;;Kq@ NN=![IMt7%)C%--l;FFHr5   c                 R   U R                   nU R                  U R                  -   S-
  nU R                  XU5      nU R                  nU R                  nUR                  SXES9nUR                  S:  a&  UR                  S:  a  [        R                  " USSS9nUR                  5       $ )a  Extracts temporal context for every block.

Args:
    hidden_states: a tensor of [batch, time, ...].

Returns:
    A tensor of [batch, num_blocks, context_size, ...], with necessary
    paddings,
    where context_size = block_size + left_context + right_context,
    and output[:, i, ...] are x[:, start-left_context:end+right_context,
    ...],
    start = i * block_size, end = (i + 1) * block_size.
r    )	dimensionsizestepr   r   rW   )sourcedestination)
r   r   r   r   r   unfoldndimr1   movedimr   )rS   r=   r   r   	frame_len
frame_step
x_unfoldeds          r6   _extract_block_context,Gemma3nAudioAttention._extract_block_context  s     (( ++doo=A	}	J%%	__
 #))AI)W
 !joo&9 z"!LJ$$&&r5   maskc                    / UR                   S S QU R                  PU R                  P7nU R                  U5      R	                  U5      R                  5       nU R                  U5      R	                  U5      R                  5       nU R                  U5      R	                  U5      R                  5       n[        R                  R                  R                  U R                  5      nSSSU R                  4nUR                  U5      n	X@R                  -  U	-  nUR                   S S u  pU R                  U5      nU R!                  U5      nU R!                  U5      nUR                   S   nU) nU R!                  U5      nUR"                  S:X  aI  UR                   S   UR                   S   -  U R$                  :X  a  UR	                  XU R$                  5      nUR                   U
UU R$                  4:w  a,  ['        SUR                    SU
 SU SU R$                   S	3	5      eUR)                  S5      R)                  S
5      nU R*                  R)                  S5      R)                  S5      R)                  S5      n[        R,                  " UUR/                  UR0                  5      5      nU R3                  X5      nU R4                  R/                  UR0                  5      nUU-  n[        R6                  " U5      nUU-  n[        R8                  " UU[        R:                  " UR<                  5      R>                  5      n[        R                  R                  RA                  US[        RB                  S9R/                  UR<                  S9nUR                   u  nnnnnUR                   S   nURE                  SSSSS5      R	                  SUU5      nURE                  SSSSS5      R	                  SUU5      n[        RF                  " UU5      n U R	                  UUUUU5      RE                  SSSSS5      n!U!R	                  U
XRH                  -  U R                  U R                  45      n!U!S S 2S U24   n!U!$ )NrW   r    r   r   r   z%Shape of extracted_valid_mask_blocks z	 is not (z, z) after potential reshape.r   rD   r   r   )%rh   rz   r}   r   r   r   r   r   r1   rN   r   r   r   viewr   r   r  r   r   
ValueErrorr   r   logical_andr   r   r   r   tanhwherefinfor   minsoftmaxr   r   bmmr   )"rS   r=   r  	qkv_shapequery_states
key_statesvalue_statesper_dim_scale_spbroadcast_shapeper_dim_scale_sp_broadcastr   q_timequery_blocks
key_blocksvalue_blocksr   original_valid_maskextracted_valid_mask_blockscondition_from_input_validitycondition_from_causalityfinal_condition_for_wherer;   softcap_valprobabilitiesb_dimn_dimu_dimw_dimc_dimh_dimprob_bunv_bun
result_bmmcontext_vectorss"                                     r6   re   Gemma3nAudioAttention.forward  sI   Nm))#2.NNN	{{=199)DOOQ[[/77	BMMO
{{=199)DOOQ 88..778J8JKaDMM2%5%:%:?%K"#ll25OO)//3
--l;00<
22<@'--a0  $e '+&A&ABU&V# (,,1+11!47R7X7XYZ7[[_c_p_pp*E*M*Md.?.?+' ',,1
 

 /556i
| L$%R(9(9'::TV  )D(M(Ma(P(Z(Z[](^% $(#?#?#I#I!#L#V#VWX#Y#c#cde#f 
 %*$5$5)$''(E(L(LM%
! 11,K lloofmm4+%F#+% 6FLL@Y@]@]^++33F%--3X[[bnbtbt[u -:,?,?)ueUE""2& ((Aq!Q7??E5Q$$Q1a3;;BuMYYx/
$,,UE5%OWWXY[\^_abdef)11 ??2	
 *!WfW*5r5   )r   r   rt   r   r}   r{   r   r   r   rz   r   r   r   r   )r,   r-   r.   r/   r!   rM   r1   rn   rl   r   r   r  
BoolTensorre   r4   ro   rp   s   @r6   r   r   6  s    )
1 )
V5<< 3 3 5<< u||  ,.'ELL .'U\\ .'`dU\\ d9I9I dell d dr5   r   c                      ^  \ rS rSrSr SS\S\\   S\4U 4S jjjrS\	R                  S\	R                  4S	 jrS
rU =r$ )Gemma3nAudioCumulativeGroupNormi  a  Applies Group Normalization cumulatively over the time dimension.

This layer normalizes the input by calculating the mean and variance
cumulatively over the time dimension (dim 1). The statistics are computed
over all feature dimensions (specified by `feature_dims` and `num_channels`)
for elements marked as valid by the optional `mask`.

If a `mask` is provided (True for valid, False for invalid/padded),
invalid time steps do not contribute to the statistics calculation, and
their corresponding output values are zeroed out.

Scale and bias, if enabled, are applied per-channel (last dimension).
This behavior is similar to JAX's `GroupNormalization` with `num_groups=1`
and `cumulative=True`.
num_channelsfeature_dimsrE   c           	        > [         TU ]  5         Xl        [        U5      U l        X0l        [        R                  " [        R                  " U5      5      U l
        [        [        SS[        U R                  5      -   S-   5      5      U l        g )Nr   r    )rL   rM   r4  r@   r5  rE   rN   rO   r1   rP   rH   rangelenreduction_axes)rS   r4  r5  rE   rT   s       r6   rM   (Gemma3nAudioCumulativeGroupNorm.__init__'  sn     	(!,/ ll5::l#;< $E!QT5F5F1G-G!-K$LMr5   r=   r_   c                    U R                   U R                  4-   nUR                  SS U:w  a  [        SUR                  SS  SU 35      eUR                  n[
        R                  nUR                  U5      n[
        R                  " XTS9n[
        R                  " XPR                  SS9n[
        R                  " USS	9n[
        R                  " X`R                  SS9n	[
        R                  " U	SS	9n
[
        R                  " U
S
S9nX-  nX\-
  R                  S5      n[
        R                  " XR                  SS9n[
        R                  " USS	9nX-  nX\-
  [
        R                  " UU R                  -   5      -  nU R                   R                  U5      nS/UR#                  5       S-
  -  U R                  /-   nUUR%                  U5      -  nUU-  nUR                  U5      $ )zApplies cumulative group norm, optionally using a mask.

Args:
  hidden_states: Input tensor, shape [B, T, *feature_dims, C].

Returns:
  Normalized tensor with the same shape as x.
r   NzInput tensor shape suffix z> does not match expected suffix (feature_dims + num_channels) r   TrD   rX   r    r   rI   )r  )r5  r4  rh   r  r   r1   r   r   	ones_likesumr9  cumsumclamprZ   rsqrtrE   rH   rD   r  )rS   r=   expected_input_suffixinput_dtype
calc_dtypex_calc	mask_calcsum_values_at_tcum_sum_valueselements_in_group_at_tcum_count_elementssafe_cum_count_elementscum_meansquared_diff_from_meansum_sq_diff_at_tcum_sum_sq_diffcum_variancenormalized_xscalescale_view_shapefinal_outputs                        r6   re   'Gemma3nAudioCumulativeGroupNorm.forward9  s    !% 1 1T5F5F4H Hqr"&;;,]-@-@-D,E F99N8OQ 
 $))]]
!!*- OOF=	  ))F0C0CTRo1= "'9:M:MW[!\"\\*@aH"'++.@c"J ";
 #)"3!8!8!; 99%;ATAT^bc  ,,'7Q? '@ )U[[9P-QQ z*3-"3"3"5"9:d>O>O=PP#ejj1A&BB $i/{++r5   )rE   r5  r4  r9  rH   )gMbP?)r,   r-   r.   r/   r0   rl   r   rb   rM   r1   rn   re   r4   ro   rp   s   @r6   r3  r3    s`    ( 	NN smN 	N N$G,U\\ G,ell G, G,r5   r3  c                      ^  \ rS rSrSr SS\S\S\S\\\\\4   4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )Gemma3nAudioSSCPConvBlocki  zA single convolution block for the SubSampleConvProjection.

This block consists of a 2D convolution, followed by CumulativeGroupNorm,
and a ReLU activation. It handles manual padding for the convolution.
rt   idxinput_freq_dimmanual_paddingc           	      6  > [         TU ]  5         Xl        X@l        US:X  a  SOU R                  R                  US-
     nU R                  R                  U   nU R                  R
                  U   u  pxU R                  R                  U   u  p[        R                  " UUUU4X4SSS9U l	        X0R                  S   -   U R                  S   -   nX-
  U
-  S-   n[        UU4U R                  R                  S9U l        [        R                  " 5       U l        g )Nr   r    )r   r   F)in_channelsout_channelskernel_sizestridepaddingrw   )r4  r5  rE   )rL   rM   rt   rZ  sscp_conv_channel_sizesscp_conv_kernel_sizesscp_conv_stride_sizerN   Conv2dconvr3  sscp_conv_group_norm_epsnormReLU
activation)rS   rt   rX  rY  rZ  r\  r]  kernel_hkernel_wstride_hstride_wf_in_padded
f_out_convrT   s                r6   rM   "Gemma3nAudioSSCPConvBlock.__init__  s    	, !8a)K)KCRSG)T{{99#>![[>>sC![[>>sCII#% '

	 %':':1'==@S@STU@VV!,9A=
3%$44
	 '')r5   audio_encodingsr_   c                 0   [         R                  " XR                  SSS9nU R                  U5      nUR	                  SSSS5      R                  5       nU R                  U5      nUR	                  SSSS5      R                  5       nU R                  U5      $ )Nconstantr   )modevaluer   r   r   r    )Fr   rZ  re  r   r   rg  ri  )rS   rq  audio_encodings_paddedaudio_encodings_conv
x_for_normx_normedaudio_encodings_normeds          r6   re   !Gemma3nAudioSSCPConvBlock.forward  s     "#8K8KR\dg!h  $yy)?@ *11!Q1=HHJ
99Z(!)!1!1!Q1!=!H!H!J566r5   )ri  rt   re  rZ  rg  ))r   r   r   r   )r,   r-   r.   r/   r0   r!   rl   r@   rM   r1   rn   re   r4   ro   rp   s   @r6   rW  rW    sm     5A)$")$ )$ 	)$
 c3S01)$ )$V7u|| 7 7 7r5   rW  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )#Gemma3nAudioSubSampleConvProjectioni  rt   c                 Z  > [         TU ]  5         Xl        UR                  n/ n/ n[	        S5       Hk  nUR
                  U   u  pgUR                  U   u  pSn
US-
  nSnSnUUU
U4nUR                  U5        X,-   U-   nX-
  U	-  S-   nUR                  U5        UnMm     [        SUR                  UUS   S9U l	        [        SUS   UUS   S9U l
        UR                  S   nUS   nUU-  U l        [        R                  " U R                  U R                  R                  SS9U l        g )Nr   r   r    )rX  rY  rt   rZ  rW   Frv   )rL   rM   rt   input_feat_sizer7  rb  rc  appendrW  conv_0conv_1ra  input_proj_in_featuresrN   r   r{   input_proj_linear)rS   rt   current_f_for_block_inputcalculated_block_paddingcalculated_f_out_dimsirj  rk  rl  rm  	pad_t_toppad_t_bottom
pad_f_leftpad_f_rightmanual_padding_tuplern  f_out_after_convfinal_c_outfinal_f_outrT   s                      r6   rM   ,Gemma3nAudioSubSampleConvProjection.__init__  sr   $*$:$:!#%  "qA!'!=!=a!@H!'!=!=a!@H I#a<L JK 	$  %++,@A 4@;NK + 68CaG!(()9:(8%= @ 0!113A6	
 0033A6	
 33B7+B/&1K&?#!#4+F+FH_H_fk!lr5   rq  r_   c                    UR                  S5      nU R                  U5      nU R                  U5      nUR                  u  pEpgUR	                  SSSS5      R                  5       nUR                  XFXu-  5      n	U R                  U	5      n
U
$ )Nr    r   r   r   )r   r  r  rh   r   r   r  r  )rS   rq  audio_encodings_reshapedr\   r   c_outt_outf_out
x_permutedoutput_flattenedrd   s              r6   re   +Gemma3nAudioSubSampleConvProjection.forward   s     $3#<#<Q#? KK01KKN!"%YYq!Q*557
%??1U]C''(89r5   )rt   r  r  r  r  r,   r-   r.   r/   r!   rM   r1   rn   re   r4   ro   rp   s   @r6   r~  r~    s3    7m1 7mru||   r5   r~  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nAudioConformerAttentioni  rt   c                   > [         TU ]  5         Xl        U R                  R                  U l        U R                  S[        R                  " U R                  R                  5      SS9  [        U R                  R                  5      U l
        [        U5      U l        [        R                  " U R                  U R                  R                  SS9U l        [        U R                  R                  5      U l        g )Ngradient_clippingFrJ   rv   )rL   rM   rt   r{   post_in_featuresrQ   r1   rR   r  rB   pre_attn_normr   attnrN   r   post	post_normrS   rt   rT   s     r6   rM   'Gemma3nAudioConformerAttention.__init__  s     $ 7 70%,,t{{?\?\2]jop+DKK,C,CD)&1	IId33T[[5L5LSXY	'(?(?@r5   rq  audio_mel_maskr_   c                    Un[         R                  " XR                  * U R                  5      nU R                  U5      nU R	                  XB5      nUR
                  u  pgpUR                  XgX-  5      n
U R                  U
5      n[         R                  " XR                  * U R                  5      nX0R                  U5      -   $ ra   )	r1   r@  r  r  r  rh   r   r  r  )rS   rq  r  audio_encodings_input_to_attnaudio_encodings_normaudio_encodings_attn_outr   r   rz   r}   r  s              r6   re   &Gemma3nAudioConformerAttention.forward  s    (7%++o8N8N7NPTPfPfg#11/B#'99-A#R  %=$B$B!i#;#C#CA)J^#_ ))$<=++o8N8N7NPTPfPfg,~~o/NNNr5   )r  rt   r  r  r  r  r,   r-   r.   r/   r!   rM   r1   rn   r1  re   r4   ro   rp   s   @r6   r  r    sG    A1 AOu|| OUEUEU OZ_ZfZf O Or5   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ ) Gemma3nAudioConformerFeedForwardi+  rt   c                   > [         TU ]  5         Xl        U R                  S[        R
                  " U R                  R                  5      SS9  [        U R                  R                  5      U l	        [        R                  " U R                  R                  U R                  R                  S-  SS9U l        [        R                  " U R                  R                  S-  U R                  R                  SS9U l        [        U R                  R                  5      U l        [        R
                  " U R                  R                  5      U l        g )Nr  FrJ   r   rv   )rL   rM   rt   rQ   r1   rR   r  rB   r{   pre_layer_normrN   r   ffw_layer_1ffw_layer_2post_layer_normconf_residual_weightpost_layer_scaler  s     r6   rM   )Gemma3nAudioConformerFeedForward.__init__,  s    0%,,t{{?\?\2]jop,T[[-D-DE99T[[%<%<dkk>U>UXY>Y`ef99T[[%<%<q%@$++BYBY`ef-dkk.E.EF %T[[-M-M Nr5   rq  r_   c                    Un[         R                  " XR                  * U R                  5      nU R                  U5      nU R	                  U5      n[
        R                  R                  U5      nU R                  U5      n[         R                  " XR                  * U R                  5      nU R                  U5      nX!U R                  -  -   $ ra   )r1   r@  r  r  r  rN   r   silur  r  r  )rS   rq  residuals      r6   re   (Gemma3nAudioConformerFeedForward.forward8  s    "++o8N8N7NPTPfPfg--o>(,(8(8(I--,,_=(,(8(8(I++o8N8N7NPTPfPfg..?T-B-BBCCr5   )rt   r  r  r  r  r  r  rp   s   @r6   r  r  +  s6    
O1 
O	Du|| 	D 	D 	Dr5   r  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ ) Gemma3nAudioConformerLightConv1diD  rt   c           
        > [         TU ]  5         Xl        [        U R                  R                  U R                  R
                  S9U l        [        R                  " U R                  R                  U R                  R                  S-  SS9U l	        [        R                  " U R                  R                  U R                  R                  U R                  R                  SSU R                  R                  SS9U l        U R                  S[        R                  " U R                  R                   5      SS	9  [        U R                  R                  U R                  R
                  S9U l        [        R                  " U R                  R                  U R                  R                  SS9U l        U R                  R                  S-
  U l        g )
NrE   r   Frv   r    r   )r\  r]  r^  r_  r`  groupsrw   r  rJ   )rL   rM   rt   rB   r{   rms_norm_epsr  rN   r   linear_startConv1dconf_conv_kernel_sizedepthwise_conv1drQ   r1   rR   r  	conv_norm
linear_endcausal_paddingr  s     r6   rM   )Gemma3nAudioConformerLightConv1d.__init__E  sB   ,T[[-D-D$++JbJbcIIdkk&=&=t{{?V?VYZ?Zafg "		//0099;;**!
 	0%,,t{{?\?\2]jop'(?(?T[[E]E]^))DKK$;$;T[[=T=T[`a"kk??!Cr5   rq  r_   c                 2   UnU R                  U5      nU R                  U5      n[        R                  R                  R                  USS9nUR                  SSS5      n[        R                  " X0R                  S45      nU R                  U5      nUR                  SSS5      n[        R                  " XR                  * U R                  5      nU R                  U5      n[        R                  R                  U5      nU R                  U5      nX-   nU$ )NrW   r   r   r   r    )r  r  r1   rN   r   glur   rv  r   r  r  r@  r  r  r  r  )rS   rq  audio_encodings_residualaudio_encodings_permutedaudio_encodings_permuted_paddedrd   s         r6   re   (Gemma3nAudioConformerLightConv1d.forwardZ  s    #2 --o>++O<((--11/r1J#2#:#:1a#C *+%%0HK^K^`aJb*c'//0OP)11!Q:++o8N8N7NPTPfPfg..9--,,_=///: ;r5   )r  rt   r  r  r  r  r  r  rp   s   @r6   r  r  D  s2    D1 D*u||   r5   r  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nAudioConformerBlockio  rt   c                   > [         TU ]  5         Xl        [        U R                  5      U l        [        U R                  5      U l        [        U R                  5      U l        [        U R                  5      U l	        U R                  S[        R                  " U R                  R                  5      SS9  [        U R                  R                  5      U l        g )Nr  FrJ   )rL   rM   rt   r  ffw_layer_startr  	attentionr  lconv1dffw_layer_endrQ   r1   rR   r  rB   r{   rg  r  s     r6   rM   #Gemma3nAudioConformerBlock.__init__p  s    ?L7D7D=dkkJ0%,,t{{?\?\2]jop"4;;#:#:;	r5   rq  r  r_   c                 f   U R                  U5      nU R                  X5      nU) nXR                  S5      R                  UR                  5      -  nU R                  U5      nU R                  U5      n[        R                  " XR                  * U R                  5      nU R                  U5      nU$ )NrW   )r  r  r   r   r   r  r  r1   r@  r  rg  )rS   rq  r  validity_mask_for_lconvaudio_encodings_for_lconv_inputrd   s         r6   re   "Gemma3nAudioConformerBlock.forward{  s    ..?..I#1/*9<]<]^`<a<d<d!!=
 +
' ,,'FG,,_=++o8N8N7NPTPfPfg?+r5   )r  rt   r  r  r  rg  r  rp   s   @r6   r  r  o  s@    	<1 	<u|| UEUEU Z_ZfZf  r5   r  c                      ^  \ rS rSr% Sr\\S'   SrS\4U 4S jjrS\	R                  S\	R                  S\\	R                  \	R                  4   4S jrS	rU =r$ )
Gemma3nAudioEncoderi  zfAn audio encoder based on the [Universal Speech Model](https://arxiv.org/abs/2303.01037) architecture.rt   	audio_melc                    > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l
        g s  snf ra   )rL   rM   rt   r~  subsample_conv_projectionrN   
ModuleListr7  conf_num_hidden_layersr  	conformer)rS   rt   r   rT   s      r6   rM   Gemma3nAudioEncoder.__init__  sZ     )LV)T&9>v?\?\9]^9]A'/9]^
^s   A/r  r_   c                 0   U R                  U5      nUR                  S   nSn[        [        U R                  R
                  5      5       H!  nXPR                  R
                  U   S   -  nM#     [        R                  " XBR                  S9U-  n[        R                  " XrR                  S   S-
  S9nUR                  S:  a?  UR                  S:X  a/  UR                  S5      R                  UR                  S   S5      nOcUR                  UR                  :X  aI  UR                  S   S:X  a6  UR                  S   S:w  a#  XGR                  S   :X  a  UR                  S5      n[        R                  " USU5      nU R                   H  n	U	" X85      nM     U R                  R                  S:  a@  USS2SSU R                  R                  24   nUSS2SSU R                  R                  24   nUR!                  UR                  S5      S5      nX84$ )ad  Encodes a batch of MELs.

Args:
    audio_mel: a torch.Tensor of shape [batch, num_frames, num_channels,
      mel_bins].

Returns:
    audio_encodings: a torch.Tensor of shape
        `[batch_size, self.config.audio_soft_tokens_per_image,
        self.config.audio_config.hidden_size]`
    audio_mel_mask: a torch.BoolTensor of shape [batch, num_frames].
r    r   r   )r~   rW   Nr   )r  rh   r7  r8  rt   rc  r1   r   r   r@  r   r   expandgatherr  conf_reduction_factormasked_fill)
rS   r  r  rq  t_subtime_stride_productstride_pair_idxindicescurrent_maskblocks
             r6   re   Gemma3nAudioEncoder.forward  s    88C  %%a($S)J)J%KLO;;#D#D_#UVW#XX  M ,,u-B-BCFYY++g+?+?+BQ+FG "w||q'8''*11.2F2Fq2I2NG7<</$$Q'1,a A%q)) ''*G||NAw?^^E#OBO $ ;;,,q0-a1UDKK4U4U1U.UVO'+Odkk.O.O+O(OPL)55l6L6LR6PRUV,,r5   )rt   r  r  )r,   r-   r.   r/   r0   r!   r3   main_input_namerM   r1   rn   r1  r@   re   r4   ro   rp   s   @r6   r  r    s^    p!O
1 
5-5-7<7G7G5-	u||U---	.5- 5-r5   r  c            	       l   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\R                  4U 4S	 jjr
S
rU =r$ )Gemma3nTextScaledWordEmbeddingi  zT
This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
num_embeddingsembedding_dimpadding_idxembed_scalec                 p   > [         TU ]  XU5        U R                  S[        R                  " U5      SS9  g )Nr  FrJ   )rL   rM   rQ   r1   rR   )rS   r  r  r  r  rT   s        r6   rM   'Gemma3nTextScaledWordEmbedding.__init__  s1    D]ELL,ERWXr5   	input_idsc                    > [         TU ]  U5      U R                  R                  U R                  R
                  5      -  $ ra   )rL   re   r  r   rH   r   )rS   r  rT   s     r6   re   &Gemma3nTextScaledWordEmbedding.forward  s2    wy)D,<,<,?,?@Q@Q,RRRr5   r+   )rI   )r,   r-   r.   r/   r0   rl   rb   rM   r1   rn   re   r4   ro   rp   s   @r6   r  r    sM    Ys Y3 YS Y_d Y YS S Sr5   r  c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	Gemma3nTextLaurelBlocki  z Learned Augmented Residual Layerrt   c                   > [         TU ]  5         Xl        [        R                  " U R                  R
                  U R                  R                  SS9U l        [        R                  " U R                  R                  U R                  R
                  SS9U l        [        U R                  R
                  U R                  R                  S9U l        g )NFrv   r  )rL   rM   rt   rN   r   r{   laurel_ranklinear_leftlinear_rightrB   r  post_laurel_normr  s     r6   rM   Gemma3nTextLaurelBlock.__init__  s    99T[[%<%<dkk>U>U\abIIdkk&=&=t{{?V?V]bc .t{{/F/FDKKLdLd er5   r=   r_   c                 p    U R                  U5      nU R                  U5      nU R                  U5      nX-   $ ra   )r  r  r  )rS   r=   laurel_hidden_statesnormed_laurel_hidden_statess       r6   re   Gemma3nTextLaurelBlock.forward  s@    -1-=-=m-L-1->->?S-T&*&;&;<P&Q#::r5   )rt   r  r  r  )r,   r-   r.   r/   r0   r#   rM   r1   rn   re   r4   ro   rp   s   @r6   r  r    s5    *f0 f;U\\ ;ell ; ;r5   r  c                      ^  \ rS rSrSS\S\4U 4S jjjrS\R                  S\R                  4S jr	S\R                  S\R                  4S	 jr
S
rU =r$ )Gemma3nTextMLPi  rt   	layer_idxc                   > [         TU ]  5         Xl        UR                  U l        UR                  U   U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        UR                  U   U l        g NFrv   )rL   rM   rt   r{   intermediate_sizerN   r   	gate_projup_proj	down_projr	   hidden_activationact_fnactivation_sparsity_patternactivation_sparsityrS   rt   r  rT   s      r6   rM   Gemma3nTextMLP.__init__  s    !--!'!9!9)!D4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556#)#E#Ei#P r5   r=   r_   c                     U R                  U5      nU R                  S:  a  U R                  U5      nU R                  U5      nU R	                  U5      nU R                  X4-  5      nU$ )Nr   )r  r  _gaussian_topkr  r  r  )rS   r=   r  activationsr  r  s         r6   re   Gemma3nTextMLP.forward  sa    NN=1	##c)++I6Ikk),,,}-NN;#89	r5   inputsc                    [         R                  " U R                  [         R                  UR                  S9n[         R
                  R                  R                  SS5      nUR                  U5      nUR                  UR                  5      n[         R                  " USSS9n[         R                  " USSSS9nXVU-  -   n[        R                  R                  X-
  5      $ )	Nr   r   r   r    rW   Tr<  F)rD   rX   unbiased)r1   rR   r  r   r   distributionsnormalNormalicdfr   r   r[   stdrN   r   relu)rS   r  target_sparsity_tensornormal_diststd_multiplierinputs_mean
inputs_stdcutoff_xs           r6   r  Gemma3nTextMLP._gaussian_topk  s    !&d.F.Femmdjdqdq!r ))00771='2'7'78N'O',,V\\:jjR>YYv2teL
n!<<}}!!&"344r5   )r  r  rt   r  r  r{   r  r  )r   )r,   r-   r.   r/   r#   rl   rM   r1   rn   re   r  r4   ro   rp   s   @r6   r
  r
    s[    	Q0 	QS 	Q 	QU\\ ell 5U\\ 5ell 5 5r5   r
  c                   n  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	S\R                  S\R                  4S	 jr
S
\R                  S\R                  S\R                  4S jrS\R                  S\R                  4S jrS\R                  S\R                  4S jrSrU =r$ )Gemma3nTextAltUpi  a  Alternating Updates (AltUp)

The AltUp module wraps transformer layers. The `predict` step modifies the
input to the transformer layer, and the `correct` step propagates the output
of the transformer layer to the sparsely updated dimensions.

See more in the research paper:

https://proceedings.neurips.cc/paper_files/paper/2023/file/f2059277ac6ce66e7e5543001afa8bb5-Paper-Conference.pdf
rt   c                 2  > [         TU ]  5         Xl        [        R                  " [
        R                  " U R                  R                  5      5      U l        [        R                  " U R                  R                  U R                  R                  SS9U l        [        R                  " U R                  R                  U R                  R                  S-  SS9U l        [        R                  " U R                  R                  U R                  R                  SS9U l        [        U R                  R                  U R                  R                  S9U l        U R#                  S[
        R$                  " U R                  R                  S-  5      SS9  g )NFrv   r   r  router_input_scaleg      rJ   )rL   rM   rt   rN   rO   r1   r   r{   correct_output_scaler   altup_num_inputscorrection_coefsprediction_coefsmodality_routerrB   r  router_normrQ   rR   r  s     r6   rM   Gemma3nTextAltUp.__init__#  s   $&LLT[[=T=T1U$V! "		$++*F*FHdHdkp q "		$++*F*FHdHdfgHgns t!yy)@)@$++B^B^ejk)$++*A*At{{G_G_`15<<@W@WY]@]3^kpqr5   r\   r_   c                     U R                  U5      U R                  -  nU R                  U5      n[        R                  " UR                  5       5      R                  U5      $ ra   )r6  r0  r5  r1   r  rb   rc   )rS   r\   router_inputsrouteds       r6   compute_router_modalities*Gemma3nTextAltUp.compute_router_modalities-  sM    ((+d.E.EE%%m4zz&,,.)11!44r5   r=   c                    U R                  XR                  R                     5      nU R                  (       ap  U R                  R                  bY  U R
                  R                  R                  R                  U R                  R                  * U R                  R                  5        U R                  U5      R                  " / UR                  SS QU R                  R                  PU R                  R                  P76 R                  SSSS5      n[        R                  " UR                  SSSS5      U5      nUR                  SSSS5      nXA-  nUR                  5       R!                  U5      $ )a  Predicts the output of a layer using a trainable map.

Args:
    hidden_states: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
        stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.

Returns:
    A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` containing the predictions.
NrW   r   r    r   r   )r;  rt   altup_active_idxtrainingaltup_coef_clipr4  rH   dataclamp_r   rh   r2  r   r1   r   r   rc   )rS   r=   
modalities	all_coefspredictionss        r6   predictGemma3nTextAltUp.predict2  s?    33M++B^B^4_`
==T[[88D!!((--44dkk6Q6Q5QSWS^S^SnSno !!*-W i &&s+i-1[[-I-IiKO;;KgKgiWQ1a  	 ll=#8#8Aq!#DiP!))!Q15$%%'//>>r5   rE  	activatedc                 h   U R                  U5      nX!U R                  R                     -
  nUR                  U R                  R                  SSS5      nU R                  R
                  bY  U R                  R                  R                  R                  U R                  R
                  * U R                  R
                  5        U R                  U5      S-   nUR                  SSS5      R                  S5      n[        R                  " XE5      nXa-  nUR                  5       R                  U5      $ )a  Corrects the predictions relative to the

Args:
    predictions: A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` derived by
        stacking the input embeddings and preprocessing the last `num_altup_inputs - 1` matrices.
    activated: A 3D tensor of shape `[batch_size, num_tokens, hidden_size]` containing the activated inputs.

Returns:
    A 4D tensor of shape `[num_altup_inputs, batch_size, num_tokens, hidden_size]` correcting the original
        predictions relative to the activated input embeddings.
r    rI   r   r   rW   )r;  rt   r>  repeatr2  r@  r3  rH   rA  rB  r   r   r1   mulr   rc   )rS   rE  rH  rC  
innovationrD  	correcteds          r6   correctGemma3nTextAltUp.correctN  s     33I>
T[[-I-I!JJ
&&t{{'C'CQ1M
;;&&2!!((--44dkk6Q6Q5QSWS^S^SnSno
 #'"7"7
"Cc"I	%%aA.88<	IIj4	 	##%--i88r5   rM  c                 p    UR                  U R                  5      U R                  -  R                  U5      $ )z
This is only defined as the `forward` so that accelerate hooks can move correctly `correct_output_scale`
(which is a nn.Parameter, not a Module) between devices when offloading. It is otherwise only used in
`scale_corrected_output`
)rc   r1  rS   rM  s     r6   re   Gemma3nTextAltUp.forwardk  s2     !!$";";<t?X?XXaabkllr5   c                 $    U R                  U5      $ )zMScales the provided 3D tensor of shape [batch_size, num_tokens, hidden_size].)re   rQ  s     r6   scale_corrected_output'Gemma3nTextAltUp.scale_corrected_outputs  s    ||I&&r5   )rt   r1  r3  r5  r4  r6  )r,   r-   r.   r/   r0   r#   rM   r1   rn   r;  rF  rN  re   rT  r4   ro   rp   s   @r6   r.  r.    s    	r0 r55<< 5ELL 5
?U\\ ?ell ?895<< 9ELL 9U\\ 9:m m%,, m' ' ' 'r5   r.  c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )Gemma3nTextRotaryEmbeddingix  rt   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typer   defaultinv_freqFrJ   )rL   rM   hasattr
isinstancerY  dictgetrZ  max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrt   r   rope_init_fnattention_scalingrQ   r\  original_inv_freq)rS   rt   r   r\  rT   s       r6   rM   #Gemma3nTextRotaryEmbedding.__init__y  s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r5   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rW   r    mpscpuF)device_typeenabledr   r   r   )r\  rb   r  rh   r   r   r^  r   strr1   autocast	transposer   r   re  r   r   )
rS   r\   position_idsinv_freq_expandedposition_ids_expandedrk  freqsembr   r   s
             r6   re   "Gemma3nTextRotaryEmbedding.forward  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)re  rt   rb  rf  rc  rd  rZ  ra   )r,   r-   r.   r/   r#   rM   r1   no_gradr   re   r4   ro   rp   s   @r6   rW  rW  x  s7    /0 / /" ]]_<  <r5   rW  c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..NrW   r   r   )rh   r1   r   )r\   x1x2s      r6   rotate_halfrz    sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r5   r=   n_repr_   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r    N)rh   r  r   )r=   r{  r   num_key_value_headsslenr}   s         r6   	repeat_kvr    s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr5   modulequerykeyru  attention_maskdropoutscalingr   c                    Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb"  US S 2S S 2S S 2S U	R                  S   24   nX-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R!                  5       nX4$ )	Nr   r   r   r	  rW   r
  )pr?  r    )r}   r  num_key_value_groupsr1   r   ro  r  rh   rN   r   r  r   r   r   r  r?  r   )r  r  r  ru  r  r  r  r   kwargsr  r  attn_weightscausal_maskattn_outputs                 r6   eager_attention_forwardr    s/    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!$Q1.D
0@0@0D.D%DE#1 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r5   r\   r   r   rp  unsqueeze_dimc                 l    UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   $ )ar  Applies Rotary Position Embedding to the query and key tensors.

Args:
    x (`torch.Tensor`): The tensor to embed.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)r   rz  )r\   r   r   rp  r  s        r6   apply_rotary_pos_embr    s6    2 --
&C
--
&CGA,--r5   c                   (  ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\R                  S\
\R                     S	\
\   S
\
\R                     S\\   S\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )Gemma3nTextAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperrt   r  c                   > [         TU ]  5         UR                  U   S:H  U l        Xl        X l        [        USUR                  UR                  -  5      U l	        UR                  UR                  -  U l        U R                  R                  U l        SU l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR                  UR                   S9U l        U R                  (       a  UR*                  OS U l        [-        UR                  UR.                  S9U l        [-        UR                  UR.                  S9U l        [-        UR                  UR.                  SS9U l        U R                  R6                  U R                  R8                  -
  nX#s=:  =(       a    S:  Os  U l        UR                  U   nU R:                  (       a0  US	-
  UR                  US	-
  S S
2   R=                  U5      -
  U l        g S U l        g )Nsliding_attentionr}   Trv   )rD   rE   F)rD   rE   rF   r   r    rW   ) rL   rM   layer_types
is_slidingrt   r  getattrr{   num_attention_headsr}   r}  r  attention_dropout	is_causalrN   r   attention_biasr   r   r   o_projsliding_windowrB   r  q_normk_normv_normnum_hidden_layersnum_kv_shared_layersis_kv_shared_layerindexkv_shared_layer_index)rS   rt   r  first_kv_shared_layer_idx
layer_typerT   s        r6   rM   Gemma3nTextAttention.__init__  sP    ,,Y7;NN"
F4F4F&JdJd4de$*$>$>&B\B\$\!!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 8<f33D$f>Q>QR$f>Q>QR$f>Q>Q^cd$(KK$A$ADKKDdDd$d!"+"L"L1"L''	2
 && &)F,>,>?X[\?\?b`b?b,c,i,ijt,uu 	"  	"r5   r=   position_embeddingsr  past_key_valuecache_positionr  r_   c                 J   UR                   S S n/ UQSPU R                  R                  P7nUu  pU R                  U5      R	                  U5      nU R                  U5      n[        XU
SS9nUR                  SS5      nU R                  (       Ga!  U R                  Gb  UGb  UR                  U R                     nUR                  UR                  R                  5      n[        U[        5      (       a]  UR                   S   UR!                  5       :  a  [#        SUR!                  5       5      nO!UR%                  SUR!                  5       S-
  S9nUR                  S S 2S S 2U4   R                  UR                  5      nUR&                  S S 2S S 2U4   R                  UR                  5      nOU R)                  U5      R	                  U5      nU R+                  U5      n[        XU
SS9nUR                  SS5      nU R-                  U5      R	                  U5      nU R/                  U5      nUR                  SS5      nUb0  U
U	UU R0                  S.nUR3                  XU R4                  U5      u  p[6        nU R                  R8                  S:w  a  [:        U R                  R8                     nU" U UUUU4U R<                  (       a  U R>                  OS	S
U R0                  S.UD6u  nnUR@                  " / UQSP76 RC                  5       nU RE                  U5      nUU4$ )NrW   r   )r  r    r   )r  r~   )r   r   r  r  eagerr   rI   )r  r  r  )#rh   rt   r}   r   r  r  r  ro  r  r  layersr   r   r   r^  r   get_max_cache_shapeslicer@  valuesr   r  r   r  r  updater  r  _attn_implementationr   r?  r  r   r   r  )rS   r=   r  r  r  r  r  input_shapehidden_shaper   r   r  layerr  r  r  cache_kwargsattention_interfacer  r  s                       r6   re   Gemma3nTextAttention.forward  s    $))#2.??b?$++*>*>?&{{=166|D{{<0+LsRST#--a3"""t'A'A'MR`Rl"))$*D*DEE$''

(9(9:G%!344!''*U-F-F-HH#Au'@'@'BCG%mmu7P7P7RUV7VmWG Aq'M255l6I6IJJ <<1g699,:M:MNL]388FJZ0J-jsRSTJ#--a3J;;}5::<HL;;|4L'11!Q7L% "0"&"5"5	L (6'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
 /3mmD**..
%
 
%
!\ "));;;;FFHkk+.L((r5   )r  rt   r}   r  r  r  r  r   r  r  r  r  r  r   r  r  r   NN)r,   r-   r.   r/   r0   r#   rl   rM   r1   rn   r   r
   
LongTensorr   r   r@   re   r4   ro   rp   s   @r6   r  r    s    G$
0 $
S $
V +/59H)||H) #\\H) !.	H)
 !H) !!1!12H) -.H) 
u||Xell3XeELL>Q5RR	SH) H)r5   r  c                     ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\R                  S\R                  S\R                  S	\	\R                     S
\	\R                     S\	\   S\	\   S\	\   S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )Gemma3nTextDecoderLayerib  rt   r  c                 l  > [         TU ]  5         Xl        UR                  U l        X l        UR
                  U   U l        [        X5      U l        [        XS9U l
        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        UR"                  U l        [$        UR&                     U l        [+        U5      U l        [/        U5      U l        [2        R4                  " U R                  U R"                  SS9U l        [2        R4                  " U R"                  U R                  SS9U l        [        U R                  UR                  S9U l        g )N)r  r  Frv   )rL   rM   rt   r{   r  r  attention_typer  	self_attnr
  mlprB   r  input_layernormpost_attention_layernormpre_feedforward_layernormpost_feedforward_layernormhidden_size_per_layer_inputr	   r  r  r.  altupr  laurelrN   r   per_layer_input_gateper_layer_projectionpost_per_layer_input_normr  s      r6   rM    Gemma3nTextDecoderLayer.__init__c  sU   !--"$00;-f@!&>-d.>.>FDWDWX(6t7G7GVM`M`(a%)78H8HfNaNa)b&*89I9IvObOb*c'+1+M+M(V556%f-
,V4$&IId.>.>@`@`gl$m!$&IId.N.NPTP`P`gl$m!)78H8HfNaNa)b&r5   r=   position_embeddings_globalposition_embeddings_localper_layer_inputr  rp  r  output_attentions	use_cacher  r_   c                    U R                   R                  U5      nXR                  R                     nU R	                  U5      nU R                  U5      nU R                  R                  (       a  UnOUnU R                  " SUUUUUUU	U
S.UD6u  nnU R                  U5      nUU-   nUU-   [        R                  " S5      -  nU R                  U5      nU R                  U5      nU R                  U5      nUU-   nU R                   R                  UU5      nUU R                  R                     R                  5       nU R                  R                   (       a  U R                   R#                  U5      nU R%                  U5      nU R'                  U5      n[(        R*                  " UU5      nU R-                  U5      nU R/                  U5      nUSS === U-  sss& U4nU(       a  UU4-  nU$ )N)r=   r  r  rp  r  r  r  r  r   r    r+   )r  rF  rt   r>  r  r  r  r  r  r   rY   r  r  r  rN  r   altup_correct_scalerT  r  r  r1   multiplyr  r  )rS   r=   r  r  r  r  rp  r  r  r  r  r  rE  active_predictionactive_prediction_normedlaurel_outputr  r  self_attn_weights
attn_gatedattn_laurel	attn_normattn_ffwattn_ffw_normattn_ffw_laurel_gatedcorrected_predictionsfirst_predictionoutputss                               r6   re   Gemma3nTextDecoderLayer.forwardy  s    jj((7'(D(DE#'#7#78I#J $<= >>$$";"<"&.. 
#
2 3)%)/)
#
 
#
 ,,T2&-
!M1TYYq\A22;?	88I&77A +m ; $

 2 2;@U V01M1MNTTV;;**#zz@@AQR  445EF;;'78 >>*:OL  445EF99:JKab!%55!(*)++Gr5   )r  r  r  rt   r{   r  r  r  r  r  r  r  r  r  r  r  r  )NNNFFN)r,   r-   r.   r/   r#   rl   rM   r1   rn   r   r  r
   rm   r@   r2   re   r4   ro   rp   s   @r6   r  r  b  s   c0 cS c8 2637*.,1$)59C||C %*LLC $)<<	C
 C !.C u//0C !C $D>C D>C !!1!12C 
u||XeE,=,=u?P?P,P&QRR	SC Cr5   r  c                   f   ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\\S.rU 4S jrS	rU =r$ )
Gemma3nPreTrainedModeli  rt    Tr  r<   )r=   r>   c                   > [         TU ]  U5        [        U[        5      (       a&  UR                  R
                  R                  S5        g [        U[        5      (       a%  UR                  R
                  R                  5         g [        U[        5      (       a%  UR                  R
                  R                  5         g g )NrI   )rL   _init_weightsr^  r3  rH   rA  fill_r   r   zero_r.  r1  )rS   r  rT   s     r6   r  $Gemma3nPreTrainedModel._init_weights  s    f%f=>>MM$$S) 566  %%++- 011'',,224 2r5   r+   )r,   r-   r.   r/   r"   r3   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  r  _can_record_outputsr  r4   ro   rp   s   @r6   r  r    s\    &*#23#4"5N!"&0*
5 5r5   r  zBThe base Gemma 3n language model without a language modeling head.c                     ^  \ rS rSr% \\S'   S\4U 4S jjr\\          SS\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S	\	\
R                     S
\	\   S\	\   S\	\   S\	\
R                     S\\   S\4S jj5       5       rS\
R                  S\
R                  4S jr SS	\
R                  S\	\
R                     S\
R                  4S jjrSrU =r$ )Gemma3nTextModeli  rt   c                   > [         TU ]  U5        UR                  U l        UR                  U l        [        UR                  UR                  U R                  U R                  R                  S-  S9U l        [        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        [%        US9U l        SU l        [*        R,                  " U5      nUR.                  Ul        SS0Ul        [%        US9U l        UR                  U l        UR6                  U l        [        UR8                  UR                  UR6                  -  U R                  UR6                  S-  S9U l        [        R<                  " U R                  UR                  UR6                  -  SS9U l        [        UR6                  UR                   S9U l         [        R                  " [        S	U R                  RB                  5       Vs/ sH-  n[        R<                  " U R                  U R                  SS9PM/     sn5      U l"        [        R                  " [        S	U R                  RB                  5       Vs/ sH-  n[        R<                  " U R                  U R                  SS9PM/     sn5      U l#        U RI                  S
[J        RL                  " U R                  S-  5      SS9  U RI                  S[J        RN                  " [J        RL                  " S5      5      SS9  U RQ                  5         g s  snf s  snf s  snf )N      ?)r  r  rt   FrZ  r[  rv   r    per_layer_projection_scaler   rJ   per_layer_input_scaleg       @))rL   rM   pad_token_idr  
vocab_sizer  r{   rt   embed_tokensrN   r  r7  r  r  r  rB   r  rg  rW  
rotary_embgradient_checkpointingcopydeepcopyrope_local_base_freq
rope_thetarY  rotary_emb_localr  vocab_size_per_layer_inputembed_tokens_per_layerr   per_layer_model_projectionper_layer_projection_normr2  altup_projectionsaltup_unembed_projectionsrQ   r1   rR   rA  	post_init)rS   rt   r  r   rT   s       r6   rM   Gemma3nTextModel.__init__  s    !.. ++ ;v1143C3CQUQ\Q\QhQhjmQm
 mmINvOgOgIhiIhI$V7Ihi
 #6#5#56;N;NO	4FC&+#
 v&"77*I6 :& I!--+1+M+M(&D--$$v'I'II::C?	'
# +-))$$v'I'II+
' *88Z8Z`f`s`s)t&!#PUVWY]YdYdYuYuPvwPv1RYYt'')9)9FPvw"
 *,PUVWY]YdYdYuYuPvwPv1RYYt'')9)9FPvw*
& 	95<<HXHXZ^H^;_lqr4ekk%,,sBS6Tafg 	[ jD x xs    M23M##3M(r  per_layer_inputsr  rp  r<   inputs_embedsr  r  output_hidden_statesr  r  r_   c                 	   Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUSL USL-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUb"  U R                  U5      nU R                  U5      nU R                  Xb5      nU(       a  Uc  U R                  (       d
  [        5       nU
cD  Ub  UR                  5       OSn[        R                  " UXR                   S   -   UR"                  S9n
Uc  U
R%                  S5      n['        U=n[(        5      (       d*  U R                   UUU
UUS.n[+        S0 UD6[-        S0 UD6S	.nUnU R/                  X5      nU R1                  X5      n[        R2                  " US
-  SSS9S-  n[        R4                  " S5      nU/n[7        SU R                   R8                  5       H  nU R:                  US-
     " U5      nUR=                  UR>                  UR"                  S9n[        R2                  " US
-  SSS9n[        R@                  " [        RB                  " UUR=                  UR"                  5      5      5      nUU-  U-  nURE                  U5        M     [        RF                  " USS9nU	(       a  SOSnU(       a  SOSnU RH                  SU R                   RJ                    Hb  nU	(       a  UU4-  nUURL                     nUSS2SS2URN                  SS24   nU" UUUU4UUUUUU
S.UD6nUS   nU(       d  MY  UUS   4-  nMd     U	(       a  UU4-  n[        R2                  " US   S
-  SSS9S-  nUS   /n[7        SU R                   R8                  5       H  nU RP                  US-
     " UU   5      n U R=                  UR>                  UR"                  S9n[        R2                  " US
-  SSS9n[        R@                  " [        RB                  " UUR=                  UR"                  5      5      5      nUU-  U-  nURE                  U5        M     [        RF                  " U5      n[        R2                  " USS9nU RS                  U5      n[U        UUUUS9$ )z
per_layer_inputs (torch.Tensor, *optional*, defaults to None):
    Pre-computed per-layer embeddings. If None, they are derived from input_ids if provided.
N:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r    r   )rt   input_embedsr  r  r<   rp  )full_attentionr  r   rW   Tr<  r  gh㈵>r  r   r+   )r  rp  r  r  r  r  )last_hidden_stater<   r=   r>   )+rt   r  r  r  r  r   r?  loggerwarning_oncer  get_per_layer_inputsproject_per_layer_inputsr   get_seq_lengthr1   r   rh   r   r   r^  r_  r   r   r  r  r[   rR   r7  r2  r
  r   r   rY   maximumr  stackr  r  r  r  r  rg  r   )!rS   r  r  r  rp  r<   r  r  r  r  r  r  past_seen_tokenscausal_mask_mappingmask_kwargshidden_states_0r  r  target_magnitudeepsilon_tensortemp_hidden_statesr  
altup_projcurrent_hidden_statenew_magnituder=   all_hidden_statesall_self_attnsdecoder_layerr  r  layer_outputsaltup_unemb_projs!                                    r6   re   Gemma3nTextModel.forward  s   ( 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M#88C88Y0*nO!CRC^==?de"\\  #6#6q#99$++N )33A6L ?-FF ++ -"0"0#2 ,K #5"C{"C%F%U%U# ( &*___%S"$($9$9/$X! !::oq&8b$OSVVd+-.q$++667A//A6GJ#-==7L7LUeUlUl=#m !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:; 8 $6A> #7BD0d![[)H4;;+H+HIM#!m%55!-m.J.JKK.q!]5L5La/OPO)*)	
  +)."3#- M *!,M  =#3"551 J6  -!11 !::mA&6!&;TRVYY+A./q$++667A-1-K-KAPQE-RS`abSc-d#3#6#6_=R=R[k[r[r#6#s !JJ';Q'>BPTUM!JJu}}]NDUDUVfVmVmDn'opM#7:J#J]#Z %%&:; 8 $67

=a8		-0&+++%	
 	
r5   c                     U R                  U5      R                  " / UR                  QU R                  R                  PU R
                  P76 $ ra   )r  r   rh   rt   r  r  )rS   r  s     r6   r  %Gemma3nTextModel.get_per_layer_inputs  sN    **95== 
__
KK))
 ,,
 	
r5   c                    U R                  U5      nX0R                  R                  UR                  UR                  S9-  nUR
                  " / UR                  S S QU R                  R                  PU R                  P76 nU R                  U5      nUc  U$ UR                  UR                  :w  a   USS U R                  R                  2S S 24   nX2-   U R                  R                  UR                  UR                  S9-  $ )Nr  rW   .)r  r  r   r   r   r   rh   rt   r  r  r	  r  )rS   r  r  r  s       r6   r  )Gemma3nTextModel.project_per_layer_inputs  s&   
 .2-L-L]-[ ? ? B B%%.B.I.I !C !
 	
  4;;  
  "% 
KK)) 
 ,, 

  $==>RS#''%%)9)?)??/5Tt{{7T7T5TVW0WX$74;U;U;X;X%%.B.I.I <Y <
 
 	
r5   )r
  r  r  r  r   r{   r  r  rg  r  r  r	  r  r  r  )
NNNNNNNNNNra   )r,   r-   r.   r/   r#   r3   rM   r   r   r   r1   r  rn   r
   r2   rm   r   r   r   re   r  r  r4   ro   rp   s   @r6   r  r    s   70 7r  15371537+/59$(,0/359T
E,,-T
 #5<<0T
 !.	T

 u//0T
 "%T
   1 12T
 D>T
 $D>T
 'tnT
 !!1!12T
 +,T
 
!T
  T
l
e.>.> 
5<< 
 48
||
 #5<<0
 
	
 
r5   r  z?The base Gemma 3n language model with a language modeling head.c                     ^  \ rS rSr% S/rSS0rSS/S/40r\\S'   Sr	S	S0r
S\4U 4S
 jjrS rS r\\           SS\\R$                     S\\R&                     S\\R$                     S\\   S\\R*                     S\\R$                     S\\   S\\   S\\   S\\R$                     S\\\R&                  4   S\4S jj5       5       rSrU =r$ )Gemma3nForCausalLMi  lm_head.weightlm_headcolwise_repr=   r;   rt   modelzmodel.language_modelc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g r  )
rL   rM   r  r6  r  rN   r   r{   r4  r  r  s     r6   rM   Gemma3nForCausalLM.__init__  sU     %f-
 ++yy!3!3V5F5FUS 	r5   c                     Xl         g ra   r6  rS   decoders     r6   set_decoderGemma3nForCausalLM.set_decoder  s    
r5   c                     U R                   $ ra   r:  ri   s    r6   get_decoderGemma3nForCausalLM.get_decoder  s    zzr5   r  r  rp  r<   r  labelsr  r  r  r  logits_to_keepr_   c                 F   U R                   (       aG  U R                  R                  S:w  a-  [        R	                  SU R                  R                   S35        Ub  UOU R                  R
                  nU	b  U	OU R                  R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  R                  bH  UU R                  R                  -  n[        R                  " U5      nUU R                  R                  -  nSnUb  U R                   " UX`R"                  40 UD6n[%        UUUR&                  UR(                  UR*                  S9$ )a$  
Example:

```python
>>> from transformers import AutoTokenizer, Gemma3nForCausalLM

>>> model = Gemma3nForCausalLM.from_pretrained("google/gemma-2-9b")
>>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")

>>> prompt = "What is your favorite condiment?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is your favorite condiment?"
```r  ziIt is strongly recommended to train Gemma3n models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)	r  r  rp  r<   r  r  r  r  r  )r:   r;   r<   r=   r>   r+   )r?  rt   r  r  r  r  r  r6  r  r^  rl   r  r4  final_logit_softcappingr1   r  loss_functionr  r   r<   r=   r>   )rS   r  r  rp  r<   r  rB  r  r  r  r  rC  r  r  r=   slice_indicesr;   r:   s                     r6   re   Gemma3nForCausalLM.forward  s   F ==T[[==H#{{??@  Aqr 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A;;..:dkkAAAFZZ'FdkkAAAF%%ffooPPD%#33!//))
 	
r5   )r4  r6  r  )NNNNNNNNNNr   )r,   r-   r.   r/   _tied_weights_keys_tp_plan_pp_planr#   r3   r  _checkpoint_conversion_mappingrM   r=  r@  r   r   r   r1   r  rn   r
   r2   rm   r   rl   r   re   r4   ro   rp   s   @r6   r2  r2    sy   *+=)H_-z:;H&<g%F"0   151537+/59-1$(,0/35934K
E,,-K
 !.K
 u//0	K

 "%K
   1 12K
 ))*K
 D>K
 $D>K
 'tnK
 !!1!12K
 c5<</0K
 
 K
  K
r5   r2  c                      ^  \ rS rSrSrS\\\4   S\4U 4S jjr	  SS\
\R                     S\
\R                     S\R                  4S	 jjrS
rU =r$ )Gemma3nMultimodalEmbedderi;  zQEmbeds token ids or soft tokens for multimodal content into language model space.multimodal_configtext_configc                 ^  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R                  5      U l        [        U R                  U R
                  S9U l        [        U R                  U R
                  S9U l        [        R                  " U R                  U R                  SS9U l        [        U R                  U R
                  SS9U l        g )Nr  Frv   )rE   rF   )rL   rM   r{   multimodal_hidden_sizer  rE   vocab_offsetr  text_hidden_sizerN   	Embedding	embeddingrB   hard_embedding_normsoft_embedding_normr   embedding_projectionembedding_post_projection_norm)rS   rO  rP  rT   s      r6   rM   "Gemma3nMultimodalEmbedder.__init__>  s    
 	&7&C&C#$11-::+66 + 7 7doot7R7RS#1$2M2MSWS[S[#\ #1$2M2MSWS[S[#\ $&IId.I.I4K`K`gl$m!.<T=R=RX\X`X`mr.s+r5   r  r  r_   c                     USL USL-  (       a  [        S5      eUb  U R                  U5      nO.U R                  XR                  -
  5      nU R	                  U5      nU R                  U5      nU R                  U5      $ )a  Embeds token ids or soft tokens for multimodal content into language model space.

Args:
    input_ids: A torch.LongTensor containing the token ids to embed. Values should be in the range
        `[vocab_offset, vocab_offset + vocab_size)`.
    inputs_embeds: A torch.Tensor containing the soft tokens to embed.

Returns:
    A torch.Tensor of embeddings with  shape `[batch_size, seq_len, self.config.text_config.hidden_size]`.
Nr  )r  rX  rV  rS  rW  rY  rZ  )rS   r  r  emb_normhard_embemb_norm_projs         r6   re   !Gemma3nMultimodalEmbedder.forwardQ  s     -t";<YZZ$//>H~~i2C2C&CDH//9H11(;22=AAr5   )
rV  rZ  rY  rE   rW  rR  rX  rT  rS  r  r  )r,   r-   r.   r/   r0   r   r!   r$   r#   rM   r   r1   r  rn   re   r4   ro   rp   s   @r6   rN  rN  ;  sw    [t !35H!HIt 't* 1504BE,,-B  -B 
	B Br5   rN  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, and a language model without a
    language modeling head.
    c                    8  ^  \ rS rSr0 rSrS\4U 4S jjrS rS r	S r
S rS	\R                  S
\R                  4S jr    SS\\R                      S\\R"                     S\\R"                     S\\R"                     4S jjr\              S S\\R                      S	\\R"                     S\\R"                     S\\R                     S\\R                     S\\R                      S\\\\R"                     \4      S\\R                      S\\R                      S\\R"                     S\\R                      S\\   S\\   S\\   S
\4S jj5       rS\R                  S\R                  S
\\R                  \R                  4   4S jrSrU =r$ )!Gemma3nModelim  Frt   c                   > [         TU ]  U5        [        R                  " UR                  S9U l        UR                  R                  U l        [        R                  " UR                  S9nX l        U R                  R                  b  U R                  R                  OSU l
        UR                  R                  U l        [        R                  " UR                  5      U l        [        UR                  UR                  5      U l        [        UR                  UR                  5      U l        U R#                  5         g )Nr  rW   )rL   rM   r   from_configvision_configvision_towerrP  r  language_modelrt   r  r  audio_configaudio_towerrN  embed_visionembed_audior  )rS   rt   rg  rT   s      r6   rM   Gemma3nModel.__init__x  s     %119M9MN ,,77"..f6H6HI,8<8P8P8\DKK44bd*0*<*<*W*W'$001D1DE5f6J6JFL^L^_4V5H5H&J\J\]r5   c                 6    U R                   R                  5       $ ra   )rg  get_input_embeddingsri   s    r6   rn  !Gemma3nModel.get_input_embeddings  s    ""7799r5   c                 :    U R                   R                  U5        g ra   )rg  set_input_embeddingsrS   ru  s     r6   rq  !Gemma3nModel.set_input_embeddings  s    007r5   c                     Xl         g ra   rg  r;  s     r6   r=  Gemma3nModel.set_decoder  s    %r5   c                     U R                   $ ra   ru  ri   s    r6   r@  Gemma3nModel.get_decoder  s    """r5   pixel_valuesr_   c                 f   U R                  USSS9R                  nUR                  UR                  S   U R                  R
                  R                  U R                  R                  5      R                  SSS5      nX R                  R
                  R                  S-  -  nU R                  US9$ )	a^  
Projects the last hidden state from the vision model into language model space.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.

Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
FT)ry  
do_poolingreturn_dictr   r   r    r  r  )
rf  r  r   rh   rt   re  r{   vision_soft_tokens_per_imager   rj  )rS   ry  vision_outputss      r6   get_image_featuresGemma3nModel.get_image_features  s     **%%T + 


 	
 (//  #KK%%11KK44
 '!Q
	 	 	++33??DD  ~ >>r5   r  r  image_featuresaudio_featuresc           	         Uc  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nUU R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  R                  S5      nO0XR                  R                  :H  nXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUbR  X%   R                  5       UR                  5       :w  a.  [        SU SUR                  S   UR                  S   -   35      eUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUbR  X&   R                  5       UR                  5       :w  a.  [        SU SUR                  S   UR                  S   -   35      eXV4$ )z
Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
r  rW   z6Image features and image tokens do not match: tokens: z, features r   r    z6Audio features and image tokens do not match: tokens: )rn  r1   rR   rt   image_token_idlongr   allaudio_token_idr>  r   	expand_asr   numelr  rh   )	rS   r  r  r  r  special_image_maskspecial_audio_maskn_image_tokensn_audio_tokenss	            r6   get_placeholder_mask!Gemma3nModel.get_placeholder_mask  sX    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;,,.LL!;!;5::VcVjVjk c"g  "+kk.H.H!H!*kk.H.H!H+//1/99"=GGVYYZgZnZno%-*K*Q*Q*SWeWkWkWm*mHHXXcdrdxdxyzd{  M  S  S  TU  V  eV  dW  X  ,//1/99"=GGVYYZgZnZno%-*K*Q*Q*SWeWkWkWm*mHHXXcdrdxdxyzd{  M  S  S  TU  V  eV  dW  X  "55r5   input_featuresr  input_features_maskrp  r<   token_type_idsr  rB  r  r  r  c                    USL U
SL-  (       a  [        S5      eUb  UOU R                  R                  nUb  UOU R                  R                  nUGb/  U R	                  5       " U5      n
[
        R                  " US:  XR                  :  5      n[
        R                  " UU[
        R                  " U5      5      nU R                  R                  U5      n[
        R                  " XR                  R                  :  XR                  R                  :  5      nU R                  R                  U R                  R                  -   S-
  n[
        R                  " UUU5      R!                  U
R"                  5      nU R                  US9nUR%                  S5      R'                  U
5      n[
        R                  " UUU
5      n
XR                  R                  :  nU R                  R                  U R                  R                  -   S-
  n[
        R                  " UUU5      R!                  U
R"                  5      nU R                  US9nUR%                  S5      R'                  U
5      n[
        R                  " UUU
5      n
OSnUb\  U R)                  U5      nUR!                  U
R"                  U
R*                  5      nU R-                  XUS9u  nnU
R/                  UU5      n
UGb*  UGb&  U R1                  X5) 5      u  n n[
        R2                  " U R                  S-
  //[
        R4                  U R"                  S9n!U R                  U!S9n"[
        R                  " UR%                  S5      U"U 5      n U R6                  u  n#n$n%U R                  R8                  U$-
  n&U"R;                  U#U&U%5      n'[
        R<                  " U U'4SS	9n U R!                  U
R"                  U
R*                  5      n U R-                  XU S
9u  nn(U
R/                  U(U 5      n
U R                  " SSUUUUU
UUUSU	S.UD6n)[?        U)R@                  U(       a  U)RB                  OSU)RD                  U)RF                  Ub  WOSUb  W S9$ SS9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3nForConditionalGeneration

>>> model = Gemma3nForConditionalGeneration.from_pretrained("google/gemma3n2-3b-mix-224")
>>> processor = AutoProcessor.from_pretrained("google/gemma3n2-3b-mix-224")

>>> prompt = "Where is the cat standing?"
>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs,)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Where is the cat standing?\nsnow"
```
Nr  r   r    )r  rW   )r  r  r  r   )r  r  T)r  r  r  rp  r<   r  r  r  r  r|  r  )r  r<   r=   r>   r)   r*   r+   )$r  rt   r  r  rn  r1   r  r  r  
zeros_likerg  r  rj  rS  rk  r  r   r   r   r  r  r   r  masked_scatterget_audio_featuresrR   r  rh   audio_soft_tokens_per_imager  r   r'   r  r<   r=   r>   )*rS   r  ry  r  r  r  rp  r<   r  r  r  rB  r  r  r  	lm_kwargsper_layer_inputs_maskper_layer_inputs_tokensr  vision_maskdummy_vision_token_idvision_input_idsvision_embedsexpanded_vision_mask
audio_maskdummy_audio_token_idaudio_input_idsaudio_embedsexpanded_audio_maskr  r  r   r  audio_padding_toksaudio_padding_embsaudio_batch_sizeaudio_seq_lenaudio_embed_dimextra_padding_tokensextra_padding_featuresr  r  s*                                             r6   re   Gemma3nModel.forward  s|   ^ -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	   557	BM %*$5$5i1niRqRqFq$r!&+kk2GTYTdTdenTo&p##22GGH_`  ++..;;;YIYIYIfIf=fK %)$5$5$B$BTEVEVEaEa$ade$e!${{;	CXY\\]j]q]qr --8H-IM#.#8#8#<#F#F}#U !KK(<m][M #&6&6&C&CCJ#'#3#3#@#@4CSCSC^C^#^ab#b #kk*iAUVYYZgZnZnoO++o+FL","6"6r":"D"D]"S!KK(;\=YM# #!44\BN+..}/C/C]EXEXYN$($=$=~ %> %! *889K^\M %*=*I)-)@)@Qe)f&NJ "'!0C/D.EUZZ`n`u`u!v!%!1!1<N!1!O"[[)=)=b)ACUWefN?M?S?S<m_#';;#J#J]#Z %7%>%>?OQegv%w""YY8N'OUVWN+..}/C/C]EXEXYN$($=$=~ %> %!A! *889K^\M%% 
-)%+'/!5)
 
 *%777@G33d!//))2>2JPT2@2L
 	
 SW
 	
r5   c                 J    U R                  X5      u  p4U R                  US9U4$ )a  
Projects the last hidden state from the audio encoder into language model space.

Args:
    input_features (`torch.FloatTensor]` of shape `(num_images, seq_length, num_features)`):
       The tensors corresponding to the input audio.
    input_features_mask (`torch.FloatTensor]` of shape `(num_images, seq_length)`):
       The attention mask for the input audio.

Returns:
    audio_features (`torch.Tensor`): Audio feature tensor of shape `(num_images, audio_length, embed_dim)`).
r}  )ri  rk  )rS   r  r  audio_outputsr  s        r6   r  Gemma3nModel.get_audio_featuresb  s0     %)$4$4^$Y!m<jHHr5   )ri  rk  rj  rg  r  rf  r  r  )NNNN)NNNNNNNNNNNNNN)r,   r-   r.   r/   rL  accepts_loss_kwargsr"   rM   rn  rq  r=  r@  r1   rn   r  r   r  r2   r  r   r   r?   r
   rm   r8   re   r@   r  r4   ro   rp   s   @r6   rb  rb  m  si    &("} :8&#?u|| ? ?6 15596:6:(6E,,-(6   1 12(6 !!2!23	(6
 !!2!23(6T  15486:156:37KO595959-1$(,0/3I
E,,-I
 u001I
 !!2!23	I

 !.I
 &ell3I
 u//0I
 "%U->->(?(F"GHI
 !!1!12I
 !!1!12I
   1 12I
 ))*I
 D>I
 $D>I
 'tnI
" 
'#I
 I
VI#llIAFI	u||U\\)	*I Ir5   rb  z
    The base Gemma 3n model comprising a vision backbone, an audio backbone, a language model, and a language modeling
    head.
    c            #         ^  \ rS rSr0 rS/rSrS\4U 4S jjrS r	S r
S rS	 rS
 r\S 5       r\S 5       r\S 5       r\\               S"S\\R,                     S\\R.                     S\\R.                     S\\R0                     S\\R0                     S\\R,                     S\\\\R.                     \4      S\\R,                     S\\R,                     S\\R.                     S\\R,                     S\\   S\\   S\\   S\\\R0                  4   S\4 S jj5       5       r            S#U 4S jjr \S  5       r!S!r"U =r#$ )$Gemma3nForConditionalGenerationiu  r3  r6  rt   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g r  )rL   rM   rb  r6  rN   r   rP  r{   r  r4  r  r  s     r6   rM   (Gemma3nForConditionalGeneration.__init__  sS     !&)
yy!3!3!?!?ASASA^A^ejkr5   c                 6    U R                   R                  5       $ ra   )r6  rn  ri   s    r6   rn  4Gemma3nForConditionalGeneration.get_input_embeddings  s    zz..00r5   c                 :    U R                   R                  U5        g ra   )r6  rq  rr  s     r6   rq  4Gemma3nForConditionalGeneration.set_input_embeddings  s    

''.r5   c                 :    U R                   R                  U5        g ra   )r6  r=  r;  s     r6   r=  +Gemma3nForConditionalGeneration.set_decoder  s    

w'r5   c                 6    U R                   R                  5       $ ra   )r6  r@  ri   s    r6   r@  +Gemma3nForConditionalGeneration.get_decoder  s    zz%%''r5   c                 8    U R                   R                  U5      $ ra   )r6  r  )rS   ry  s     r6   r  2Gemma3nForConditionalGeneration.get_image_features  s    zz,,\::r5   c                 .    U R                   R                  $ ra   )r6  rg  ri   s    r6   rg  .Gemma3nForConditionalGeneration.language_model  s    zz(((r5   c                 .    U R                   R                  $ ra   )r6  rf  ri   s    r6   rf  ,Gemma3nForConditionalGeneration.vision_tower  s    zz&&&r5   c                     [        S5      e)Nz2Use embed_vision instead of multi_modal_projector.)AttributeErrorri   s    r6   multi_modal_projector5Gemma3nForConditionalGeneration.multi_modal_projector  s    QRRr5   r  ry  r  r  r  rp  r<   r  r  r  rB  r  r  r  rC  r_   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  " S	UUUUUUUUU	U
UUUUSS.UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                   R                  5       R                  =nb   UU-  n[        R                  " U5      nUU-  nSnUGbQ  UR                  5       nUSSS2SS24   nUSSS24   nUb  USS2UR                  S   * S24   R                  UR                   5      nUUR                  UR                   5      S:g     R#                  5       nUUR                  UR                   5      S:g     R#                  5       nO UR#                  5       nUR#                  5       n[$        R&                  " 5       nUR)                  SU R                   R*                  R,                  5      nUR)                  S5      R                  UR                   5      nU" UU5      n[/        UUUR0                  UR2                  UR4                  UR6                  UR8                  S9$ )
a$  
input_features_mask (torch.Tensor, *optional*, defaults to None):
    The attention mask for the input audio.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
    ignored (masked), the loss is only computed for the tokens with labels in
    `[0, ..., config.text_config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration

>>> model = Gemma3ForConditionalGeneration.from_pretrained("google/gemma-3-4b-it")
>>> processor = AutoProcessor.from_pretrained("google/gemma-3-4b-it")

>>> messages = [
...     {
...         "role": "system",
...         "content": [
...             {"type": "text", "text": "You are a helpful assistant."}
...         ]
...     },
...     {
...         "role": "user", "content": [
...             {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"},
...             {"type": "text", "text": "Where is the cat standing?"},
...         ]
...     },
... ]

>>> inputs = processor.apply_chat_template(
...     messages,
...     tokenizer=True,
...     return_dict=True,
...     return_tensors="pt",
...     add_generation_prompt=True
... )
>>> # Generate
>>> generate_ids = model.generate(**inputs)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"user\nYou are a helpful assistant.\n\n\n\n\n\nWhere is the cat standing?\nmodel\nBased on the image, the cat is standing in a snowy area, likely outdoors. It appears to"
```
NT)r  ry  r  r  r  rp  r<   r  r  r  rB  r  r  r  r|  .rW   r    r   )r:   r;   r<   r=   r>   r)   r*   r+   )rt   r  r  r6  r  r^  rl   r  r4  get_text_configrE  r1   r  rb   rh   r   r   r   rN   CrossEntropyLossr  rP  r  r8   r<   r=   r>   r)   r*   )rS   r  ry  r  r  r  rp  r<   r  r  r  rB  r  r  r  rC  r  r  r=   rG  r;   rE  r:   shift_logitsshift_labelsshift_attention_maskloss_fctflat_logitsflat_labelss                                r6   re   'Gemma3nForConditionalGeneration.forward  s   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 ** 
%)) 3%+))'/!5
  !
&  118B>SV8W8W~ot4]kmA}a,?@A'+{{'B'B'D'\'\\#i55FZZ'F55F\\^F!#ssA+.L!#qr'?L) (6a,:L:LQ:O9O9Q6Q'R'U'UV\VcVc'd$+,@,C,CFMM,RVW,WXcce+,@,C,CLDWDW,X\],]^iik+668+668**,H&++B0G0G0R0RSK&++B/22<3F3FGKK5D,#33!//)) ' ; ; ' ; ;
 	
r5   c                 f   > [         TU ]  " U4UUUUUUUU
S.UD6nUS   S:X  a  XoS'   XS'   XS'   U$ )N)r<   r  r  rp  r  r  rC  r  r   ry  r  r  )rL   prepare_inputs_for_generation)rS   r  r<   r  r  rp  ry  r  r  r  r  r  rC  rB  r  model_inputsrT   s                   r6   r  =Gemma3nForConditionalGeneration.prepare_inputs_for_generation'	  sm    $ w<
+')%)))
 
  !!+7(-;)*2E./r5   c                 .    U R                   R                  $ ra   )r6  ri  ri   s    r6   ri  +Gemma3nForConditionalGeneration.audio_towerP	  s    zz%%%r5   )r4  r6  )NNNNNNNNNNNNNNr   )NNNNNNNNNTNN)$r,   r-   r.   r/   rL  rI  r  r"   rM   rn  rq  r=  r@  r  propertyrg  rf  r  r   r   r   r1   r  r2   rn   r   r?   r
   rm   rl   r8   re   r  ri  r4   ro   rp   s   @r6   r  r  u  sW    &("*+} 1/((; ) ) ' ' S S  15486:156:37KO595959-1$(,0/334!A
E,,-A
 u001A
 !!2!23	A

 !.A
 &ell3A
 u//0A
 "%U->->(?(F"GHA
 !!1!12A
 !!1!12A
   1 12A
 ))*A
 D>A
 $D>A
 'tnA
  c5<</0!A
$ 
'%A
  A
L  'R & &r5   r  )r  r2  r  rb  r  r  )r   NN)Nr    )_r  r   collections.abcr   r   dataclassesr   typingr   r   r1   torch.nnrN   torch.nn.functionalr   rv  r  r	   cache_utilsr
   r   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   autor   configuration_gemma3nr!   r"   r#   r$   
get_loggerr,   r  r'   r8   ModulerB   rr   r   r3  rW  r~  r  r  r  r  r  rU  r  r  r
  r.  rW  rz  rn   rl   r  rb   r@   r  r  r  r  r  r  r2  rN  rb  r  __all__r+   r5   r6   <module>r     s  ,   . ! "     ! B B ) R B 9 O K F & _ _  l l 
		H	% 
<!8 < <* 
<K < <<=RYY =6g)BII g)T]BII ]@j,bii j,Z@7		 @7FF")) FRORYY O8Dryy D2(ryy (V 6E-/ E-P
SR\\ 
S;RYY ;$#5RYY #5L^'ryy ^'B< <D(	UU\\ 	U# 	U%,, 	U$ ## %II %<< % 
 % <<	 %
 U\\* %  % e_ % e_ % 5<<%& %N ,0.||.	. 
. 5<<(	.
 .<q)299 q)hZ8 Zz 5_ 5 56 abt
- t
 ct
n ^_d
/ d
 `d
N/B		 /Bd I) IID W&&<o W&W&tr5   