
    <h                     b   S SK JrJrJr  S SKrS SKJr  S SKJrJ	r	  SSK
Jr  SSKJrJrJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJr  SSK J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*J+r+J,r,J-r-  SSK.J/r/J0r0  \-Rb                  " \25      r3 " S S\Rh                  5      r5 " S S\Rh                  5      r6 " S S\Rh                  5      r7S r8SLS jr9S\Rt                  S\;S\Rt                  4S jr<   SMS\Rh                  S\Rt                  S \Rt                  S!\Rt                  S"\\Rt                     S#\=S$\\=   S%\\=   S\>\Rt                  \Rt                  4   4S& jjr? " S' S(\Rh                  5      r@ " S) S*\Rh                  5      rA " S+ S,\5      rB " S- S.\B5      rC " S/ S0\Rh                  5      rD " S1 S2\Rh                  5      rE " S3 S4\Rh                  5      rF\* " S5 S6\%5      5       rGS"\\Rt                     S\4S7 jrHS8\;S\4S9 jrIS:\\R                     S\Rt                  S;\\;   S\Rt                  4S< jrK " S= S>\G5      rL " S? S@\L5      rM\* " SA SB\G5      5       rN\* " SC SD\G5      5       rO " SE SF\G\5      rP\* " SG SH\G5      5       rQ\* " SI SJ\G5      5       rR/ SKQrSg)N    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask!create_sliding_window_causal_mask)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSequenceClassifierOutputTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torchdynamo_compilinglogging   )T5GemmaConfigT5GemmaModuleConfigc                   J   ^  \ rS rSrS	S\S\4U 4S jjjrS rS rS r	Sr
U =r$ )
T5GemmaRMSNorm5   dimepsc                    > [         TU ]  5         X l        [        R                  " [
        R                  " U5      5      U l        g N)super__init__r)   nn	Parametertorchzerosweight)selfr(   r)   	__class__s      d/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/t5gemma/modeling_t5gemma.pyr-   T5GemmaRMSNorm.__init__6   s,    ll5;;s#34    c                     U[         R                  " UR                  S5      R                  SSS9U R                  -   5      -  $ )N   T)keepdim)r0   rsqrtpowmeanr)   )r3   xs     r5   _normT5GemmaRMSNorm._norm;   s4    5;;quuQx}}R}>IJJJr7   c                     U R                  UR                  5       5      nUSU R                  R                  5       -   -  nUR                  U5      $ )Ng      ?)r@   floatr2   type_as)r3   r?   outputs      r5   forwardT5GemmaRMSNorm.forward>   sC    AGGI& 3!2!2!445~~a  r7   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler2   shaper)   r3   s    r5   
extra_reprT5GemmaRMSNorm.extra_reprE   s'    ))*+6$((<<r7   )r)   r2   )gư>)__name__
__module____qualname____firstlineno__intrC   r-   r@   rF   rL   __static_attributes____classcell__r4   s   @r5   r&   r&   5   s0    5C 5e 5 5
K!= =r7   r&   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
T5GemmaMLPI   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        [
        R                  " UR                  5      U l        g )NFbias)r,   r-   confighidden_sizeintermediate_sizer.   Linear	gate_projup_proj	down_projr	   hidden_activationact_fnDropoutdropout_ratedropoutr3   r\   r4   s     r5   r-   T5GemmaMLP.__init__J   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV556zz&"5"56r7   c                     U R                  U R                  U5      5      U R                  U5      -  nU R                  U5      nU R	                  U5      nU$ r+   )rd   r`   ra   rg   rb   )r3   r?   hidden_statesrb   s       r5   rF   T5GemmaMLP.forwardU   sH    DNN1$56aH]3NN=1	r7   )rd   r\   rb   rg   r`   r]   r^   ra   )rN   rO   rP   rQ   r-   rF   rS   rT   rU   s   @r5   rW   rW   I   s    	7 r7   rW   c                   d   ^  \ rS rSrSU 4S jjr\R                  " 5       \S 5       5       rSr	U =r
$ )T5GemmaRotaryEmbedding\   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r,   r-   hasattr
isinstancerq   dictgetrr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr\   r   rope_init_fnattention_scalingregister_bufferru   original_inv_freq)r3   r\   deviceru   r4   s       r5   r-   T5GemmaRotaryEmbedding.__init__]   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r7   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r:   r"   mpscpuF)device_typeenabledr9   r(   dtype)ru   rC   expandrJ   tor   rx   rs   strr0   autocast	transposecatcosr   sinr   )
r3   r?   position_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r5   rF   T5GemmaRotaryEmbedding.forwardn   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   r\   r|   r   r}   r~   rr   r+   )rN   rO   rP   rQ   r-   r0   no_gradr   rF   rS   rT   rU   s   @r5   rn   rn   \   s*    /" ]]_<  <r7   rn   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr:   r9   r   )rJ   r0   r   )r?   x1x2s      r5   rotate_halfr   ~   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r7   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r5   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr7   rk   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r"   N)rJ   r   reshape)rk   r   batchnum_key_value_headsslenhead_dims         r5   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr7   modulequerykeyvalueattention_maskrg   scalingsoftcapc                    Uc  U R                   S-  n[        X R                  5      n	[        X0R                  5      n
[        R                  " XR                  SS5      5      U-  nUb  X-  n[        R                  " U5      nX-  nUb"  US S 2S S 2S S 2S U	R                  S   24   nX-   n[        R                  R                  US[        R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[        R                  " X5      nUR                  SS5      R!                  5       nX4$ )	N      r9   r   r:   )r(   r   )ptrainingr"   )r   r   num_key_value_groupsr0   matmulr   tanhrJ   r.   
functionalsoftmaxfloat32r   r   rg   r   
contiguous)r   r   r   r   r   rg   r   r   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                 r5   eager_attention_forwardr      s/    //4'3 ; ;<JU$?$?@L<<';';Aq'ABWLL#-zz,/#-!$Q1.D
0@0@0D.D%DE#1 ==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r7   c                   F  ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\
\R                  \R                  4   S\\R                     S	\\   S
\\R                     S\\   S\
\R                  \\R                     \\
\R                        4   4S jjrSrU =r$ )T5GemmaSelfAttention   =Multi-headed attention from 'Attention Is All You Need' paperr\   	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  S-  U l        U R                  R                  U l        UR                  U l        [        R                   " UR
                  UR                  U R                  -  UR"                  S9U l        [        R                   " UR
                  UR                  U R                  -  UR"                  S9U l        [        R                   " UR
                  UR                  U R                  -  UR"                  S9U l        [        R                   " UR                  U R                  -  UR
                  UR"                  S9U l        U R                  R,                  U l        UR.                  U   S:X  a  UR0                  U l        g S U l        g )Nr   r   rZ   sliding_attention)r,   r-   r\   r   getattrr]   num_attention_headsr   r   r   query_pre_attn_scalarr   attention_dropout
is_decoder	is_causalr.   r_   attention_biasq_projk_projv_projo_projattn_logit_softcappinglayer_typessliding_windowr3   r\   r   r4   s      r5   r-   T5GemmaSelfAttention.__init__   s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>**ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7=7I7I)7TXk7kf33qur7   rk   position_embeddingsr   past_key_valuecache_positionr   r   c                 `   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       a  U R                  OSU R                   U R"                  U R$                  S.UD6u  nnUR&                  " / UQSP76 R)                  5       nU R+                  U5      nUU4$ Nr:   r"   r9   )r   r   r   eager        rg   r   r   r   rJ   r   r   viewr   r   r   r   updater   r   r\   _attn_implementationr   r   r   r   r   r   r   r   r   r3   rk   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r5   rF   T5GemmaSelfAttention.forward       $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7%
 /3mmD**LL..//%
 %
!\ "));;;;FFHkk+.L((r7   r   r   r\   r   r   r   r   r   r   r   r   r   r   NN)rN   rO   rP   rQ   __doc__r$   rR   r-   r0   TensorrI   r   r
   
LongTensorr   r   rF   rS   rT   rU   s   @r5   r   r      s    Gv2 vs v> +/59+)||+) #5<<#=>+) !.	+)
 !+) !!1!12+) -.+) 
u||Xell3XeELL>Q5RR	S+) +)r7   r   c                     ^  \ rS rSrSrS\S\4U 4S jjr SS\R                  S\
\R                     S\
\R                     S	\
\   S
\\   S\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )T5GemmaCrossAttentioni  r   r\   r   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  S-  U l        U R                  R                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR$                  UR                  U R                  -  UR                   S9U l        [        R                  " UR$                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR
                  UR                   S9U l        U R                  R,                  U l        UR$                  c  [/        S5      eg )Nr   r   FrZ   zBCross-attention needs cross_attention_hidden_size to be specified.)r,   r-   r\   r   r   r]   r   r   r   r   r   r   r   r   r.   r_   r   r   cross_attention_hidden_sizer   r   r   r   
ValueErrorr   s      r5   r-   T5GemmaCrossAttention.__init__  s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii..0J0JT]]0Zagavav
 ii..0J0JT]]0Zagavav
 ii&&68J8JQWQfQf
 '+kk&H&H#--5abb 6r7   rk   r   encoder_hidden_statesr   r   r   c                    Uc  [        S5      eUR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nUb1  UR                  R                  U R                  5      n	UR                  n
Ub  W	(       d  UR                  S S n/ UQSPU R                  P7nU R                  U5      R	                  U5      R                  SS5      nU R                  U5      R	                  U5      R                  SS5      nUb7  W
R                  XU R                  5      u  pSUR                  U R                  '   OFW
R                  U R                     R                  nU
R                  U R                     R                  n[         nU R"                  R$                  S:w  a  [&        U R"                  R$                     nU" U UUUU4U R(                  (       a  U R*                  OSU R,                  S U R.                  S.UD6u  nnUR0                  " / UQSP76 R3                  5       nU R5                  U5      nUU4$ )	Nz5Encoder hidden state is required for cross attention.r:   r"   r9   Tr   r   r   )r   rJ   r   r   r   r   
is_updatedrz   r   cross_attention_cacher   r   r   layerskeysvaluesr   r\   r   r   r   r   r   r   r   r   r   )r3   rk   r   r   r   r   r   r   r   r  curr_past_key_valueencoder_input_shapeencoder_hidden_shaper   r   r   r   r   s                     r5   rF   T5GemmaCrossAttention.forward9  sC    !(TUU#))#2.88b8$--8{{=166|DNNqRST%'2266t~~FJ"0"F"F!"7"="=cr"B#L%8#L"#Ldmm#L %:;@@AUV``abdefJ;;'<=BBCWXbbcdfghL)+>+E+Ej`d`n`n+o(
<@))$..9,33DNNCHHJ.55dnnELLL(?;;++w6"9$++:Z:Z"[$7%
 /3mmD**LL//%
 %
!\ "));;;;FFHkk+.L((r7   )r   r   r\   r   r   r   r   r   r   r   r   r   r+   )rN   rO   rP   rQ   r   r$   rR   r-   r0   r   r   r
   r   r   rI   rF   rS   rT   rU   s   @r5   r   r     s    Gc2 cs cB +/3)||3) !.3)  (5	3)
 !3) -.3) 
u||Xell3XeELL>Q5RR	S3) 3)r7   r   c                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\R                     S	\	\R                  4   4
S
 jjrSrU =r$ )T5GemmaEncoderLayerio  zEncoder sub-layer.r   c                 $  > [         TU ]  5         UR                  U l        Xl        X l        UR
                  U   U l        [        UUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        ["        R$                  " UR&                  5      U l        g N)r\   r   r)   )r,   r-   r]   r\   r   r   attention_typer   	self_attnr&   rms_norm_epspre_self_attn_layernormpost_self_attn_layernormrW   mlppre_feedforward_layernormpost_feedforward_layernormr.   re   rf   rg   r   s      r5   r-   T5GemmaEncoderLayer.__init__r  s    !--"$00;-
 (6f6H6HfNaNa'b$(6v7I7IvObOb(c%f%)78J8JPVPcPc)d&*89K9KQWQdQd*e'zz&"5"56r7   rk   r   r   r   r   c           	      8   UnU R                  U5      nU R                  " SUUUUS S.UD6u  pU R                  U5      nX`R                  U5      -   nUnU R	                  U5      nU R                  U5      nU R                  U5      nX`R                  U5      -   nU$ )N)rk   r   r   r   r    )r  r  r  rg   r  r  r  )r3   rk   r   r   r   r   residual_s           r5   rF   T5GemmaEncoderLayer.forward  s     !44]C>> 
' 3)%
 
 55mD <<#>> 66}E/77F <<#>>r7   )r  r\   rg   r]   r   r  r  r  r  r  r  r   )rN   rO   rP   rQ   r   rR   r-   r0   r   rI   r   r   FloatTensorrF   rS   rT   rU   s   @r5   r  r  o  s    7# 70 2637|| #5<<#=> !.	
 u//0 
u  !	" r7   r  c                   `  ^  \ rS rSrSrS\4U 4S jjr       SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\R                     S	\
\   S
\
\   S\
\R                     S\
\R                     S\
\R                     S\R                  4S jjrSrU =r$ )T5GemmaDecoderLayeri  z2Decoder sub-layer: an extra cross-attention layer.r   c                    > [         TU ]  X5        [        XS9U l        [	        UR
                  UR                  S9U l        [	        UR
                  UR                  S9U l        g r  )	r,   r-   r   
cross_attnr&   r]   r  pre_cross_attn_layernormpost_cross_attn_layernormr   s      r5   r-   T5GemmaDecoderLayer.__init__  sS    +/vS(6v7I7IvObOb(c%)78J8JPVPcPc)d&r7   rk   r   r   r   r   	use_cacher   r   encoder_attention_maskr   c
                    UnU R                  U5      nU R                  " SUUUUUb  UR                  OS UUS.U
D6u  pU R                  U5      nXR	                  U5      -   nUnU R                  U5      nU R                  " SUUU	UUS.U
D6u  pU R                  U5      nXR	                  U5      -   nUnU R                  U5      nU R                  U5      nU R                  U5      nXR	                  U5      -   nU$ )N)rk   r   r   r   r   r&  r   )rk   r   r   r   r&  r  )r  r  self_attention_cacher  rg   r#  r"  r$  r  r  r  )r3   rk   r   r   r   r   r&  r   r   r'  r   r  r  s                r5   rF   T5GemmaDecoderLayer.forward  s0    !44]C>> 	
' 3)%BPB\>>>bf)	
 	
 55mD <<#>> 55mD?? 
'"71)
 
 66}E <<#>> 66}E/77F <<#>>r7   )r"  r$  r#  )NNNFNNN)rN   rO   rP   rQ   r   rR   r-   r0   r   rI   r   r   r   boolr  rF   rS   rT   rU   s   @r5   r   r     s    <e# e 26378<$)598<9=.||. #5<<#=>. !.	.
 u//0. !!45. D>. !!1!12.  (5. !) 6. 
		. .r7   r   c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaClassificationHeadi  z-Head for sentence-level classification tasks.r]   
num_labelsclassifier_dropout_ratec                    > [         TU ]  5         [        R                  " US9U l        [        R
                  " X5      U l        g )N)r   )r,   r-   r.   re   rg   r_   out_proj)r3   r]   r.  r/  r4   s       r5   r-   "T5GemmaClassificationHead.__init__  s/    zz$;<		+:r7   rk   r   c                 J    U R                  U5      nU R                  U5      nU$ r+   rg   r1  )r3   rk   s     r5   rF   !T5GemmaClassificationHead.forward  s$    ]3m4r7   r4  )r   )rN   rO   rP   rQ   r   rR   rC   r-   r0   r   rF   rS   rT   rU   s   @r5   r-  r-    sF    7;C ;S ;SX ; ;
U\\ ell  r7   r-  c                   z   ^  \ rS rSrSrSS\S\S\4U 4S jjjrS\R                  S\R                  4S	 jr
S
rU =r$ )T5GemmaLMHeadi  z.Head for language modeling (generation) tasks.r]   
vocab_sizer[   c                 V   > [         TU ]  5         [        R                  " XUS9U l        g )NrZ   )r,   r-   r.   r_   r1  )r3   r]   r8  r[   r4   s       r5   r-   T5GemmaLMHead.__init__  s     		+Er7   rk   r   c                 (    U R                  U5      nU$ r+   r1  )r3   rk   logitss      r5   rF   T5GemmaLMHead.forward  s    }-r7   r<  )F)rN   rO   rP   rQ   r   rR   r+  r-   r0   r   rF   rS   rT   rU   s   @r5   r7  r7    sJ    8FC FS F F FU\\ ell  r7   r7  c                   F  ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\
\R                  \R                  4   S\\R                     S	\\   S
\\R                     S\\   S\
\R                  \\R                     \\
\R                        4   4S jjrSrU =r$ )T5GemmaAttentioni  r   r\   r   c                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        UR                  S-  U l        U R                  R                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR
                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR
                  UR                   S9U l        U R                  R*                  U l        UR,                  U   S:X  a  UR.                  U l        g S U l        g )Nr   r   TrZ   r   )r,   r-   r\   r   r   r]   r   r   r   r   r   r   r   r   r.   r_   r   r   r   r   r   r   r   r   r   s      r5   r-   T5GemmaAttention.__init__  s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!33T9!%!>!>ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
 '+kk&H&H#7=7I7I)7TXk7kf33qur7   rk   r   r   r   r   r   r   c                 `   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       a  U R                  OSU R                   U R"                  U R$                  S.UD6u  nnUR&                  " / UQSP76 R)                  5       nU R+                  U5      nUU4$ r   r   r   s                     r5   rF   T5GemmaAttention.forward  r   r7   r   r   )rN   rO   rP   rQ   r   r#   rR   r-   r0   r   rI   r   r
   r   r   r   rF   rS   rT   rU   s   @r5   r@  r@    s    Gv} v v< +/59+)||+) #5<<#=>+) !.	+)
 !+) !!1!12+) -.+) 
u||Xell3XeELL>Q5RR	S+) +)r7   r@  c                   l   ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\\S.rU 4S jrS	 rS
rU =r$ )T5GemmaPreTrainedModeliA  r\   modelTT5GemmaBlockpast_key_values)rk   
attentionsc                   > [         TU ]  U5        U R                  R                  n[	        U[
        5      (       a  UR                  R                  R                  S   S-  nUR                  R                  R                  R                  SX#-  S9  [        UR                  S5      (       aG  UR                  R                  b/  UR                  R                  R                  R                  5         g g g [	        U[        5      (       as  U R                  R                  (       dW  UR                  R                  R                  S   S-  nUR                  R                  R                  R                  SX#-  S9  g g g )Nr   r   r   )r>   stdr[   )r,   _init_weightsr\   initializer_rangerx   r-  r1  r2   rJ   datanormal_rw   r[   zero_r7  tie_word_embeddings)r3   r   rL  scaler4   s       r5   rM  $T5GemmaPreTrainedModel._init_weightsS  s   f%kk++f788OO**003t;EOO""''//Sck/Jv//FOO4H4H4T$$))//1 5U/..;;22..44Q74?&&++33#+3N 3 /r7   c                 b   U R                   R                  R                  nU R                   R                  R                  nUc  [	        S5      eUR                  UR                  5      nUSSS24   R                  5       USSS24'   X$S'   Uc  [	        S5      eUR                  US:H  U5        U$ )	z
Shifts input_ids to the right, prepends the decoder_start_token_id, and handles
pad_token_id replacement for labels that were -100.
This is a common preparation step for decoder inputs in sequence-to-sequence models.
Nz:self.model.config.decoder.bos_token_id has to be defined. .r:   r"   ).r   z9self.model.config.decoder.pad_token_id has to be defined.i)	r\   decoderbos_token_idpad_token_idr   	new_zerosrJ   clonemasked_fill_)r3   	input_idsdecoder_start_token_idrX  shifted_input_idss        r5   _shift_right#T5GemmaPreTrainedModel._shift_righta  s     "&!4!4!A!A{{**77!)YZZ &//	@%.sCRCx%8%>%>%@#qr'"$:&!XYY 	&&'8D'@,O  r7   r  )rN   rO   rP   rQ   r#   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r@  _can_record_outputsrM  r_  rS   rT   rU   s   @r5   rF  rF  A  sb    &*#'(#4"5N!"&,&
O! !r7   rF  c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z,
This creates bidirectional attention mask.
	batch_idxhead_idxq_idxkv_idxr   c                    > Tc#  [         R                  " S[         R                  S9$ TX4   R                  [         R                  5      $ )Nr  r   )r0   onesr+  r   )rm  rn  ro  rp  r   s       r5   
inner_mask/bidirectional_mask_function.<locals>.inner_mask  s;    !::b

33i/033EJJ??r7   rR   r+  )r   rs  s   ` r5   bidirectional_mask_functionrv  |  s9    
@c @S @ @c @d @
 r7   r   c           
      T   ^  S[         S[         S[         S[         S[        4
U 4S jjnU$ )z@
This creates bidirectional attention mask with sliding window.
rm  rn  ro  rp  r   c                 $   > UT-
  U:  X2T-   :  -  $ r+   r  )rm  rn  ro  rp  r   s       r5   rs  >sliding_window_bidirectional_mask_function.<locals>.inner_mask  s     &/F^=S4STTr7   ru  )r   rs  s   ` r5   *sliding_window_bidirectional_mask_functionrz    s9    
Uc US U Uc Ud U r7   	token_idsrX  c                    U b<  Uc  [        S5      eX:g  R                  UR                  [        R                  5      nU$ [        R
                  " UR                  S   UR                  S   4UR                  [        R                  S9nU$ )z%Construct the default attention mask.z3`pad_token_id` is required for padding information.r   r"   r   r   )r   r   r   r0   longrr  rJ   )r{  rk   rX  r   s       r5   make_default_2d_attention_maskr    s     RSS#3778L8LejjY
    #]%8%8%;<]EYEYafakak
 r7   c                      ^  \ rS rSr\\S.rU 4S jr\    SS\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\\   S	\4S
 jj5       rSrU =r$ )T5GemmaEncoderi  )rJ  rk   c           	      P  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [        UR                  UR                  S9U l        [        US9U l        SU l        [
        R                  " [!        UR"                  5       Vs/ sH  n[%        X5      PM     sn5      U l        [
        R(                  " UR*                  5      U l        U R/                  5         g s  snf )Nr  )r\   F)r,   r-   rX  padding_idxr8  r.   	Embeddingr]   embed_tokensr&   r  normrn   
rotary_embgradient_checkpointing
ModuleListrangenum_hidden_layersr  r  re   rf   rg   	post_initr   s      r5   r-   T5GemmaEncoder.__init__  s     !.. ++LL):):F<N<NPTP`P`a"6#5#56;N;NO	0?&+#mmEJ6KcKcEdeEd	 3Ede
 zz&"5"56 	 fs   D#r\  r   r   inputs_embedsr   r   c           	         US L US L-  (       a  [        S5      eUc  U R                  U5      n[        R                  " SUR                  S   UR
                  S9nUc  UR                  S5      nUc   [        XU R                  R                  5      n[        U=n[        5      (       db  U R                  UUUS US.n[        S0 UDS[        U5      0D6[        S0 UD[        U R                  R                   5      [        U5      S.D6S.nUn	U R#                  X5      n
[        R$                  " U R                  R&                  S	-  U	R(                  S
9nX-  n	U R+                  U	5      n	U R,                  S U R                  R.                    H  nU" U	U
X|R0                     U40 UD6n	M     U R3                  U	5      n	U R+                  U	5      n	[5        U	S9$ )N:You must specify exactly one of input_ids or inputs_embedsr   r"   r   r\   input_embedsr   r   rI  r   or_mask_function)r  and_mask_functionfull_attentionr         ?r   )last_hidden_stater  )r   r  r0   arangerJ   r   r   r  r\   rX  rx   ry   r   rv  r   rz  r   r  tensorr]   r   rg   r  r  r  r  r   )r3   r\  r   r   r  r   r   self_attn_mask_mappingmask_kwargsrk   r   
normalizerlayer_modules                r5   rF   T5GemmaEncoder.forward  s    -t";<YZZ  --i8Ma)<)<Q)?H\H\])33A6L!;IVZVaVaVnVnoNNB0DII++ -"0"0#' ,K #5 #!#%@%P# &G &!&%OPTP[P[PjPj%k&A.&Q&
&" &"oomJ\\$++"9"93">mFYFYZ
%2]3 KK(G$++*G*GHL(#&'B'BC	
 M I 		-0]3+
 	
r7   )rg   r  r  r  r  r  r  r8  NNNN)rN   rO   rP   rQ   r   r  rk  r-   r   r   r0   r   r   r  r   r   r   rF   rS   rT   rU   s   @r5   r  r    s    *,
$  15153759>
E,,->
 !.>
 u//0	>

   1 12>
 +,>
 
>
 >
r7   r  c                   f  ^  \ rS rSr\" \SS9\" \SS9\S.rU 4S jr	\
         SS\\R                     S\\R                     S\\R                     S	\\   S
\\R                      S\\   S\\R                     S\\R                     S\\R                     S\\   S\4S jj5       rSrU =r$ )T5GemmaDecoderi  r"   )index)rJ  cross_attentionsrk   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ sH  n[        X5      PM     sn5      U l        U R                  5         g s  snf r+   )	r,   r-   r.   r  r  r  r   r  r  r   s      r5   r-   T5GemmaDecoder.__init__  sW     mmEJ6KcKcEdeEd	 3Ede
 	 fs   A)r\  r   r   rI  r  r&  r   r   r'  r   r   c
                    US L US L-  (       a  [        S5      eUc  [        S5      eUc  U R                  U5      nU R                  (       d$  U(       a  Uc  [        [	        5       [	        5       S9nUcD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      nUc#  Uc   [        XU R                  R                  5      n[        U=n[        5      (       d9  U R                  UUUUb  UR                   OS US.n[#        S0 UD6[%        S0 UD6S.n[        U	=n[        5      (       d-  U R                  UU	US S S.nS	[#        S0 UDS
['        U	5      0D60nUnU R)                  X5      n[        R*                  " U R                  R,                  S-  UR.                  S9nUU-  nU R1                  U5      nU R2                  S U R                  R4                    H$  nU" UUUUR6                     UUUUUUS	   4	0 U
D6nM&     U R9                  U5      nU R1                  U5      n[;        UUS9$ )Nr  z0`encoder_hidden_states` must be given in decoder)r)  r  r   r"   r  r  r  r  r  r  r   )r  rI  r  )r   r  r   r   r   get_seq_lengthr0   r  rJ   r   r   r  r\   rX  rx   ry   r)  r   r   rv  r  r  r]   r   rg   r  r  r  r  r   )r3   r\  r   r   rI  r  r&  r   r   r'  r   past_seen_tokensr  r  cross_attn_mask_mappingrk   r   r  r  s                      r5   rF   T5GemmaDecoder.forward  s    -t";<YZZ (OPP  --i8M}}/F1%1^&2nO !CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L!o&=;IVZVaVaVnVnoNNB0DII++ -"0"0KZKf?#G#Glp ,K #5"C{"C%F%U%U&"
 5KK1TRR++ 5"8"0#' $K !"4 #!#%@AW%X#'# &"oomJ\\$++"9"93">mFYFYZ
%
2]3 KK(G$++*G*GHL(#&|'B'BC%'(89 M I 		-0]38++
 	
r7   )r  )	NNNNNNNNN)rN   rO   rP   rQ   r   r   r   r   rk  r-   r   r   r0   r   r   r   r  r+  r   r   r   rF   rS   rT   rU   s   @r5   r  r    s*   $%9C*+@J,  1515379=59$(598<9=]
E,,-]
 !.]
 u//0	]

 ""56]
   1 12]
 D>]
 !!1!12]
  (5]
 !) 6]
 +,]
 
3]
 ]
r7   r  c                     ^  \ rS rSrS\4U 4S jjrS rS rS rS r	\
\            SS\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                      S\\R                     S\\   S\\   S\\R&                     S\\R&                     S\\   S\\R                     S\\   S\4S jj5       5       rSrU =r$ )T5GemmaModelio  r\   c                    > [         TU ]  U5        UR                  (       d  [        S5      e[	        UR
                  5      U l        [        UR                  5      U l        U R                  5         g )NzVT5GemmaModel only support encoder-decoder modeling. Use `T5GemmaEncoderModel` instead.)	r,   r-   is_encoder_decoderr   r  encoderr  rV  r  rh   s     r5   r-   T5GemmaModel.__init__q  sO     ((uvv%fnn5%fnn5r7   c                     U R                   $ r+   r  rK   s    r5   get_encoderT5GemmaModel.get_encoder|      ||r7   c                     U R                   $ r+   )rV  rK   s    r5   get_decoderT5GemmaModel.get_decoder  r  r7   c                 6    U R                   R                  5       $ r+   r  get_input_embeddingsrK   s    r5   r  !T5GemmaModel.get_input_embeddings      ||0022r7   c                 8    U R                   R                  U5      $ r+   r  set_input_embeddingsr3   new_embeddingss     r5   r  !T5GemmaModel.set_input_embeddings      ||00@@r7   r\  r   r   decoder_input_idsdecoder_attention_maskdecoder_position_idsencoder_outputsrI  r  decoder_inputs_embedsr&  r   r   r   c                    Uc  U R                   " SUUUU	S.UD6nUR                  nU R                  " SUUUU
UUUUUS.	UD6n[        UR                  UR                  UR                  SS5      (       a  UR                  OUR                  4UR                  UR                  UR                  UR                  UR                  S9$ )a8  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
r\  r   r   r  )	r\  r   r   r  rI  r   r'  r&  r   output_hidden_statesF)r  rI  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater   encoder_attentionsr  )	r  r  rV  r   rI  rz   rk   rJ  r  )r3   r\  r   r   r  r  r  r  rI  r  r  r&  r   r   r   decoder_outputss                   r5   rF   T5GemmaModel.forward  s    . ""ll #-)+	
 O !0 A A,, 
'1-/+"7#1)
 
 "-??+;;zz0%88 #2"?"?!335.99,==&5&G&G"1"?"?.99
 	
r7   )rV  r  )NNNNNNNNNNNN)rN   rO   rP   rQ   r#   r-   r  r  r  r  r   r   r   r0   r   r  
BoolTensorr   r   r   r+  r   r   r   rF   rS   rT   rU   s   @r5   r  r  o  sd   	} 	3A  156:378<=A;?599=048<$(598
E,,-8
 !!2!238
 u//0	8

 $E$4$458
 !))9)9 :8
 'u'7'788
 "/28
 ""568
  -8
  (58
 D>8
 !!1!128
 +,8
 
8
  8
r7   r  c                      ^  \ rS rSrS\4U 4S jjrS rS r\\	    SS\
\R                     S\
\R                     S\
\R                     S	\
\R                     S
\\   S\4S jj5       5       rSrU =r$ )T5GemmaEncoderModeli  r\   c                    > [         TU ]  U5        UR                  (       a  [        S5      e[	        UR
                  5      U l        U R                  5         g )NzQT5GemmaEncoderModel only supports encoder-only model. Use `T5GemmaModel` instead.)r,   r-   r  r   r  r  r  rh   s     r5   r-   T5GemmaEncoderModel.__init__  s?     $$pqq%fnn5r7   c                 6    U R                   R                  5       $ r+   r  rK   s    r5   r  (T5GemmaEncoderModel.get_input_embeddings  r  r7   c                 8    U R                   R                  U5      $ r+   r  r  s     r5   r  (T5GemmaEncoderModel.set_input_embeddings  r  r7   r\  r   r   r  r   r   c                 4    U R                   " SUUUUS.UD6nU$ )Nr  r  r  )r3   r\  r   r   r  r   r  s          r5   rF   T5GemmaEncoderModel.forward  s5     ,, 
)%'	

 
 r7   r  r  )rN   rO   rP   rQ   r#   r-   r  r  r   r   r   r0   r   r  r   r   r   r   rF   rS   rT   rU   s   @r5   r  r    s    } 3A  156:3704E,,- !!2!23 u//0	
  - +, 
  r7   r  c            %       l  ^  \ rS rSrSS/rSS0rSS/S/40rS\4U 4S	 jjrS
 r	S r
S rS rS r\\              S"S\\R$                     S\\R&                     S\\R$                     S\\R$                     S\\R(                     S\\R$                     S\\   S\\   S\\R&                     S\\R&                     S\\R$                     S\\   S\\R$                     S\\\R4                  4   S\\   S\\\R&                     \4   4 S jj5       5       rS\R4                  4S  jr S!r!U =r"$ )#T5GemmaForConditionalGenerationi  z!model.decoder.embed_tokens.weightzlm_head.out_proj.weightzlm_head.out_projcolwise_reprk   r=  r\   c                   > SUl         [        TU ]	  U5        [        U5      U l        UR
                  R                  U l        [        UR
                  R                  U R                  5      U l	        SU l
        U R                  5         g )NTForMaskedLM)r  r,   r-   r  rG  rV  r8  r7  r]   lm_head	loss_typer  rh   s     r5   r-   (T5GemmaForConditionalGeneration.__init__  sb    $(! !&)
 ..33$V^^%?%?Q&r7   c                 $    XR                   l        g r+   r  r1  r  s     r5   set_output_embeddings5T5GemmaForConditionalGeneration.set_output_embeddings  s     .r7   c                 .    U R                   R                  $ r+   r  rK   s    r5   get_output_embeddings5T5GemmaForConditionalGeneration.get_output_embeddings  s    ||$$$r7   c                     U R                   R                  (       aC  U R                  U R                  R                  U R                  5       R                  5       5        g g r+   )r\   rR  _tie_or_clone_weightsr  r1  r  r  rK   s    r5   _tie_weights,T5GemmaForConditionalGeneration._tie_weights   s@    ;;**&&t||'<'<d>N>N>P>e>e>gh +r7   c                 .    U R                   R                  $ r+   )rG  r  rK   s    r5   r  +T5GemmaForConditionalGeneration.get_encoder      zz!!!r7   c                 .    U R                   R                  $ r+   )rG  rV  rK   s    r5   r  +T5GemmaForConditionalGeneration.get_decoder  r  r7   r\  r   r   r  r  r  r  rI  r  r  labelsr&  r   logits_to_keepr   r   c                    U R                   (       ac  U R                  R                  S:w  aI  SU R                  R                   S3n[        5       (       a  [	        U5      e[
        R                  U5        Ub  Uc  U
c  U R                  U5      nU R                  " SUUUUUUUUU	U
UUS.UD6nUR                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nU R                  5       R                  nUR                  b4  UUR                  -  n[         R"                  " U5      nUUR                  -  nSnUb  U R$                  " UXR&                  40 UD6n[)        UUUR*                  UR,                  UR.                  UR0                  UR2                  UR4                  UR6                  S9	$ )a  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
r   ziIt is strongly recommended to train T5Gemma models with the `eager` attention implementation instead of `zp`. Use `eager` with `AutoModelForCausalLM.from_pretrained('<path-to-checkpoint>', attn_implementation='eager')`.N)r\  r   r   r  r  r  r  rI  r  r  r&  r   )	lossr=  rI  r  r  r  r  r   r  r  )r   r\   r   r    r   loggerwarning_oncer_  rG  r  rx   rR   slicer  r  final_logit_softcappingr0   r   loss_functionr8  r   rI  r  r  r  r  r   r  )r3   r\  r   r   r  r  r  r  rI  r  r  r  r&  r   r  r   msgr  rk   slice_indicesr=  decoder_configr  s                          r5   rF   'T5GemmaForConditionalGeneration.forward  s   : ==T[[==H#{{??@  Aqr  ()) o%##C("3";@U@] $ 1 1& 9.2jj /
)%/#9!5++'"7)/
 /
  (998B>SV8W8W~ot4]kmA}a,?@A))+2211=nDDDFZZ'FnDDDF%%ffooPPD+;;"1"G"G.AA,==&5&O&O"1"G"G.AA

 
	
r7   c                 $    U R                  U5      $ r+   )r_  )r3   r  s     r5   %prepare_decoder_input_ids_from_labelsET5GemmaForConditionalGeneration.prepare_decoder_input_ids_from_labelsa  s      ((r7   )r  r  rG  r8  )NNNNNNNNNNNNNr   )#rN   rO   rP   rQ   _tied_weights_keys_tp_plan_pp_planr#   r-   r  r  r  r  r  r   r   r   r0   r   r  r  r   r   r+  r   rR   r   r   r   rI   r   rF   r  rS   rT   rU   s   @r5   r  r    s   =?XY"M2H"o%6
$CDH	} 	/%i
""  156:378<=A;?599=59=A-1$(5934R
E,,-R
 !!2!23R
 u//0	R

 $E$4$45R
 !))9)9 :R
 'u'7'78R
 "/2R
 ""56R
   1 12R
  ((9(9:R
 ))*R
 D>R
 !!1!12R
 c5<</0R
  +,!R
" 
uU&&'8	9#R
  R
h)ELL ) )r7   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjrS rS r	\
\          SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\R                      S\\R                      S\\R                     S\\   S\4S jj5       5       rSrU =r$ ) T5GemmaForSequenceClassificationie  r\   r  c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for sequence classification. When set to False, only encoder is used.
Nr/  皙?r  r,   r-   r.  r  rG  r  r  r]   rV  r   r-  scorer  r3   r\   r  r]   classifier_dropoutr4   s        r5   r-   )T5GemmaForSequenceClassification.__init__g  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r7   c                 6    U R                   R                  5       $ r+   rG  r  rK   s    r5   r  5T5GemmaForSequenceClassification.get_input_embeddings~      zz..00r7   c                 :    U R                   R                  U5        g r+   rG  r  r3   r   s     r5   r  5T5GemmaForSequenceClassification.set_input_embeddings      

''.r7   r\  r   r   r  r  r  r  r  r  r  r   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R                  c  US	:w  a  [        S
5      eU R                   R                  c  SnGOUb  XR                   R                  :g  R!                  UR"                  [$        R&                  5      n[$        R(                  " UR                  S   UR"                  [$        R&                  S9nUU-  R+                  S5      nU R                   R                  (       a*  US	-  n[$        R,                  " UUR                  S   S	-
  S9nO.Sn[.        R1                  U R                  R                   S35        U[$        R(                  " UUR"                  S9U4   nSnU
b  U R3                  UU
UU R                   S9n[5        UUUUS9$ )  
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, decoder_sequence_length)`, *optional*):
    Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0,
    config.decoder.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N8Passing input embeddings is currently not supported for  in encoder-decoder mode.If no `decoder_input_ids` or `decoder_inputs_embeds` are passed, `input_ids` cannot be `None`. Please pass either `input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`.F	r   r   r  r  r  r  r  r  r&  r   r   r  r   r"   z=Cannot handle batch sizes > 1 if no padding token is defined.r:   r}  )maxz will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`r  )r=  r  pooled_logitsr\   r  r=  rk   rJ  )r\   r  NotImplementedErrorr4   rN   r   r_  rG  r  r  r  rk   rJ  r  rJ   rX  r   r   r0   int32r  argmaxclampr  r  r   r   )r3   r\  r   r   r  r  r  r  r  r  r  r   outputsr  rk   rJ  r=  
batch_sizelast_non_pad_tokennon_pad_masktoken_indicesr%  r  s                          r5   rF   (T5GemmaForSequenceClassification.forward  s   2 ;;))y/@]E^%J4>>KbKbJcc|} 
 ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-. "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J{{--"a'"%*[[1CIZI`I`acIdghIh%i"!#>>**+ ,Z Z
 u||Jv}}MOaab%%VFR_hlhshs%tD' '!	
 	
r7   rG  r.  r  r+   
NNNNNNNNNN)rN   rO   rP   rQ   r#   r   r+  r-   r  r  r   r   r0   r   r   r   r  r   r   r   rF   rS   rT   rU   s   @r5   r  r  e  sS   } (4.  .1/  1515378<9=;?5959=A-1i
E,,-i
 !.i
 u//0	i

 $E$4$45i
 !) 6i
 'u'7'78i
 "/2i
   1 12i
  ((9(9:i
 ))*i
 +,i
 
"i
  i
r7   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjrS rS r	\
\          SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\R                      S\\R                      S\\R                     S\\   S\4S jj5       5       rSrU =r$ )T5GemmaForTokenClassificationi  r\   r  c                   > Ub  X!l         [        TU ]	  U5        UR                  U l        UR                   (       a  [	        U5      U l        O[        U5      U l        UR                  R                  nUR                   (       a  UR                  R                  n[        USS5      n[        X0R                  U5      U l        U R                  5         g)z
is_encoder_decoder (`Optional`, *optional*):
    Whether use encoder_decoder for token classification. When set to False, only encoder is used.
Nr/  r  r  r  s        r5   r-   &T5GemmaForTokenClassification.__init__  s    
 )(:%  ++$$%f-DJ,V4DJnn00$$ ..44K$V-FL.{OOM_`
r7   c                 6    U R                   R                  5       $ r+   r  rK   s    r5   r  2T5GemmaForTokenClassification.get_input_embeddings  r  r7   c                 :    U R                   R                  U5        g r+   r  r  s     r5   r  2T5GemmaForTokenClassification.set_input_embeddings  r  r7   r\  r   r   r  r  r  r  r  r  r  r   r   c                    U R                   R                  (       a)  Uc&  Ub#  [        SU R                  R                   S35      eU R                   R                  (       a%  Uc"  U	c  Uc  [        S5      eU R                  U5      nU R                   R                  (       aB  U R                  " U4UUUUUUUU	SS.	UD6nUR                  nUR                  nUR                  nO;U R                  " U4UUUS.UD6nUR                  nUR                  nUR                  nU R                  U5      nSnU
b  U R                  UXR                   5      n[        UUUUS9$ )	r  Nr  r   r!  Fr"  r#  r&  )r\   r  r'  r4   rN   r   r_  rG  r  r  r  rk   rJ  r  r   r   )r3   r\  r   r   r  r  r  r  r  r  r  r   r+  r  rk   rJ  r=  r  s                     r5   rF   %T5GemmaForTokenClassification.forward  s   4 ;;))y/@]E^%J4>>KbKbJcc|}  ;;))/@/HMbMj  U 
 !% 1 1) <;;))*.**+-)"3'=%9 /+&;+ +G !( 9 9#99M 33J'+zz(-)+	(
 (G !( 9 9#11M ++J-.%%ffkkBD$'!	
 	
r7   r1  r+   r2  )rN   rO   rP   rQ   r#   r   r+  r-   r  r  r   r   r0   r   r   r   r  r   r   r   rF   rS   rT   rU   s   @r5   r4  r4    sS   } (4.  01/  1515378<9=;?5959=A-1N
E,,-N
 !.N
 u//0	N

 $E$4$45N
 !) 6N
 'u'7'78N
 "/2N
   1 12N
  ((9(9:N
 ))*N
 +,N
 
N
  N
r7   r4  )r  r  r  rF  r  r4  )Nr"   )r   NN)Ttypingr   r   r   r0   torch.nnr.   transformers.utils.genericr   r   activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r    r!   configuration_t5gemmar#   r$   
get_loggerrN   r  Moduler&   rW   rn   r   r   r   rR   r   rC   rI   r   r   r   r  r   r-  r7  r@  rF  rv  rz  r   r  r  r  r  r  r  r  r4  __all__r  r7   r5   <module>rO     s  , - ,   I ! C C ) R B 9  L F & l l E 
		H	%=RYY =( &<RYY <D(6	UU\\ 	U# 	U%,, 	U$ ## %II %<< % 
 % <<	 %
 U\\* %  % e_ % e_ % 5<<%& %FH)299 H)VR)BII R)j14 1h7- 7t		 	BII 	G)ryy G)T 7!_ 7! 7!t
0F 
8 
s x (()<< 3- \\	"W
+ W
tm
^ m
` R
) R
 R
j !0 ! !Hx)&<o x)v I
'= I
 I
X o
$: o
 o
dr7   