
    <hL                       S SK r S SKrS SKJr  S SKJrJrJrJr  S SK	r	S SK	J
r
  S SKJrJrJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJrJr  SSKJrJ r   SSK!J"r"J#r#  SSK$J%r%  SSK&J'r'J(r(  SSK)J*r*J+r+  SSK,J-r-  \+" 5       (       a  S SK.J/r/  S SK0J1r1J2r2  OSu  r/r1r2\*" 5       (       a	  S SK3J4r4J5r5  OSu  r5r4\(Rl                  " \75      r8 " S S\	R                  Rr                  5      r: " S S\
Rr                  5      r; " S S\5      r< " S S \
Rr                  5      r=S!\	R|                  S"\?S#\	R|                  4S$ jr@ SKS%\
Rr                  S&\	R|                  S'\	R|                  S(\	R|                  S)\\	R|                     S*\AS+\A4S, jjrBS- rCSLS. jrD " S/ S0\
Rr                  5      rES1\	R|                  S2\?4S3 jrFS4 rGS5 rH\I" \/\4\545      rJ " S6 S7\
Rr                  5      rK " S8 S9\
Rr                  5      rL " S: S;\
Rr                  5      rM " S< S=\
Rr                  5      rN " S> S?\
Rr                  5      rO " S@ SA\#5      rP\' " SB SC\P5      5       rQ " SD SE\P\5      rR\'" SFSG9 " SH SI\P5      5       rS/ SJQrTg)M    N)cycle)AnyCallableOptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)CacheDynamicCache)GenerationMixin)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputWithPastCausalLMOutputWithPast SequenceClassifierOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging)is_causal_conv1d_availableis_mamba_ssm_available   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNc                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )Zamba2RMSNormGated;   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X0l        X l        g N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer4   eps	__class__s       b/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/zamba2/modeling_zamba2.pyr.   Zamba2RMSNormGated.__init__<   s2    ll5::k#:; #$    c                 X   UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  Gt pEXPR                  -  nUR                  " / UQUPU R                  P76 nUR                  S5      R                  SSS9nU[        R                  " XR                  -   5      -  nUR                  " / UQX`R                  -  P76 nU R                  UR                  U5      -  $ N   T)keepdim)dtypetor0   float32r   
functionalsilushaper4   viewpowmeanrsqrtr3   r2   )	r5   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r9   forwardZamba2RMSNormGated.forwardB   s    #))%((7)BMM,>,>twwu}}?U,VVM!.!4!4//1+00\+\{\DOO\&**1-222t2D1EKKK`K`@`4aa+00]+]{__?\]{{]--k:::r;   )r4   r3   r2   gư>r,   )__name__
__module____qualname____firstlineno__r.   rS   __static_attributes____classcell__r8   s   @r9   r)   r)   ;   s    %; ;r;   r)   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )Zamba2RMSNormP   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z,
Zamba2RMSNorm is equivalent to T5LayerNorm
N)r-   r.   r   r/   r0   r1   r2   r3   )r5   r6   r7   r8   s      r9   r.   Zamba2RMSNorm.__init__Q   s/     	ll5::k#:; #r;   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ r=   )	rA   rB   r0   rC   rH   rI   rJ   r3   r2   )r5   rK   rM   rR   s       r9   rS   Zamba2RMSNorm.forwardY   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r;   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler2   rF   r3   r5   s    r9   
extra_reprZamba2RMSNorm.extra_repr`   s*    ))*+6$2G2G1HIIr;   )r3   r2   rU   )	rV   rW   rX   rY   r.   rS   rg   rZ   r[   r\   s   @r9   r^   r^   P   s    $;J Jr;   r^   c                   x   \ rS rSrSrSrSrSr\R                  S4S\
S\S\R                  S\\   4S	 jjrS
 rS\S\\R$                  \R$                  4   4S jr SS\R$                  S\R$                  S\S\\\\4      S\\R$                  \R$                  4   4
S jjrS\R.                  4S jrSS\\   S\4S jjrS\\\R$                     \\R$                     4   4S jr\SS\\\\R8                           SS4S jj5       rS\S\R$                  S\R.                  S\R$                  4S jrS rSr g) Zamba2HybridDynamicCached   a|  
A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
(which has a constant shape regardless of seq_len).

This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
NFconfig
batch_sizerA   devicec           
      (   X0l         UR                  U l        SU l        [        UR                  UR
                  -  5      U l        UR                  U l        UR                  U l
        UR                  U l        / U l        0 U l        0 U l        0 U l        0 U l        0 U l        [%        UR&                  5       H  n[(        R*                  " UU R                  SUR,                  -  UR                  -  -   U R                  UUS9U R                   U'   [(        R*                  " X R                  UR.                  U R                  XCS9U R"                  U'   U R                  U   S:X  d  M  U R                  R1                  U5        M     [%        UR&                  5       Vs/ sH  n[(        R2                  " / /U-  US9PM     snU l        [%        UR&                  5       Vs/ sH  n[(        R2                  " / /U-  US9PM     snU l        g s  snf s  snf )NFr>   rn   rA   hybridrn   )rA   layers_block_typehas_previous_stateintmamba_expandr6   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr0   zerosmamba_ngroupsmamba_headdimappendtensor	key_cachevalue_cache)r5   rl   rm   rA   rn   i_s          r9   r.   !Zamba2HybridDynamicCache.__init__v   s    
!'!9!9"'!$V%8%86;M;M%M!N$22 & 3 3#11"$v//0A"'++&&V-A-A)AFDXDX)XX%%#DQ "'..0D0DdFYFYbh"DOOA %%a(H4''..q1 1 SXX^XpXpRqrRqQ%,,tj'8HRqrTYZ`ZrZrTstTsqELL"
):6JTst sts   "H
"Hc                 ,    [        U R                  5      $ r,   )lenr   rf   s    r9   __len__ Zamba2HybridDynamicCache.__len__   s    4>>""r;   	layer_idxreturnc                 >    U R                   U   U R                  U   4$ r,   )r   r   r5   r   s     r9   __getitem__$Zamba2HybridDynamicCache.__getitem__   s!    ~~i($*:*:9*EEEr;   
key_statesvalue_statescache_kwargsc                 |   U R                   U   R                  S   S:X  a  XR                   U'   X R                  U'   Ob[        R                  " U R                   U   U/SS9U R                   U'   [        R                  " U R                  U   U/SS9U R                  U'   U R                   U   U R                  U   4$ )Nr?   r   r>   dim)r   rF   r   r0   cat)r5   r   r   r   r   s        r9   updateZamba2HybridDynamicCache.update   s     >>)$**2.!3(2NN9%*6Y'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr;   beam_idxc                    [        [        U R                  5      5       GHT  nU R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   GMW     g)zDReorders the cache for beam search, given the selected beam indices.r   N)	r   r   r   rn   index_selectrB   r   r   r   )r5   r   r   rn   s       r9   reorder_cache&Zamba2HybridDynamicCache.reorder_cache   s=   s4>>23I^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI& 4r;   c                     XR                   ;  a  U R                   S   OUn[        U R                  5      U::  d!  U R                  U   R                  5       S:X  a  gU R                  U   R                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )r}   r   r   numelrF   r   s     r9   get_seq_length'Zamba2HybridDynamicCache.get_seq_length   sj     3<CZCZ2ZD++A.`i	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r;   c                     [        S5      eNzAZamba2HybridDynamicCache does not have a legacy cache equivalent.NotImplementedErrorrf   s    r9   to_legacy_cache(Zamba2HybridDynamicCache.to_legacy_cache   s    !"effr;   past_key_valuesr   c                     [        S5      er   r   )clsr   s     r9   from_legacy_cache*Zamba2HybridDynamicCache.from_legacy_cache   s    !"effr;   new_conv_statecache_positionc                 N   U R                   U   nUR                  SU R                  S-
  5      nUR                  SSS9nUR	                  UR
                  5      US S 2S S 2U4'   U R                   U   R                  5         U R                   U==   U-  ss'   U R                   U   $ )Nr   r   r?   shiftsdims)r   clampr{   rollrB   rn   zero_)r5   r   r   r   
conv_states        r9   update_conv_state*Zamba2HybridDynamicCache.update_conv_state   s     %%i0
'--a1F1F1JK__BR_8
+9+<+<Z=N=N+O
1a'(#))+#z1#	**r;   c                 l    U R                   R                  5         U R                  R                  5         g r,   )r   r   r   rf   s    r9   resetZamba2HybridDynamicCache.reset   s$     r;   )r   r~   r   r{   r   rA   rt   rw   r   rs   r|   ry   r   r}   r   r,   )r   )!rV   rW   rX   rY   __doc__r   r   is_compileabler0   float16r    ru   rA   r   strr.   r   re   Tensorr   dictr   r   
LongTensorr   r   r   classmethodFloatTensorr   r   r   rZ    r;   r9   rj   rj   d   s    IKN KP--quu"u03u<AKKuaijmanu@#FS FU5<<3M-N F 26FLLF llF 	F
 tCH~.F 
u||U\\)	*F"ie&6&6 i3 3c 3guU\\':E%,,<O'O!P g guUEVEV?W9X0Y ges g g
+
+.3ll
+LQL\L\
+	
+ r;   rj   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )Zamba2RotaryEmbedding   rl   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r-   r.   hasattr
isinstancer   r   getr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrl   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r5   rl   rn   r   r8   s       r9   r.   Zamba2RotaryEmbedding.__init__   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r;   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r?   r   mpscpuF)device_typeenabledr>   r   rA   )r   floatexpandrF   rB   rn   r   r   r   r0   autocast	transposer   cosr   sinrA   )
r5   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r9   rS   Zamba2RotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   rl   r   r   r   r   r   r,   )rV   rW   rX   rY   r    r.   r0   no_gradr   rS   rZ   r[   r\   s   @r9   r   r      s6    /| / /" ]]_<  <r;   r   rK   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rF   r   reshape)rK   r   batchnum_key_value_headsslenhead_dims         r9   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr;   modulequerykeyvalueattention_maskscalingdropoutc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr>   r   r   r?   )r   rA   )ptrainingr   )r   num_key_value_groupsr0   matmulr   rF   r   rD   softmaxrC   rB   rA   r  r
  
contiguous)r  r  r  r  r  r  r  kwargsr   r   attn_weightscausal_maskattn_outputs                r9   eager_attention_forwardr    s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r;   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr?   r>   r   )rF   r0   r   )r   x1x2s      r9   rotate_halfr  "  sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r;   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer  )qkr   r   r   unsqueeze_dimq_embedk_embeds           r9   apply_rotary_pos_embr  )  sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr;   c                   X  ^  \ rS rSrSr   SS\S\\   S\\   S\\   4U 4S jjjr   SS\	R                  S\S	\\	R                     S
\\   S\\\	R                  \	R                  4      S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )Zamba2AttentioniD  a*  
Multi-headed attention from 'Attention Is All You Need' paper.

Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://huggingface.co/papers/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
rl   r   num_fwd_mem_blocksblock_idc           
        > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  UR                  -  U l	        UR                  U l
        U R                  S-  S-  U l        SU l        UR                  U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l        [        R                  " UR                  U R                  -  UR&                  SS9U l        X0l        UR,                  U l        X@l        UR2                  (       Ga  [        R4                  " / 5      U l        [        R4                  " / 5      U l        [        R4                  " / 5      U l        [=        U R*                  5       GH  nXQR>                  -  U:X  Gar  [        R@                  " [        R                  " U R                  U R                  RB                  SS9[        R                  " U R                  RB                  U R                  SS95      n[        R@                  " [        R                  " U R                  U R                  RB                  SS9[        R                  " U R                  RB                  U R                  SS95      n[        R@                  " [        R                  " U R                  U R                  RB                  SS9[        R                  " U R                  RB                  U R                  SS95      nO?[        RD                  " 5       n[        RD                  " 5       n[        RD                  " 5       nU R6                  RG                  U5        U R8                  RG                  U5        U R:                  RG                  U5        GM     [I        U R.                  5       V	V
s0 sH  u  pX_M	     sn
n	U l%        g s  sn
n	f )Nr>   g      TFbias)&r-   r.   rl   r   attention_hidden_sizeattention_head_dimr   num_attention_headsr   r  r   r  	is_causalattention_dropoutr   Linearq_projk_projv_projr6   o_projr"  hybrid_layer_idslayer_block_mapr#  use_shared_attention_adapter
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listr   num_mem_blocks
Sequentialadapter_rankIdentityr   	enumerate	layer_dic)r5   rl   r   r"  r#  r   linear_q_adapterlinear_k_adapterlinear_v_adapterindexr  r8   s              r9   r.   Zamba2Attention.__init__T  s>    	"%+%A%A"11$*$>$>&B\B\$\!'-'E'E$)d2!'!9!9ii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii < <f>X>X[_[h[h>hotuii : :T]] JFL^L^ejk"4%66 ...)+r):D&)+r):D&)+r):D&4223,,,8')}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($
 (*{{}$'){{}$'){{}$**112BC**112BC**112BC) 4, <ETEYEY;Z[;Z<5%,;Z[[s   Q.rK   r  past_key_valueposition_embeddingsr  r   c                 4   UR                   S S n/ UQSPU R                  P7nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R
                  R                  (       aT  U R                  U   nXR                  U   " U5      -   n	XR                  U   " U5      -   n
XR                  U   " U5      -   nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
UR                  U5      R                  SS5      nU R
                  R                  (       a  Uu  p[        XX5      u  pUb  UR                  XU5      u  p[         nU R
                  R"                  S:w  a  [$        U R
                  R"                     nU" U U	U
UU4U R&                  (       d  SOU R(                  U R*                  S.UD6u  nnUR,                  " / UQSP76 R/                  5       nU R1                  U5      nUU4$ )Nr?   r   r>   eager        )r  r  )rF   r   r-  r.  r/  rl   r3  r=  r5  r6  r7  rG   r   use_mem_roper  r   r  _attn_implementationr   r
  r+  r  r   r  r0  )r5   rK   r   r  rC  rD  r  input_shapehidden_shapequery_statesr   r   adapter_layer_idxr   r   attention_interfacer  r  s                     r9   rS   Zamba2Attention.forward  s    $))#2.88b8$--8{{=1[[/
{{=1;;33 $y 9'*D*DEV*WXe*ffL#&@&@AR&STa&bbJ'*D*DEV*WXe*ffL#((6@@AF__\2<<QB
#((6@@AF;;##*HC';LVY'_$L%'5'<'<ZW`'a$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r;   )r+  r'  r#  rl   r   r*  r.  r2  r=  r   r6  r5  r7  r   r"  r  r0  r-  r  r/  r$   )rV   rW   rX   rY   r   r    r   ru   r.   r0   r   rj   re   r   r   rS   rZ   r[   r\   s   @r9   r!  r!  D  s   $ $(,0"&6\6\ C=6\ %SM	6\
 3-6\ 6\x 26=AKO1)||1) 1) !.	1)
 !!9:1) &eELL%,,,F&GH1) -.1) 
u||Xell3XeELL>Q5RR	S1) 1)r;   r!  input_tensorpad_sizec                     [        U R                  5      S:X  a
  SSSSSUSS4OSSSUSS4n[        R                  R                  R                  XSSS9$ )zv
Padding x tensor with `pad_size` on the seq_len dim (dim=1)

Assumes that we only have tensors of either size 4 or 3
   r   constant)moder  )r   rF   r0   r   rD   pad)rP  rQ  	pad_shapes      r9   pad_tensor_by_sizerX    sd     47|7I7I3Ja3OAq!Q!Q/VWYZ\]_gijlmUnI88""<ST"UUr;   c                    [        X5      n [        U R                  5      S:X  a-  U R                  U R                  S   SX R                  S   5      $ U R                  U R                  S   SX R                  S   U R                  S   5      $ )z
Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and
simultaneously splitting it into chunk sequences.

Assumes that we only have tensors of either size 4 or 3
r   r   r?   r>   )rX  r   rF   r   )rP  rQ  
chunk_sizes      r9   reshape_into_chunksr[    s     &l=L
<!###L$6$6q$92zK]K]^_K`aa ##q!2z3E3Ea3H,J\J\]^J_
 	
r;   c           	      
   U R                  S5      nU S   R                  " / U R                  5       QUP76 n [        R                  " [        R                  " XU R
                  [        R                  S9SS9nU R                  U) S5      n [        R                  " U SS9n[        R                  " [        R                  " XU R
                  [        R                  S9SS9nUR                  U) [        R                  * 5      nU$ )zg
More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions.
r?   .Nrp   diagonalr   r   r   )
sizer   r0   trilr1   rn   boolmasked_fillcumsuminf)rP  rZ  masktensor_segsums       r9   segment_sumrh    s     ""2&J  	*11S<3D3D3FS
SL::ejj@S@S[`[e[efqstD++TE15LLL26M ::ejj@S@S[`[e[efqrsD!--teeiiZ@Mr;   c                     ^  \ rS rSrSrSS\S\\   4U 4S jjjr  SS\	R                  S\\   S\\	R                     4S	 jjrSS\\   S\\	R                     4S
 jjr  SS\\   S\\	R                     4S jjrSrU =r$ )Zamba2MambaMixeri  uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
rl   r   c           	        > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR                  U l        [        UR                  U R                  -  5      U l
        X l        UR                  U l        SU l        [        R                  " 5       U l        UR"                  U l        UR$                  U l        UR(                  U l        U R                  R,                  U l        UR0                  U l        UR2                  U l        UR4                  U l        UR6                  U l        U R                  SU R&                  -  U R
                  -  -   U l        [        R:                  " U R8                  U R8                  SUR                  U R8                  UR                  S-
  S9U l        U R                  U R8                  -   U R.                  -   n[        R>                  " U R                  UUR@                  S9U l!        [        RD                  " [F        RH                  " U R.                  5      5      U l%        [F        RL                  " SU R.                  S-   5      n[        RD                  " [F        RN                  " U5      5      U l(        SU RP                  l)        [U        U R                  U R                  U R&                  -  SS9U l+        [        RD                  " [F        RH                  " U R.                  5      5      U l,        SU RX                  l)        [        R>                  " U R                  U R                  UR@                  S9U l-        [\        (       d  [^        Ra                  S	5        g g )
NrE   r>   Tr   )in_channelsout_channelsr&  kernel_sizegroupspaddingr%  gh㈵>)r4   r7   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)1r-   r.   rl   r6   rx   ry   rz   r{   ru   rv   rw   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr   n_groupsr   r   r|   	num_headsrZ  time_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr,  add_bias_linearin_projr/   r0   r1   dt_biasarangelogA_log_no_weight_decayr)   normDout_projis_fast_path_availableloggerwarning_once)r5   rl   r   projection_sizeAr8   s        r9   r.   Zamba2MambaMixer.__init__  s   !--$22 & 3 3!$V%8%84;K;K%K!L"#11 779 & 7 7,,,,22 ++%55#11#11..T]]1BTEXEX1XXii++==''!+
 004==@4>>Qyy''
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#&""t/E/E/V\`
	 ejj89"&		$"8"8$:J:JQWQgQgh%%> &r;   rK   cache_paramsr  c                    UR                   u  pEnU R                  U R                  -  nSU R                  -  SU R                  -  U R                  -  -   U R                  -   nUGb%  UR
                  (       Ga  U R                  UR                  S5      5      n	U	R                   S   U-
  S-  n
XU R                  U R                  U R                  /n[        R                  " XSS9u    plp[        UUR                  U R                     U R                  R                  R                  S5      U R                  R                   U R"                  5      n[        R                  " UU R                  Xw/SS9u  pn[        R$                  " U R&                  R)                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R+                  SU R,                  U R                  5      R/                  [        R0                  S9nUS S 2S S 2S 4   R+                  SSU R,                  5      nU R2                  S S 2S S4   R+                  SU R,                  5      nU R4                  S S 2S S4   R+                  SU R,                  5      nUR7                  X@R                  UR                   S   U R                  -  5      nUR7                  X@R                  UR                   S   U R                  -  5      nUR7                  X@R                  U R,                  5      n[9        UR:                  U R                     UUUUUUS USS9
nUR7                  X@R                  U R,                  -  5      nU R=                  X5      nU R?                  U5      S S 2S S4   nU$ UbG  [        R@                  " US:H  5      (       d)  URB                  nXS S 2S S 2S 4   -  R/                  U5      nU R                  U5      n[        R$                  " U R&                  R)                  5       5      * nU RD                  c  0 OS	U RD                  0nUb  [        R@                  " US:H  5      nOSnU RF                  (       Ga   U RH                  (       a  Uc  U(       a  [K        UU R                  R                  R                  S5      U R                  R                   U R2                  U4U R4                  U RL                  S U R"                  U R<                  R                  U R<                  RN                  U R>                  R                  U R>                  R                   U R,                  U R                  S
SS.UD6u  nnU$ [        R                  " UU R                  U R                  U R                  /SS9u  pnUbv  URQ                  SS5      n[R        RT                  RW                  UU RX                  UR                   S   -
  S45      nUR                  U R                     R[                  U5        [\        b  U R"                  S;  aJ  U R_                  U R                  URQ                  SS5      5      RQ                  SS5      S S 2S U24   5      nOv[]        URQ                  SS5      U R                  R                  R                  S5      U R                  R                   U R"                  S9RQ                  SS5      S S 2S U24   n[        R                  " UU R                  Xw/SS9u  pnUbG  [        R@                  " US:H  5      (       d)  URB                  nXS S 2S S 2S 4   -  R/                  U5      n[a        UR7                  XESU R,                  5      UUUR7                  XEU R                  S5      UR7                  XEU R                  S5      4U RL                  U R4                  S S SU R2                  SS.UD6u  nnUb+  Ub(  UR:                  U R                     R[                  U5        UR7                  XES5      nU R=                  UU5      nU R?                  U5      nU$ )Nr>   r   r?   r   .r   T)zr  dt_softplusdt_limitF)r  rZ  seq_idxrr  rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rE   swish)r   r2   r&  rr  )rZ  r  r  r  r  r  r  )1rF   rv  ry   rw   rw  rt   r  squeezer{  r0   splitr&   r   r   r}  r2   r&  rr  expr  r   r   r   rB   rC   r  r  rG   r!   r   r  r  allrA   rx  ru  r
  r#   rZ  r3   r   r   rD   rV  r{   copy_r%   rt  r"   )r5   rK   r  r  rm   seq_lenr   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrL   hidden_states_B_CdtBCr  r  r  hidden_states_reshapedoutrA   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputs                                  r9   cuda_kernels_forward%Zamba2MambaMixer.cuda_kernels_forwardB  sT    "/!4!4
Q!%1D1D!D$0001t}}3DtGZGZ3ZZ]a]k]kk #(G(G(G"&,,}/D/DQ/G"H(..r2[@QFE$)$2H2H$--Y]YgYg#h 05<Okm0n-Aq) 4!((8""**1-  ! #(++!'')?X#Ma
 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2''7& M *..z>>DMM;YZM IIm:M--.q$|<Cz 
u )%))Na<O2P2P%++!.1d
1K!K O OPU V#||M:4::++-..A$($8$8$@bzSWSgSgFhO)#(99^q-@#A #' $$$<;OTd!A$KK&&..q1KK$$LL" ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(,#"$ &%"YX 
m 6;[[$++T]]DNNK62  +*;*E*Ea*K'!#!2!2+d.C.CFYF_F_`bFc.cef-g"J !,,T^^<BB:N#+tFW/W(,$5$?$?1$EFPPQRTUVWXZb[bZbWbc)% )9+55a;#{{1199!<![[--#'??	)
  i1oa'k)3% ',kk%++-C\'#!
 "-eiiRS@S6T6T)//E%2Aq$J5O%O$S$STY$ZM)B!&&zBNFF:rBFF:rB*  $ff (, LL $* &*&Y (\-E ++DNN;AA)L)..zBG"iiT:mmK0
r;   c                    UR                   u  pEnUR                  nUb2  UR                  (       a!  U R                  UR	                  S5      5      nOOUb;  [
        R                  " US:H  5      (       d  XS S 2S S 2S 4   -  R                  U5      nU R                  U5      nUR                   S   SU R                  -  -
  SU R                  -  U R                  -  -
  U R                  -
  S-  n	UR                  XU R                  U R                  U R                  /SS9u    pjpUGb  UR                  U R                     R!                  5       nUR                  UR"                  5      nUR                  (       Ga2  U
R%                  S5      n
UR&                  U R                     n[
        R(                  " USSS9nUR*                  S:X  a  US S 2SS S 24   OUUS S 2S S 2S4'   UR&                  U R                     R-                  U5        [
        R.                  " UR                  UR"                  5      U R0                  R2                  S S 2SS S 24   -  SS9nU R4                  (       a  XR0                  R6                  -  nU R9                  U5      R                  U5      S S 2S S4   nGOUR;                  SS5      n[<        R>                  RA                  UU RB                  UR                   S   -
  S45      nUR&                  U R                     R-                  U5        U R9                  U R1                  U5      R;                  SS5      5      S S 2S U2S S 24   nUbG  [
        R                  " US:H  5      (       d)  UR                  nXS S 2S S 2S 4   -  R                  U5      nO[
        RD                  " X@R                  U RF                  U R                  4UR"                  US	9nU R9                  U R1                  UR;                  SS5      5      SS U24   R;                  SS5      5      n[
        R                  " XR                  U R                  U R                  -  U R                  U R                  -  /SS9u  pn[
        RH                  " U RJ                  RM                  5       5      * nUGbq  UR                  (       Ga_  UR*                  S:X  a
  US S 2S S4   OUS S 2SS S 24   S S 2S S4   nUR;                  SS5      RO                  XLR                   S   U RF                  5      nU RP                  S
   RO                  U RP                  R                   S   U RF                  5      n[
        R<                  R>                  RS                  UUR                  UR                  5      -   5      n[
        RT                  " XRV                  5      nUS   RO                  U R                  U RF                  U R                  5      R                  [
        RX                  S9n[
        RH                  " US
   U-  5      nUR[                  X@R                  S5      SS S S 24   nURO                  X@R                  U R                  U R                  -  UR                   S   5      R]                  5       nUR[                  USUR                   S   5      nUS
   USS S S 24   -  nUR[                  USU RF                  5      nUUS
   -  nUR                  U R                     R-                  UR                  U R                     U-  U-   5        UR[                  X@R                  S5      SS S S 24   nURO                  X@R                  U R                  U R                  -  UR                   S   5      R]                  5       nUR[                  USUR                   S   5      nUR                  U R                     R                  UR                  5      nUR_                  X@R                  -  U RF                  U R                  5      nUR_                  X@R                  -  U R                  S5      n[
        R`                  " UU5      nUR_                  X@R                  U RF                  5      nU Rb                  S
   RO                  U Rb                  R                   S   U RF                  5      nUUU-  -   R                  UR                  5      nUR[                  US5      S S 2S S4   nGO'[<        R>                  RS                  XRP                  -   5      n[
        RT                  " XRV                  5      nUR[                  XESU RF                  5      RM                  5       nUR[                  XESU R                  5      RM                  5       nUR[                  XESU R                  5      RM                  5       nURe                  U R                  U R                  -  SU R                  S9nURe                  U R                  U R                  -  SU R                  S9nU Rf                  XPRf                  -  -
  U Rf                  -  nU Rb                  S
   [i        UU5      -  nXS
   -  nUR                  UR                  5      U-  nUUUU4 Vs/ sH  n[k        UUU Rf                  5      PM     snu  nnnnURm                  SSSS5      n[
        Rn                  " USS9n[
        RH                  " [q        U5      5      nUS S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n U R/                  SS9n!U!S
   URm                  SSSSS5      S
   -  n"U"R/                  SS9n#U#S
   US S 2S S 2S 4   -  R/                  S5      n$[
        RH                  " US S 2S S 2S S 2SS 24   U-
  5      n%UU%Rm                  SSSS5      S
   -  n&U&Rm                  SSSSS5      S
   URm                  SSSSS5      SS S S 24   -  R/                  SS9Rm                  SSSSS5      n'Ub3  UR                  (       a"  UR                  U R                     S S 2S S4   n(O[
        Rr                  " U'S S 2S S24   5      n([
        Rt                  " U(U'/SS9n'[
        RH                  " [q        [<        R>                  RA                  US S 2S S 2S S 2S4   S5      5      5      n)U'Rm                  SSSSS5      n*U)S   U*S S 2S S 2S S4   -  R/                  SS9n+U+Rm                  SSSSS5      n,U,S S 2S S24   U,S S 2S4   nn'[
        RH                  " U5      n-USS S S 24   U'S S 2S S 2S S4   -  n.U-Rm                  SSSS5      n/U.R/                  S5      U/S
   -  n0U$U0-   nUR[                  USU R                  U RF                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nUR[                  XES5      nUb+  Ub(  UR                  U R                     R-                  U5        U Rw                  UU
5      n1U Ry                  U1R                  U5      5      n2U2$ s  snf )Nr   r?   r>   r   r   r   r   .rp   r]  ).NNr   )r   output_sizerS  )r   r   )=rF   rA   rt   r  r  r0   r  rB   rw   rv  ry   rw  r  r{  r   r   clonern   r  r   r   ndimr  sumr}  r2   rq  r&  rt  r   r   rD   rV  r{   r   r   r  r  r   r   r  softplusr   ry  rC   r   r  rG   bmmr  repeat_interleaverZ  rX  r[  permuterd  rh  
zeros_liker   r  r  )3r5   input_statesr  r  rm   r  r   rA   r  r  rL   rK   r  r  r   r  r  r  r  dAdBdBxr   ssm_states_reshaped
C_reshapedyr  rQ  
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr  contextualized_statess3                                                      r9   torch_forwardZamba2MambaMixer.torch_forward  s[   !-!3!3
Q""#(G(G $\-A-A!-D E)%))NA<M2N2N$0!Q*3M$M#Q#QRW#XL $\ :!''+a$2H2H.HHAPTP]P]L]`d`s`sLssuy  vD  vD  D  IJ  J(8(>(>t55t~~V\^ )? )
%1M
 #$//?EEGI!]%9%9:I...~~a()55dnnE
"ZZ
2BG
ANASASWXAX}Q1W'=^k
1a8$((8>>zJ %		*--8H8O8O*PSWS^S^SeSefgijlmfmSn*ntv w%%![[%5%55M $ 7 : :5 A!T3, O - 7 7! <]]..!**]-@-@-DDaH
 ((8>>zJ $])C)M)MaPQ)R STUW_X_W_abTb c!-eiiPQ@Q6R6R)//E%2Aq$J5O%O$S$STY$ZM^^T]]D<O<OP$++5I !HHT[[1H1HA1N%OPSU]V]U]P]%^%h%hijlm%noM#kk-:P:PRVR_R_bfbubuRuw{  xE  xE  HL  H[  H[  x[  :\  bd  e!YYtzz'')**#(G(G(G &(WW\AtSL!r!Q'{1dC<7PBa#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!3!34B/"))$..$--I\I\]``glgtgt`uA2i=1,-B
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PM}Y//C ##DNN399''7"<sB 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CCAGGLJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!3!34B)11*r4==Y__aM		*D4G4GHNNPA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCAFF !99XaArsl%;h%FGL"#l&:&:1aA&Fy&Q"Q)11!Q1a@K}OdOdefhiklnoqrOstwy}  @A  uA  PB  B  G  G  LM  G  N  V  V  WX  Z[  ]^  `a  cd  eF'L,K,K"."9"9$.."I!TSV,"W"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK$nnQ1a;O!/2_Q4QT_5UUZZ_`ZaF1aA6J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A$)A''7==iHii4(
 !%knnU.C D$$I &{s   	 zc                     [         (       a@  SU R                  R                  R                  R                  ;   a  U R                  XU5      $ U R                  XU5      $ )Ncuda)r  r  r2   rn   r   r  r  )r5   rK   r  r  s       r9   rS   Zamba2MambaMixer.forward  sM     "!f0C0C0J0J0O0O&O,,].YY!!-~NNr;   )r  r  rt  rr  rZ  rl   r}  r{  r{   r  r   r6   r  rw   r   rv  r  rw  r  ry   rx  rz  ry  rq  ru  r,   r'   )rV   rW   rX   rY   r   r    r   ru   r.   r0   r   rj   r  r  rS   rZ   r[   r\   s   @r9   rj  rj    s    ?| ? ? ?H <@15	T||T 78T !.	Tn%AY8Z %qyz  {G  {G  rH %J <@15		O 78	O !.		O 	Or;   rj  c                   H   ^  \ rS rSrSS\S\\   4U 4S jjjrSS jrSr	U =r
$ )		Zamba2MLPi  rl   r#  c           
        > [         T	U ]  5         Xl        UR                  U l        UR                  U l        X l        X0l        [        R                  " U R                  SU R                  -  UR                  S9U l
        [        R                  " U R                  U R                  UR                  S9U l        [        UR                     U l        [        R                  " / 5      U l        [#        U R
                  5       H  nXAR$                  -  U:X  a  [        R&                  " [        R                  " U R                  R                  U R                  R(                  SS9[        R                  " U R                  R(                  SU R                  -  SS95      nO[        R*                  " 5       nU R                   R-                  U5        M     UR.                  n[1        U5       VVs0 sH  u  pxX_M	     snnU l        gs  snnf )a9  
This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
r>   r%  FN)r-   r.   rl   r6   rw   r"  r#  r   r,  r~  gate_up_proj	down_projr   
hidden_actact_fnr4  gate_up_proj_adapter_listr   r8  r9  r:  r;  r   r1  r<  r=  )
r5   rl   r"  r#  r   gate_up_proj_adapterr2  rA  r  r8   s
            r9   r.   Zamba2MLP.__init__  s   
 	!--!'!9!9"4 IId&6&6D<R<R8RY_YoYop4#9#94;K;KRXRhRhiV../)+r):&t../A(((H4')}}IIdkk55t{{7O7OV[\IIdkk66D<R<R8RY^_($
 (*{{}$**112FG 0 !11;D_;UV;U<5%,;UVVs   -Hc                     U R                  U5      nU R                  U   nX0R                  U   " U5      -   n[        R                  " USSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr>   r?   r   r   r   )r  r=  r  r0   chunkr  r  )r5   hidden_stater   gate_up_stateoutputs        r9   rS   Zamba2MLP.forward  s{    )),7NN9-	%(F(Fy(QR^(__M1"={{=#34}Q7GG-r;   )
r  r#  rl   r  r  r  r6   rw   r=  r"  r'   r,   )rV   rW   rX   rY   r    r   ru   r.   rS   rZ   r[   r\   s   @r9   r  r    s0    W| WPXY\P] W W< r;   r  c                   F  ^  \ rS rSrSS\S\\   S\\   4U 4S jjjr    SS\R                  S\R                  S\S\\R                     S	\\
   S
\\   S\\R                     S\\   S\\R                   \\\R                   \R                   4      4   4S jjrSrU =r$ )Zamba2AttentionDecoderLayeri  rl   r#  r   c                 "  > [         TU ]  5         X l        [        UR                  5      n[        USXBS9U l        [        XUS9U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr?   )r   r"  r#  )r"  r#  r7   )r-   r.   r#  r   r1  r!  	self_attnr  feed_forwardr^   r'  rms_norm_epsinput_layernormr6   pre_ff_layernorm)r5   rl   r#  r   num_gsr8   s        r9   r.   $Zamba2AttentionDecoderLayer.__init__  sz     V,,-(2RXl%fRZ[,V-I-IvObObc -f.@.@fFYFY Zr;   rK   original_hidden_statesr  rC  output_attentionsrD  r  r   c           
          [         R                  " X/SS9nU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R	                  U5      nU R                  X5      nU4n
U(       a  X4-  n
U
$ )aj  
Args:
    hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
        This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
        concatenated tensor is then used as input of the pre-attention RMSNorm
        (see fig. 2 in https://huggingface.co/papers/2405.16712).
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
r?   r   )rK   r   r  rC  r  rD  r   )r0   concatenater  r  r  r  )r5   rK   r  r   r  rC  r  rD  r  self_attn_weightsoutputss              r9   rS   #Zamba2AttentionDecoderLayer.forward  s    > ))=*QWYZ,,];+/>> ,
'))/ 3,
 ,
( --m<))-C "++Gr;   )r#  r  r  r  r  r'   )NNFN)rV   rW   rX   rY   r    r   ru   r.   r0   r   rj   rb  r   r   r   re   r   rS   rZ   r[   r\   s   @r9   r  r    s    [| [x} [X`adXe [ [ 26=A,1:>3||3 !&3 	3
 !.3 !!9:3 $D>3 &e&6&673 -.3 
u  (51B1BEDUDU1U+V"WW	X3 3r;   r  c                     ^  \ rS rSrS\S\4U 4S jjr         SS\R                  S\	\R                     S\	\   S\	\R                     S\	\R                     S	\	\
   S
\	\   S\	\   S\	\R                     S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )Zamba2MambaDecoderLayeri  rl   r   c                    > [         TU ]  5         [        XS9U l        [	        UR
                  UR                  S9U l        X l        g )N)rl   r   r  )	r-   r.   rj  mambar^   r6   r  r  r   )r5   rl   r   r8   s      r9   r.    Zamba2MambaDecoderLayer.__init__  s:    %VI
,V-?-?VEXEXY"r;   rK   r  r  r  rC  r  	use_cacher   transformer_hidden_statesr   c                     UnU
b  X-   OUnU R                  U5      nU R                  UUUS9nSnX-   nU4nU(       a  X4-  nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence.
N)rK   r  r  )r  r  )r5   rK   r  r   r  r  rC  r  r
  r   r  r  residualr  r  s                  r9   rS   Zamba2MambaDecoderLayer.forward  s    < !
 :S9^M5dq 	 ,,];

'') # 
 ! !0 "++G((Gr;   )r  r   r  )	NNNNNFFNN)rV   rW   rX   rY   r    ru   r.   r0   r   r   rj   rb  r   re   r   rS   rZ   r[   r\   s   @r9   r  r    s   #| # # :>#'15.2=A,1$)59<@:||: !) 6: C=	:
 !.: ell+: !!9:: $D>: D>: !!1!12: $,ELL#9: 
u  (51B1BEDUDU1U+V"WW	X: :r;   r  c                   |  ^  \ rS rSrS\S\R                  S\4U 4S jjr        SS\	R                  S\\	R                     S\\   S	\\	R                     S
\\	R                     S\\   S\\   S\\   S\\	R                     S\\	R"                  \\\	R"                  \	R"                  4      4   4S jjrSrU =r$ )Zamba2HybridLayeriU  shared_transformerlinearr  c                 F   > [         TU ]  5         X l        X0l        Xl        g r,   )r-   r.   r  mamba_decoderr  )r5   r  r  r  r8   s       r9   r.   Zamba2HybridLayer.__init__V  s!     	""4r;   rK   r  r   r  r  rC  r  r
  rD  r   c
           
          U R                  UUUUUUU	S9n
U
S   nU(       a  U
S   nU R                  U5      nU R                  UUUUUUU	S9n
U(       a  U
S   W4U
SS -   n
U
$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
    hidden activations to form the input of the shared transformer layer.
    layer_idx (`int`): layer number.
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
)r  r   r  rC  r  rD  r   r   )r  r  rC  r  r
  rD  r>   N)r  r  r  )r5   rK   r  r   r  r  rC  r  r
  rD  layer_outputsr  r  s                r9   rS   Zamba2HybridLayer.forward^  s    @ //#9&)/ 3 0 
 %2!$4! -a 0$(KK0I$J!**&?))/ 3 + 
 *1-/@AMRSRTDUUMr;   )r  r  r  )NNNNNFFN)rV   rW   rX   rY   r  r   r,  r  r.   r0   r   r   ru   rj   rb  r   re   r   rS   rZ   r[   r\   s   @r9   r  r  U  s   5"=5GIyy5Yp5 :>#'15.2=A,1$):>>||> !) 6> C=	>
 !.> ell+> !!9:> $D>> D>> &e&6&67> 
u  (51B1BEDUDU1U+V"WW	X> >r;   r  c                   X   ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSrSrU 4S jrS	rU =r$ )
Zamba2PreTrainedModeli  rl   modelTr  r  r   c                   > [         TU ]  U5        [        U[        5      (       Ga  [        R
                  " [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      -
  -  [        R                  " U R                  R                  5      -   5      R                  U R                  R                  S9nU[        R                  " [        R                  " U* 5      * 5      -   nUR                   R"                  R%                  U5        [        R&                  " SUR(                  S-   5      nUR*                  R"                  R%                  [        R                  " U5      5        UR,                  R"                  R/                  S5        g g )N)minr   g      ?)r-   _init_weightsr   rj  r0   r  randrl   r|   mathr  rz  ry  r   time_step_floorexpm1r  datar  r  rw  r  r  fill_)r5   r  r  inv_dtr  r8   s        r9   r  #Zamba2PreTrainedModel._init_weights  s=   f%f.//

4;;44588DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN%%f-Q 0 01 45ALL##EIIaL1HHMM$ 0r;   r   )rV   rW   rX   rY   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr  rZ   r[   r\   s   @r9   r  r    sG    &*#68QR"3NL% %r;   r  c                   B  ^  \ rS rSrSrS\4U 4S jjr\          SS\\	R                     S\\	R                     S\\	R                     S\\   S	\\	R                     S
\\   S\\   S\\   S\\   S\\	R                     S\\\4   4S jj5       rS rS rSrU =r$ )Zamba2Modeli  zX
Model consisting of *config.num_hidden_layers* layers.

Args:
    config: Zamba2Config
rl   c           	        > [         TU ]  U5        Xl        UR                  U l        UR
                  U l        [        R                  " UR
                  UR                  U R                  5      U l	        [        UR                  5       Vs/ sH  n[        XS9PM     nn/ n/ nUR                  U l        [        UR                  5       H  nUR                  U   S:X  a  UR                  [!        XS95        M0  UR                  U   S:X  d  ME  UR                  [        R"                  " U R                  R                  U R                  R                  SS95        UR                  [!        XS95        M     [%        U5      n[%        U5      n['        U5      nU R)                  X5U5      n[        R*                  " U5      U l        UR.                  U l        [1        UR                  UR2                  S9U l        UR6                  (       a6  UR8                  (       a  [:        R=                  S5        [?        U5      U l         SU l!        U RE                  5         g s  snf )	N)r#  r  r   rq   Fr%  r  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.)#r-   r.   rl   pad_token_idpadding_idx
vocab_sizer   	Embeddingr6   embed_tokensr   r8  r  rs   r   r   r  r,  iterr   
get_layersr4  layersrI  r^   r  final_layernormrH  use_long_contextr  r  r   
rotary_embgradient_checkpointing	post_init)	r5   rl   r  blocksmamba_layerslinear_layersr   r;  r8   s	           r9   r.   Zamba2Model.__init__  s    !.. ++LL):):F<N<NPTP`P`aKPQWQfQfKghKga-fAKgh!'!9!9v//0A''*g5##$;F$PQ))!,8$$RYYt{{/F/FH_H_fk%lm##$;F$PQ 1 L)]+vEmmF+$*$?$?!,V-?-?VEXEXY&&##{ 4F;DO&+# 	7 is   I	input_idsr  r   r   inputs_embedsr
  r  output_hidden_statesreturn_dictr   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	US L US L-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nUn[        R                  " U5      nU(       aM  UcJ  Ub  UR                  S   OUR                  S   n[        U R                   XR                  U R                   S9nU
cM  Ub  UR#                  U R$                  S9OSn[        R&                  " XUR                  S   -   UR                   S9n
Uc  U
R)                  S5      nU R+                  X%U
5      nU R                   R,                  (       a  U R/                  X5      nOS nU(       a  S	OS nU(       a  S	OS n[1        U R2                  5       H  u  nnU(       a  UU4-  nU R                  (       a6  U R                  (       a%  U R5                  UR6                  UUUUUUUUU5
      nOU" UUUUUUUUUS
9	nUS   nU(       d  Mv  US   c  M~  UUS   4-  nM     U R9                  U5      nU(       a  UU4-  nUb  UR:                  (       d  SUl        [=        UU(       a  UOS UUS9nU	(       a  U$ UR?                  5       $ )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   rA   rn   r3  r   rr   r   )r  r   r  r  rC  r  r
  rD  T)last_hidden_stater   rK   
attentions) rl   r  rG  r
  use_return_dict
ValueErrorr?  r
  r  r  r8  r0   r  rF   rj   rA   rn   r   first_transformer_layer_idr  r  _update_causal_maskrH  r>  r<  r;  _gradient_checkpointing_func__call__r<  rt   r   to_tuple)r5   rE  r  r   r   rF  r
  r  rG  rH  r   rK   r  rm   past_seen_tokensr  rD  all_hidden_statesall_self_attnsr   layerr  r  s                          r9   rS   Zamba2Model.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<s  &&4==Yj I  --i8M%!&]!; 0/8/D+-J]J]^_J`J6t{{JV`V`imitituO! #.  ..9X9X.Y 
 #\\ ]5H5H5K"KTaThThN )33A6L..~n] ;;##"&//-"N"&"6BD0d )$++ 6Iu#!m%55!**t}} $ A ANN!*"#%'! !&!+A'#1 +#2&7'(;
! *!,M   #/"}Q'7&99NE !7H ,,];  -!11&/Q/Q15O.(+/8Od+%	
 %v;&//*;;r;   c                    U R                   R                  S:X  a  Ub  SU;   a  U$ g UR                  UR                  pT[        R
                  " U5      R                  nUR                  S   nUS   S-   n[        R                  " Xx4XdUS9n	US:w  a  [        R                  " U	SS9n	U	[        R                  " XS9UR                  SS5      :  -  n	U	S S S S 2S S 24   R                  UR                  S   SSS5      n	Ub  U	R                  5       n	UR                  5       S	:X  ac  UR                  S   n
U	S
S U
24   R                  S5      US S 2S S S S 24   R                  S5      -  nU	S
S U
24   R!                  X5      U	S
S U
24'   U R                   R                  S:X  a3  Ub0  UR                  R"                  S;   a  [$        R&                  " X5      n	U	$ )Nflash_attention_2rG  r   r?   )
fill_valuerA   rn   r^  rr   r   r>   .sdpa)r  xpunpu)rl   rI  rA   rn   r0   finfor  rF   fulltriur  r   r   r  r   eqrc  r   r   _unmask_unattended)r5   r  rP  r   rA   rn   	min_dtypesequence_lengthtarget_lengthr  mask_lengthpadding_masks               r9   rP  Zamba2Model._update_causal_maskb  s   ;;++/BB)c^.C%%$**L,?,?vKK&**	&,,Q/&r*Q.jj/!Aimsta**[1=Ku||MANDZDZ[]_`Daaa!$a"23::<;M;Ma;PRSUWY[\%%++-K!!#q(,2226*3+<=@@EWXZ^`dfgWgHhHkHkloHpp1<S,;,=N1O1[1[\h1tC+-. KK,,6*%%**.DD
 1CCK[Kr;   c           
         / n/ U l         SU l        [        U R                  5       GH!  u  pVUS:X  Ga  U R                  S:X  a  XPl        [	        U5      nU R
                  R                  [        U R
                  R                  5      -  S:  Gam  SU S3n[        R                  " US-   S-   S-   S	-   S
-   5      n	U R                   R                  U	5        Sn
U R                   Ht  nUS:X  af  XR
                  R                  -  UR                  :X  a@  [        R                  " S[        U
5      -   S-   5      nU R                   R                  U5        U
S-  n
Mv     U R
                  R                  (       a  Sn
U R                   Ht  nUS:X  af  XR
                  R                  -  UR                  :X  a@  [        R                  " S[        U
5      -   S-   5      nU R                   R                  U5        U
S-  n
Mv     UR                  [        U[	        U5      [	        U5      5      5        GM  UR                  [	        U5      5        GM$     U$ )Nr   rq   r   z	^layers\.z\.shared_transformer\.z(?:z3self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|z1feed_forward\.(?:gate_up_proj|down_proj)\.weight|z,(?:input_layernorm|pre_ff_layernorm)\.weightz)$z>^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\.z\.(?:0|1)\.weight$zg^shared_transformer\.self_attn\.(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\.)_tied_weights_keysrO  r<  rs   nextrl   r8  r   r1  recompiler   r#  r   r3  r  )r5   rA  rC  rB  r;  layer_id
layer_typeblockprefix_patternmain_keys_pattern
adapter_id_layer_typeadapter_patternattn_adapter_patterns                 r9   r:  Zamba2Model.get_layers  s3   "$*+'$-d.D.D$E HX%22a76>3V;;--DKK4P4P0QQTUU(1(;Q%RN(*

& !PQ OO J	J
   )% ++223DE!"J'+'='=&(2zKKD^D^7^bgbpbp7p.0jj a"%j/!2"7!8/O
 !33::?K"a
 (> {{??%&
+/+A+AK*h6:HbHb;bfkftft;t79zz%q&)*o%6 '<%<8" 4 !% 7 7 > >?S T&!OJ ,B /tM7JDQ]L^_`d<01S %FT r;   )rI  rk  rl   r8  r<  rO  r?  r;  rs   r5  r>  r6  
NNNNNNNNNN)rV   rW   rX   rY   r   r    r.   r   r   r0   r   r   rj   r   rb  r   re   r   rS   rP  r:  rZ   r[   r\   s   @r9   r1  r1    s)   "| "H  151537>B59$(,0/3&*59v<E,,-v< !.v< u//0	v<
 "":;v<   1 12v< D>v< $D>v< 'tnv< d^v< !!1!12v< 
u--	.v< v<p!F. .r;   r1  c                     ^  \ rS rSrS\4U 4S jjrS rS r\            SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\   S
\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\
R                  4   S\\\4   4S jj5       r      SS jrSrU =r$ )Zamba2ForCausalLMi  rl   c                    > [         TU ]  U5        [        U5      U l        S/U R                  R                  QU l        UR
                  U l        [        R                  " UR                  UR
                  SS9U l	        U R                  5         g )Nzlm_head.weightFr%  )r-   r.   r1  r  rk  r6  r   r,  r6   lm_headr@  r5   rl   r8   s     r9   r.   Zamba2ForCausalLM.__init__  so      (
#3"Tdjj6S6S"T ++yy!3!3V5F5FUS 	r;   c                     Xl         g r,   r  )r5   decoders     r9   set_decoderZamba2ForCausalLM.set_decoder  s    
r;   c                     U R                   $ r,   r  rf   s    r9   get_decoderZamba2ForCausalLM.get_decoder  s    zzr;   rE  r  r   r   rF  labelsr
  r  rG  rH  r   logits_to_keepr   c                 .   Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
U R	                  UUUUUUUU	UU
S9
nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb  U R                  " UX`R                  40 UD6nU
(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, Zamba2ForCausalLM

>>> model = Zamba2ForCausalLM.from_pretrained("Zyphra/Zamba2-7B-v1")
>>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba2-7B-v1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)
rE  r  r   r   rF  r
  r  rG  r   rH  r   r   losslogitsr   rK   rL  )rl   r  rG  rM  r  r   ru   slicer}  loss_functionr6  r   r   rK   rL  )r5   rE  r  r   r   rF  r  r
  r  rG  rH  r   r  r  r  rK   slice_indicesr  r  r  s                       r9   rS   Zamba2ForCausalLM.forward  sK   P 2C1N-TXT_T_TqTq %9$D $++JjJj 	 &1%<k$++B]B] **)%+'/!5)#  
  
8B>SV8W8W~ot4]kmA}a,?@A%%ffooPPDY,F'+'7D7V#CVC%#33!//))
 	
r;   c           	         US L n	U	(       d]  Uc  US   UR                   S   :  a  US S 2UR                   S   * S 24   nOaUR                   S   UR                   S   :w  a	  US S 2U4   nO7[        U R                  UR                   S   U R                  U R                  S9nUbZ  UcW  UR                  5       R                  S5      S-
  nUR                  US:H  S5        U	(       d  US S 2UR                   S   * S 24   nUb  U	(       a  SU0n
OSUR                  5       0n
U
R                  UUUUU R                  R                  US.5        U
$ )Nr?   r   r   rJ  rF  rE  )r   r   r
  r  r  r   )rF   rj   rl   rA   rn   longrd  masked_fill_r  r   num_logits_to_keep)r5   rE  r   r  rF  r   r   r
  r  empty_past_kvmodel_inputss              r9   prepare_inputs_for_generation/Zamba2ForCausalLM.prepare_inputs_for_generation  sd    (4/  )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	6Y__Q/tzz$++O %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"&++"@"@"0		
 r;   )rk  r}  r  r6  )NNNNNNNNNNNr   )NNNNNT)rV   rW   rX   rY   r    r.   r  r  r   r   r0   r   r   rj   r   rb  r   ru   re   r   rS   r  rZ   r[   r\   s   @r9   r{  r{    sp   |   151537>B59-1$(,0/3&*5934O
E,,-O
 !.O
 u//0	O

 "":;O
   1 12O
 ))*O
 D>O
 $D>O
 'tnO
 d^O
 !!1!12O
 c5<</0O
 
u,,	-O
 O
h 9 9r;   r{  a  
    The Zamba2 Model with a sequence classification head on top (linear layer).

    [`Zamba2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    )custom_introc                   N  ^  \ rS rSrU 4S jr\          SS\\R                     S\\R                     S\\R                     S\\
\\\R                     4      S\\R                     S\\R                     S	\\   S
\\   S\\   S\\   S\
\\4   4S jj5       rSrU =r$ )Zamba2ForSequenceClassificationiV  c                   > [         TU ]  U5        UR                  U l        [        U5      U l        U R                  R
                  U l        [        R                  " UR                  U R                  SS9U l	        U R                  5         g )NFr%  )r-   r.   
num_labelsr1  r  rk  r   r,  r6   scorer@  r~  s     r9   r.   (Zamba2ForSequenceClassification.__init__e  se      ++ (
"&**"?"?YYv114??O
 	r;   rE  r  r   r   rF  r  r
  r  rG  rH  r   c                    U
b  U
OU R                   R                  n
U R                  UUUUUUUU	U
S9	nUS   nU R                  U5      nUb  UR                  S   nOUR                  S   nU R                   R
                  c  US:w  a  [        S5      eU R                   R
                  c  SnOUb  XR                   R
                  :g  R                  UR                  [        R                  5      n[        R                  " UR                  S   UR                  [        R                  S9nUU-  R                  S5      nO.Sn[        R                  U R                  R                    S35        U[        R                  " XR                  S	9U4   nSnUGb  UR                  UR                  5      nU R                   R"                  c  U R$                  S:X  a  S
U R                   l        OoU R$                  S:  aN  UR&                  [        R(                  :X  d  UR&                  [        R*                  :X  a  SU R                   l        OSU R                   l        U R                   R"                  S
:X  aJ  [-        5       nU R$                  S:X  a&  U" UR/                  5       UR/                  5       5      nOU" UU5      nOU R                   R"                  S:X  a=  [1        5       nU" UR3                  SU R$                  5      UR3                  S5      5      nO-U R                   R"                  S:X  a  [5        5       nU" UU5      nU
(       d  U4USS -   nUb  U4U-   $ U$ [7        UUUR8                  UR:                  UR<                  S9$ )ae  
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
N)r  r   r   rF  r
  r  rG  rH  r   r   z=Cannot handle batch sizes > 1 if no padding token is defined.r?   rp   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`rr   
regressionsingle_label_classificationmulti_label_classificationr  )rl   rM  r  r  rF   r4  rN  rB   rn   r0   int32r  argmaxr  r  r8   rV   problem_typer  rA   r  ru   r   r  r
   rG   r	   r   r   rK   rL  )r5   rE  r  r   r   rF  r  r
  r  rG  rH  transformer_outputsrK   r  rm   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr  s                         r9   rS   'Zamba2ForSequenceClassification.forwardo  s   ( &1%<k$++B]B]"jj)%+'/!5# ) 

 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||J}}MOaabYYv}}-F{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+- 2 22t GUWY))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r;   )rk  r  r  r  ry  )rV   rW   rX   rY   r.   r   r   r0   r   r   r   r   listr   rb  re   r   rS   rZ   r[   r\   s   @r9   r  r  V  s     151537KO59-1$(,0/3&*[
E,,-[
 !.[
 u//0	[

 "%tE4E4E/F(F"GH[
   1 12[
 ))*[
 D>[
 $D>[
 'tn[
 d^[
 
u66	7[
 [
r;   r  )r{  r  r1  r  )rG  )Nr   )Ur   rm  	itertoolsr   typingr   r   r   r   r0   r   torch.nnr	   r
   r   activationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   utils.import_utilsr   r   configuration_zamba2r    +mamba_ssm.ops.triton.selective_state_updater!   !mamba_ssm.ops.triton.ssd_combinedr"   r#   causal_conv1dr%   r&   
get_loggerrV   r  Moduler)   r^   rj   r   r   ru   r   r   r  r  r  r!  rX  r[  rh  r  r  rj  r  r  r  r  r  r1  r{  r  __all__r   r;   r9   <module>r     s  ,  	  1 1   A A ! . ) > B q q K F & , T . RmmZjW57WDD-7**			H	%; ;*JBII J(s u s l<BII <D	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %4(6y)bii y)~VU\\ VS V
(( 46FH\]^ kOryy kO\'		 'T=")) =@Abii AHG		 GT%O %: v' v vt\- \~ g
&; g
g
T kr;   