
    <h                     N   S SK r S SKrS SKJr  S SKJrJrJr  S SKrS SK	rS SKJ
r
  SSKJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJr  SSKJrJr  SSKJrJ r J!r!  SSK"J#r#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+J,r,  SSK-J.r.  \" 5       (       a  S SK/J0r0  S SK1J2r2J3r3  OSu  r0r2r3\" 5       (       a	  S SK4J5r5J6r6  OSu  r6r5\7" \0\5\645      r8Sr9\Rt                  " \;5      r< " S S\R                  Rz                  5      r> " S S\+5      r? " S S\'5      r@ " S S \5      rA " S! S"\#5      rB " S# S$\
Rz                  5      rC " S% S&\
Rz                  5      rD " S' S(\$5      rE " S) S*\)5      rF " S+ S,\(5      rG " S- S.\5      rH " S/ S0\*\H5      rI " S1 S2\%5      rJ " S3 S4\&5      rK/ S5QrLg)6    N)cycle)CallableOptionalUnion)nn   )ACT2FN)FlashAttentionKwargs)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)logging)is_causal_conv1d_availableis_mamba_ssm_available   )LlamaRotaryEmbeddingapply_rotary_pos_emb)pad_tensor_by_sizereshape_into_chunkssegment_sum)
ZambaAttentionZambaAttentionDecoderLayerZambaForCausalLMZambaForSequenceClassificationZambaHybridDynamicCacheZambaHybridLayerZambaMambaDecoderLayer
ZambaModelZambaRMSNormeager_attention_forward   )Zamba2Config)selective_state_update)mamba_chunk_scan_combined mamba_split_conv1d_scan_combinedNNN)causal_conv1d_fncausal_conv1d_updateNNzZyphra/Zamba2-2.7Bc                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )Zamba2RMSNormGatedI   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X0l        X l        g N)	super__init__r   	Parametertorchonesweightvariance_epsilon
group_size)selfhidden_sizer7   eps	__class__s       a/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/zamba2/modular_zamba2.pyr1   Zamba2RMSNormGated.__init__J   s2    ll5::k#:; #$    c                 X   UR                   nUR                  [        R                  5      nUb?  U[        R
                  R                  UR                  [        R                  5      5      -  nUR                  Gt pEXPR                  -  nUR                  " / UQUPU R                  P76 nUR                  S5      R                  SSS9nU[        R                  " XR                  -   5      -  nUR                  " / UQX`R                  -  P76 nU R                  UR                  U5      -  $ )Nr   T)keepdim)dtypetor3   float32r   
functionalsilushaper7   viewpowmeanrsqrtr6   r5   )	r8   hidden_statesgateinput_dtypeprefix_dimslast_dimgroup_counthidden_states_groupvariances	            r<   forwardZamba2RMSNormGated.forwardP   s    #))%((7)BMM,>,>twwu}}?U,VVM!.!4!4//1+00\+\{\DOO\&**1-222t2D1EKKK`K`@`4aa+00]+]{__?\]{{]--k:::r>   )r7   r6   r5   )gư>r/   )__name__
__module____qualname____firstlineno__r1   rT   __static_attributes____classcell__r;   s   @r<   r,   r,   I   s    %; ;r>   r,   c                       \ rS rSrSrg)Zamba2RMSNorm^    NrV   rW   rX   rY   rZ   r`   r>   r<   r^   r^   ^       r>   r^   c            
           \ rS rSrSr\R                  S4S\S\S\R                  S\
\   4S jjrS	\S
\R                  S\R                  S\R                  4S jrS rSS	\
\   S\4S jjrSrg)Zamba2HybridDynamicCacheb   a|  
A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
(which has a constant shape regardless of seq_len).

This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
Nconfig
batch_sizerB   devicec           
      (   X0l         UR                  U l        SU l        [        UR                  UR
                  -  5      U l        UR                  U l        UR                  U l
        UR                  U l        / U l        0 U l        0 U l        0 U l        0 U l        0 U l        [%        UR&                  5       H  n[(        R*                  " UU R                  SUR,                  -  UR                  -  -   U R                  UUS9U R                   U'   [(        R*                  " X R                  UR.                  U R                  XCS9U R"                  U'   U R                  U   S:X  d  M  U R                  R1                  U5        M     [%        UR&                  5       Vs/ sH  n[(        R2                  " / /U-  US9PM     snU l        [%        UR&                  5       Vs/ sH  n[(        R2                  " / /U-  US9PM     snU l        g s  snf s  snf )NFr   rh   rB   hybridrh   )rB   layers_block_typehas_previous_stateintmamba_expandr9   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headstransformer_layers_modules_parameters_buffersconv_states
ssm_statesrangenum_hidden_layersr3   zerosmamba_ngroupsmamba_headdimappendtensor	key_cachevalue_cache)r8   rf   rg   rB   rh   i_s          r<   r1   !Zamba2HybridDynamicCache.__init__p   s    
!'!9!9"'!$V%8%86;M;M%M!N$22 & 3 3#11"$v//0A"'++&&V-A-A)AFDXDX)XX%%#DQ "'..0D0DdFYFYbh"DOOA %%a(H4''..q1 1 SXX^XpXpRqrRqQ%,,tj'8HRqrTYZ`ZrZrTstTsqELL"
):6JTst sts   "H
"H	layer_idxnew_conv_statecache_positionreturnc                 N   U R                   U   nUR                  SU R                  S-
  5      nUR                  SSS9nUR	                  UR
                  5      US S 2S S 2U4'   U R                   U   R                  5         U R                   U==   U-  ss'   U R                   U   $ )Nr   r"   r@   shiftsdims)r{   clampru   rollrC   rh   zero_)r8   r   r   r   
conv_states        r<   update_conv_state*Zamba2HybridDynamicCache.update_conv_state   s     %%i0
'--a1F1F1JK__BR_8
+9+<+<Z=N=N+O
1a'(#))+#z1#	**r>   c                 l    U R                   R                  5         U R                  R                  5         g r/   )r{   r   r|   )r8   s    r<   resetZamba2HybridDynamicCache.reset   s$     r>   c                     XR                   ;  a  U R                   S   OUn[        U R                  5      U::  d!  U R                  U   R                  5       S:X  a  gU R                  U   R                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rw   lenr   numelrG   )r8   r   s     r<   get_seq_length'Zamba2HybridDynamicCache.get_seq_length   sj     3<CZCZ2ZD++A.`i	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r>   )rz   rx   ry   ru   r{   rB   rn   rq   r   rm   rv   rs   r|   rw   r   )r   )rV   rW   rX   rY   __doc__r3   float16r#   ro   rB   r   strr1   Tensor
LongTensorr   r   r   rZ   r`   r>   r<   rd   rd   b   s     KP--quu"u03u<AKKuaijmanu@
+
+.3ll
+LQL\L\
+	
+ 3 3c 3 3r>   rd   c                       \ rS rSrSrg)Zamba2RotaryEmbedding   r`   Nra   r`   r>   r<   r   r      rb   r>   r   c                   X  ^  \ rS rSrSr   SS\S\\   S\\   S\\   4U 4S jjjr   SS\	R                  S\S	\\	R                     S
\\   S\\\	R                  \	R                  4      S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )Zamba2Attention   a*  
Multi-headed attention from 'Attention Is All You Need' paper.

Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
(see fig. 2 in https://huggingface.co/papers/2405.16712).
Additionally, replaced
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
Finally, this attention layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this
layer is tied, un-tied adapters (formally the same as LoRA but used in the base model) modules are added to the q, k, v projectors to increase
expressivity with a small memory overhead (see Fig. 2 of https://huggingface.co/papers/2411.15242).
rf   r   num_fwd_mem_blocksblock_idc           
        > [         TU ]  X5        X0l        UR                  U l        X@l        UR                  (       Ga  [        R                  " / 5      U l	        [        R                  " / 5      U l
        [        R                  " / 5      U l        [        U R                  5       GH  nXQR                  -  U:X  Gar  [        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      n[        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      n[        R                  " [        R                  " U R                   U R"                  R$                  SS9[        R                  " U R"                  R$                  U R                   SS95      nO?[        R&                  " 5       n[        R&                  " 5       n[        R&                  " 5       nU R                  R)                  U5        U R                  R)                  U5        U R                  R)                  U5        GM     [+        U R                  5       V	V
s0 sH  u  pX_M	     sn
n	U l        g s  sn
n	f )NFbias)r0   r1   r   hybrid_layer_idslayer_block_mapr   use_shared_attention_adapterr   
ModuleListlinear_q_adapter_listlinear_k_adapter_listlinear_v_adapter_listr}   num_mem_blocks
SequentialLinearattention_hidden_sizerf   adapter_rankIdentityr   	enumerate	layer_dic)r8   rf   r   r   r   r   linear_q_adapterlinear_k_adapterlinear_v_adapterindexvaluer;   s              r<   r1   Zamba2Attention.__init__   s    	+"4%66 ...)+r):D&)+r):D&)+r):D&4223,,,8')}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($ (*}}		$"<"<dkk>V>V]bc		$++":":D<V<V]bc($
 (*{{}$'){{}$'){{}$**112BC**112BC**112BC) 4, <ETEYEY;Z[;Z<5%,;Z[[s   K3rL   attention_maskpast_key_valueposition_embeddingskwargsr   c                 4   UR                   S S n/ UQSPU R                  P7nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R
                  R                  (       aT  U R                  U   nXR                  U   " U5      -   n	XR                  U   " U5      -   n
XR                  U   " U5      -   nU	R                  U5      R                  SS5      n	U
R                  U5      R                  SS5      n
UR                  U5      R                  SS5      nU R
                  R                  (       a  Uu  p[        XX5      u  pUb  UR                  XU5      u  p[         nU R
                  R"                  S:w  a  [$        U R
                  R"                     nU" U U	U
UU4U R&                  (       d  SOU R(                  U R*                  S.UD6u  nnUR,                  " / UQSP76 R/                  5       nU R1                  U5      nUU4$ )Nr@   r"   r   eagerg        )dropoutscaling)rG   head_dimq_projk_projv_projrf   r   r   r   r   r   rH   	transposeuse_mem_roper   updater!   _attn_implementationr   trainingattention_dropoutr   reshape
contiguouso_proj)r8   rL   r   r   r   r   r   input_shapehidden_shapequery_states
key_statesvalue_statesadapter_layer_idxcossinattention_interfaceattn_outputattn_weightss                     r<   rT   Zamba2Attention.forward   s    $))#2.88b8$--8{{=1[[/
{{=1;;33 $y 9'*D*DEV*WXe*ffL#&@&@AR&STa&bbJ'*D*DEV*WXe*ffL#((6@@AF__\2<<QB
#((6@@AF;;##*HC';LVY'_$L%'5'<'<ZW`'a$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r>   )r   r   r   r   r   r   r   r'   )rV   rW   rX   rY   r   r#   r   ro   r1   r3   r   rd   tupler   r
   rT   rZ   r[   r\   s   @r<   r   r      s   $ $(,0"&'\'\ C='\ %SM	'\
 3-'\ '\Z 26=AKO1)||1) 1) !.	1)
 !!9:1) &eELL%,,,F&GH1) -.1) 
u||Xell3XeELL>Q5RR	S1) 1)r>   r   c                     ^  \ rS rSrSrSS\S\\   4U 4S jjjr  SS\	R                  S\\   S\\	R                     4S	 jjrSS\\   S\\	R                     4S
 jjr  SS\\   S\\	R                     4S jjrSrU =r$ )Zamba2MambaMixeri  uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
rf   r   c           	        > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR                  U l        [        UR                  U R                  -  5      U l
        X l        UR                  U l        SU l        [        R                  " 5       U l        UR"                  U l        UR$                  U l        UR(                  U l        U R                  R,                  U l        UR0                  U l        UR2                  U l        UR4                  U l        UR6                  U l        U R                  SU R&                  -  U R
                  -  -   U l        [        R:                  " U R8                  U R8                  SUR                  U R8                  UR                  S-
  S9U l        U R                  U R8                  -   U R.                  -   n[        R>                  " U R                  UUR@                  S9U l!        [        RD                  " [F        RH                  " U R.                  5      5      U l%        [F        RL                  " SU R.                  S-   5      n[        RD                  " [F        RN                  " U5      5      U l(        SU RP                  l)        [U        U R                  U R                  U R&                  -  SS9U l+        [        RD                  " [F        RH                  " U R.                  5      5      U l,        SU RX                  l)        [        R>                  " U R                  U R                  UR@                  S9U l-        [\        (       d  [^        Ra                  S	5        g g )
NrF   r   Tr"   )in_channelsout_channelsr   kernel_sizegroupspaddingr   gh㈵>)r7   r:   a  The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)` is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d)1r0   r1   rf   r9   rr   rs   rt   ru   ro   rp   rq   r   use_conv_bias
activationr   SiLUactuse_mem_eff_pathr   n_groupsr   r   rv   	num_heads
chunk_sizetime_step_limittime_step_mintime_step_maxconv_dimConv1dconv1dr   add_bias_linearin_projr2   r3   r4   dt_biasarangelogA_log_no_weight_decayr,   normDout_projis_fast_path_availableloggerwarning_once)r8   rf   r   projection_sizeAr;   s        r<   r1   Zamba2MambaMixer.__init__"  s   !--$22 & 3 3!$V%8%84;K;K%K!L"#11 779 & 7 7,,,,22 ++%55#11#11..T]]1BTEXEX1XXii++==''!+
 004==@4>>Qyy''
 ||EJJt~~$>? LLDNNQ./\\%))A,/
&*

#&""t/E/E/V\`
	 ejj89"&		$"8"8$:J:JQWQgQgh%%> &r>   rL   cache_paramsr   c                    UR                   u  pEnU R                  U R                  -  nSU R                  -  SU R                  -  U R                  -  -   U R                  -   nUGb%  UR
                  (       Ga  U R                  UR                  S5      5      n	U	R                   S   U-
  S-  n
XU R                  U R                  U R                  /n[        R                  " XSS9u    plp[        UUR                  U R                     U R                  R                  R                  S5      U R                  R                   U R"                  5      n[        R                  " UU R                  Xw/SS9u  pn[        R$                  " U R&                  R)                  5       5      * nUS S 2S S4   S S 2S S 2S 4   R+                  SU R,                  U R                  5      R/                  [        R0                  S9nUS S 2S S 2S 4   R+                  SSU R,                  5      nU R2                  S S 2S S4   R+                  SU R,                  5      nU R4                  S S 2S S4   R+                  SU R,                  5      nUR7                  X@R                  UR                   S   U R                  -  5      nUR7                  X@R                  UR                   S   U R                  -  5      nUR7                  X@R                  U R,                  5      n[9        UR:                  U R                     UUUUUUS USS9
nUR7                  X@R                  U R,                  -  5      nU R=                  X5      nU R?                  U5      S S 2S S4   nU$ UbG  [        R@                  " US:H  5      (       d)  URB                  nXS S 2S S 2S 4   -  R/                  U5      nU R                  U5      n[        R$                  " U R&                  R)                  5       5      * nU RD                  c  0 OS	U RD                  0nUb  [        R@                  " US:H  5      nOSnU RF                  (       Ga   U RH                  (       a  Uc  U(       a  [K        UU R                  R                  R                  S5      U R                  R                   U R2                  U4U R4                  U RL                  S U R"                  U R<                  R                  U R<                  RN                  U R>                  R                  U R>                  R                   U R,                  U R                  S
SS.UD6u  nnU$ [        R                  " UU R                  U R                  U R                  /SS9u  pnUbv  URQ                  SS5      n[R        RT                  RW                  UU RX                  UR                   S   -
  S45      nUR                  U R                     R[                  U5        [\        b  U R"                  S;  aJ  U R_                  U R                  URQ                  SS5      5      RQ                  SS5      S S 2S U24   5      nOv[]        URQ                  SS5      U R                  R                  R                  S5      U R                  R                   U R"                  S9RQ                  SS5      S S 2S U24   n[        R                  " UU R                  Xw/SS9u  pnUbG  [        R@                  " US:H  5      (       d)  URB                  nXS S 2S S 2S 4   -  R/                  U5      n[a        UR7                  XESU R,                  5      UUUR7                  XEU R                  S5      UR7                  XEU R                  S5      4U RL                  U R4                  S S SU R2                  SS.UD6u  nnUb+  Ub(  UR:                  U R                     R[                  U5        UR7                  XES5      nU R=                  UU5      nU R?                  U5      nU$ )Nr   r"   r@   dim.rB   T)zr   dt_softplusdt_limitF)r  r   seq_idxr   rmsnorm_weightrmsnorm_epsoutproj_weightoutproj_biasheaddimngroupsnorm_before_gatereturn_final_statesr   )rF   swish)xr5   r   r   )r   r  r  r  r  r   r  )1rG   r   rs   rq   r   rn   r   squeezer   r3   splitr)   r{   r   r   r5   r   r   expr  floatexpandr   rC   rD   r   r  rH   r$   r|   r  r  allrB   r   r   r   r&   r   r6   r   r   rE   padru   copy_r(   r   r%   )r8   rL   r  r   rg   seq_lenr   groups_time_state_sized_to_removein_projected_statesd_mlpsplit_projection_dimrM   hidden_states_B_CdtBCr  r   r  hidden_states_reshapedoutrB   projected_statesdt_limit_kwargsinput_not_masked	ssm_state	time_stephidden_states_B_C_tr   scan_outputs                                  r<   cuda_kernels_forward%Zamba2MambaMixer.cuda_kernels_forwardc  sT    "/!4!4
Q!%1D1D!D$0001t}}3DtGZGZ3ZZ]a]k]kk #(G(G(G"&,,}/D/DQ/G"H(..r2[@QFE$)$2H2H$--Y]YgYg#h 05<Okm0n-Aq) 4!((8""**1-  ! #(++!'')?X#Ma
 4::++-..A!T3,1d
+222t}}dFYFYZ]]didqdq]rAAq$J&&r2t}}=Bll1dC<077DMMJGq$|$++B>Az==!''!*2MNAz==!''!*2MNA%2%7%7
NNTXTaTa%b"2''7& M *..z>>DMM;YZM IIm:M--.q$|<Cz 
u )%))Na<O2P2P%++!.1d
1K!K O OPU V#||M:4::++-..A$($8$8$@bzSWSgSgFhO)#(99^q-@#A #' $$$<;OTd!A$KK&&..q1KK$$LL" ff# ##'99#3#3 $		 : :#'==#7#7!%!3!3 MM MM%*(,#"$ &%"YX 
m 6;[[$++T]]DNNK62  +*;*E*Ea*K'!#!2!2+d.C.CFYF_F_`bFc.cef-g"J !,,T^^<BB:N#+tFW/W(,$5$?$?1$EFPPQRTUVWXZb[bZbWbc)% )9+55a;#{{1199!<![[--#'??	)
  i1oa'k)3% ',kk%++-C\'#!
 "-eiiRS@S6T6T)//E%2Aq$J5O%O$S$STY$ZM)B!&&zBNFF:rBFF:rB*  $ff (, LL $* &*&Y (\-E ++DNN;AA)L)..zBG"iiT:mmK0
r>   c                    UR                   u  pEnUR                  nUb2  UR                  (       a!  U R                  UR	                  S5      5      nOOUb;  [
        R                  " US:H  5      (       d  XS S 2S S 2S 4   -  R                  U5      nU R                  U5      nUR                   S   SU R                  -  -
  SU R                  -  U R                  -  -
  U R                  -
  S-  n	UR                  XU R                  U R                  U R                  /SS9u    pjpUGb  UR                  U R                     R!                  5       nUR                  UR"                  5      nUR                  (       Ga2  U
R%                  S5      n
UR&                  U R                     n[
        R(                  " USSS9nUR*                  S:X  a  US S 2SS S 24   OUUS S 2S S 2S4'   UR&                  U R                     R-                  U5        [
        R.                  " UR                  UR"                  5      U R0                  R2                  S S 2SS S 24   -  SS9nU R4                  (       a  XR0                  R6                  -  nU R9                  U5      R                  U5      S S 2S S4   nGOUR;                  SS5      n[<        R>                  RA                  UU RB                  UR                   S   -
  S45      nUR&                  U R                     R-                  U5        U R9                  U R1                  U5      R;                  SS5      5      S S 2S U2S S 24   nUbG  [
        R                  " US:H  5      (       d)  UR                  nXS S 2S S 2S 4   -  R                  U5      nO[
        RD                  " X@R                  U RF                  U R                  4UR"                  US	9nU R9                  U R1                  UR;                  SS5      5      SS U24   R;                  SS5      5      n[
        R                  " XR                  U R                  U R                  -  U R                  U R                  -  /SS9u  pn[
        RH                  " U RJ                  RM                  5       5      * nUGbq  UR                  (       Ga_  UR*                  S:X  a
  US S 2S S4   OUS S 2SS S 24   S S 2S S4   nUR;                  SS5      RO                  XLR                   S   U RF                  5      nU RP                  S
   RO                  U RP                  R                   S   U RF                  5      n[
        R<                  R>                  RS                  UUR                  UR                  5      -   5      n[
        RT                  " XRV                  5      nUS   RO                  U R                  U RF                  U R                  5      R                  [
        RX                  S9n[
        RH                  " US
   U-  5      nUR[                  X@R                  S5      SS S S 24   nURO                  X@R                  U R                  U R                  -  UR                   S   5      R]                  5       nUR[                  USUR                   S   5      nUS
   USS S S 24   -  nUR[                  USU RF                  5      nUUS
   -  nUR                  U R                     R-                  UR                  U R                     U-  U-   5        UR[                  X@R                  S5      SS S S 24   nURO                  X@R                  U R                  U R                  -  UR                   S   5      R]                  5       nUR[                  USUR                   S   5      nUR                  U R                     R                  UR                  5      nUR_                  X@R                  -  U RF                  U R                  5      nUR_                  X@R                  -  U R                  S5      n[
        R`                  " UU5      nUR_                  X@R                  U RF                  5      nU Rb                  S
   RO                  U Rb                  R                   S   U RF                  5      nUUU-  -   R                  UR                  5      nUR[                  US5      S S 2S S4   nGO'[<        R>                  RS                  XRP                  -   5      n[
        RT                  " XRV                  5      nUR[                  XESU RF                  5      RM                  5       nUR[                  XESU R                  5      RM                  5       nUR[                  XESU R                  5      RM                  5       nURe                  U R                  U R                  -  SU R                  S9nURe                  U R                  U R                  -  SU R                  S9nU Rf                  XPRf                  -  -
  U Rf                  -  nU Rb                  S
   [i        UU5      -  nXS
   -  nUR                  UR                  5      U-  nUUUU4 Vs/ sH  n[k        UUU Rf                  5      PM     snu  nnnnURm                  SSSS5      n[
        Rn                  " USS9n[
        RH                  " [q        U5      5      nUS S 2S S 2S S 2S S S 2S S 24   US S 2S S 2S S S 2S S 2S S 24   -  n U R/                  SS9n!U!S
   URm                  SSSSS5      S
   -  n"U"R/                  SS9n#U#S
   US S 2S S 2S 4   -  R/                  S5      n$[
        RH                  " US S 2S S 2S S 2SS 24   U-
  5      n%UU%Rm                  SSSS5      S
   -  n&U&Rm                  SSSSS5      S
   URm                  SSSSS5      SS S S 24   -  R/                  SS9Rm                  SSSSS5      n'Ub3  UR                  (       a"  UR                  U R                     S S 2S S4   n(O[
        Rr                  " U'S S 2S S24   5      n([
        Rt                  " U(U'/SS9n'[
        RH                  " [q        [<        R>                  RA                  US S 2S S 2S S 2S4   S5      5      5      n)U'Rm                  SSSSS5      n*U)S   U*S S 2S S 2S S4   -  R/                  SS9n+U+Rm                  SSSSS5      n,U,S S 2S S24   U,S S 2S4   nn'[
        RH                  " U5      n-USS S S 24   U'S S 2S S 2S S4   -  n.U-Rm                  SSSS5      n/U.R/                  S5      U/S
   -  n0U$U0-   nUR[                  USU R                  U RF                  5      nUU-   nUS:  a  US S 2S U2S S 2S S 24   nUR[                  XES5      nUb+  Ub(  UR                  U R                     R-                  U5        U Rw                  UU
5      n1U Ry                  U1R                  U5      5      n2U2$ s  snf )Nr"   r@   r   r  r   r   r   .rj   ).N).NNr  )r  output_size   )r"   r   )=rG   rB   rn   r   r   r3   r%  rC   rq   r   rs   r   r!  r   r|   r   clonerh   	unsqueezer{   r   ndimr'  sumr   r5   r   r   r   r   r   rE   r&  ru   r   r   r"  r  r#  r$  r   softplusr   r   rD   r   r   rH   bmmr  repeat_interleaver   r   r   permutecumsumr   
zeros_likecatr  r  )3r8   input_statesr  r   rg   r(  r   rB   r4  r,  rM   rL   r/  r7  r   r0  r1  r  r   dAdBdBxr|   ssm_states_reshaped
C_reshapedyr  pad_size
D_residualtA_cumsumLG_intermediateGM_intermediateMY_diagdecay_statesB_decay_contractionstatesprevious_statesdecay_chunkstates_permutedresult
new_statesstate_decay_outC_times_statesstate_decay_out_permutedY_offr:  contextualized_statess3                                                      r<   torch_forwardZamba2MambaMixer.torch_forward  s[   !-!3!3
Q""#(G(G $\-A-A!-D E)%))NA<M2N2N$0!Q*3M$M#Q#QRW#XL $\ :!''+a$2H2H.HHAPTP]P]L]`d`s`sLssuy  vD  vD  D  IJ  J(8(>(>t55t~~V\^ )? )
%1M
 #$//?EEGI!]%9%9:I...~~a()55dnnE
"ZZ
2BG
ANASASWXAX}Q1W'=^k
1a8$((8>>zJ %		*--8H8O8O*PSWS^S^SeSefgijlmfmSn*ntv w%%![[%5%55M $ 7 : :5 A!T3, O - 7 7! <]]..!**]-@-@-DDaH
 ((8>>zJ $])C)M)MaPQ)R STUW_X_W_abTb c!-eiiPQ@Q6R6R)//E%2Aq$J5O%O$S$STY$ZM^^T]]D<O<OP$++5I !HHT[[1H1HA1N%OPSU]V]U]P]%^%h%hijlm%noM#kk-:P:PRVR_R_bfbubuRuw{  xE  xE  HL  H[  H[  x[  :\  bd  e!YYtzz'')**#(G(G(G &(WW\AtSL!r!Q'{1dC<7PBa#**:xx|T]]SBll9-44T\\5G5G5JDMMZG$$--b7::bhh3G.GHBR!3!34B/"))$..$--I\I\]``glgtgt`uA2i=1,-B
 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6AI3a<0B *11*b$--PM}Y//C ##DNN399''7"<sB 		*mmR8dAFA]]DNNdmm4SUVU\U\]_U`allnA		*b!''"+6A &00@CCAGGLJ",//*~~2Mt}}^b^q^q"r
^^ ;T=P=PRSTJ		-z:Az>>4==AA y!((a$--HA]Q&&**1773A 		*b)!T3,7A ''\\(9:BR!3!34B)11*r4==Y__aM		*D4G4GHNNPA		*r43F3FGMMOA##DNNdmm$CX\XfXf#gA##DNNdmm$CX\XfXf#gA'OO*CCtVH	*-?x-XXJ *yM9M](()B.A cpqrtuwxay%zay\]&9!Xt&Way%z"M1a 		!Q1%A||A2.H 		+a.)A q!Qa23a1dAq!8K6LLN""r"*A y\AIIaAq!,DY,OON""r"*A 	l]1a:%>>CCAFF !99XaArsl%;h%FGL"#l&:&:1aA&Fy&Q"Q)11!Q1a@K}OdOdefhiklnoqrOstwy}  @A  uA  PB  B  G  G  LM  G  N  V  V  WX  Z[  ]^  `a  cd  eF'L,K,K"."9"9$.."I!TSV,"W"'"2"26!RaR%="AYY8a@F))K0A0A(1aQRTV;BWY_0`$abK$nnQ1a;O!/2_Q4QT_5UUZZ_`ZaF1aA6J *1crc6 2Jq"u4EIF $ii1OT1oq!T30GGN'6'>'>q!Q'J$#''+.Fy.QQE A		*b$..$--HAJA!|a'1a'(		*r2A$)A''7==iHii4(
 !%knnU.C D$$I &{s   	 zc                     [         (       a@  SU R                  R                  R                  R                  ;   a  U R                  XU5      $ U R                  XU5      $ )Ncuda)r  r   r5   rh   typer;  ri  )r8   rL   r  r   s       r<   rT   Zamba2MambaMixer.forward  sM     "!f0C0C0J0J0O0O&O,,].YY!!-~NNr>   )r  r  r   r   r   rf   r   r   ru   r   r   r9   r   rq   r   r   r  r   r  rs   r   r   r   r   r   r/   r*   )rV   rW   rX   rY   r   r#   r   ro   r1   r3   r   rd   r;  ri  rT   rZ   r[   r\   s   @r<   r   r     s    ?| ? ? ?H <@15	T||T 78T !.	Tn%AY8Z %qyz  {G  {G  rH %J <@15		O 78	O !.		O 	Or>   r   c                   H   ^  \ rS rSrSS\S\\   4U 4S jjjrSS jrSr	U =r
$ )		Zamba2MLPi  rf   r   c           
        > [         T	U ]  5         Xl        UR                  U l        UR                  U l        X l        X0l        [        R                  " U R                  SU R                  -  UR                  S9U l
        [        R                  " U R                  U R                  UR                  S9U l        [        UR                     U l        [        R                  " / 5      U l        [#        U R
                  5       H  nXAR$                  -  U:X  a  [        R&                  " [        R                  " U R                  R                  U R                  R(                  SS9[        R                  " U R                  R(                  SU R                  -  SS95      nO[        R*                  " 5       nU R                   R-                  U5        M     UR.                  n[1        U5       VVs0 sH  u  pxX_M	     snnU l        gs  snnf )a9  
This MLP layer contributes to tied transformer blocks aimed to increasing compute without increasing model size. Because this layer
is tied, un-tied adapter modules (formally same as LoRA, but used in the base model) are added to the up and gate projectors to increase expressivity with a small memory overhead.
r   r   FN)r0   r1   rf   r9   rq   r   r   r   r   r   gate_up_proj	down_projr	   
hidden_actact_fnr   gate_up_proj_adapter_listr}   r   r   r   r   r   r   r   r   )
r8   rf   r   r   r   gate_up_proj_adapterr   r   r   r;   s
            r<   r1   Zamba2MLP.__init__  s   
 	!--!'!9!9"4 IId&6&6D<R<R8RY_YoYop4#9#94;K;KRXRhRhiV../)+r):&t../A(((H4')}}IIdkk55t{{7O7OV[\IIdkk66D<R<R8RY^_($
 (*{{}$**112FG 0 !11;D_;UV;U<5%,;UVVs   -Hc                     U R                  U5      nU R                  U   nX0R                  U   " U5      -   n[        R                  " USSS9nU R                  US   5      US   -  nU R                  U5      nU$ )Nr   r@   r  r   r"   )rr  r   rv  r3   chunkru  rs  )r8   hidden_stater   gate_up_stateoutputs        r<   rT   Zamba2MLP.forward  s{    )),7NN9-	%(F(Fy(QR^(__M1"={{=#34}Q7GG-r>   )
ru  r   rf   rs  rr  rv  r9   rq   r   r   r*   r/   )rV   rW   rX   rY   r#   r   ro   r1   rT   rZ   r[   r\   s   @r<   rp  rp    s0    W| WPXY\P] W W< r>   rp  c                   F  ^  \ rS rSrSS\S\\   S\\   4U 4S jjjr    SS\R                  S\R                  S\S\\R                     S	\\
   S
\\   S\\R                     S\\   S\\R                   \\\R                   \R                   4      4   4S jjrSrU =r$ )Zamba2AttentionDecoderLayeri  rf   r   r   c                    > X l         [        UR                  5      n[        TU ]  X5        [        USXBS9U l        [        XUS9U l        g )Nr@   )r   r   r   )r   r   )	r   r   r   r0   r1   r   	self_attnrp  feed_forward)r8   rf   r   r   num_gsr;   s        r<   r1   $Zamba2AttentionDecoderLayer.__init__  sF     V,,-+(2RXl%fRZ[r>   rL   original_hidden_statesr   r   output_attentionsr   r   r   c           
          [         R                  " X/SS9nU R                  U5      nU R                  " SUUUUUUS.UD6u  pU R	                  U5      nU R                  X5      nU4n
U(       a  X4-  n
U
$ )aj  
Args:
    hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
        This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
        concatenated tensor is then used as input of the pre-attention RMSNorm
        (see fig. 2 in https://huggingface.co/papers/2405.16712).
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
r@   r  )rL   r   r   r   r  r   r`   )r3   concatenateinput_layernormr  pre_ff_layernormr  )r8   rL   r  r   r   r   r  r   r   self_attn_weightsoutputss              r<   rT   #Zamba2AttentionDecoderLayer.forward  s    > ))=*QWYZ,,];+/>> ,
'))/ 3,
 ,
( --m<))-C "++Gr>   )r   r  r  r*   )NNFN)rV   rW   rX   rY   r#   r   ro   r1   r3   r   rd   boolr   r   r
   r   FloatTensorrT   rZ   r[   r\   s   @r<   r  r    s    \| \x} \X`adXe \ \ 26=A,1:>3||3 !&3 	3
 !.3 !!9:3 $D>3 &e&6&673 -.3 
u  (51B1BEDUDU1U+V"WW	X3 3r>   r  c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )Zamba2MambaDecoderLayeri0  rf   r   c                    > [         TU ]  X5        [        XS9U l        [	        UR
                  UR                  S9U l        g )N)rf   r   r:   )r0   r1   r   mambar^   r9   rms_norm_epsr  )r8   rf   r   r;   s      r<   r1    Zamba2MambaDecoderLayer.__init__1  s7    +%VI
,V-?-?VEXEXYr>   )r  r  )	rV   rW   rX   rY   r#   ro   r1   rZ   r[   r\   s   @r<   r  r  0  s    Z| Z Z Zr>   r  c                   |  ^  \ rS rSrS\S\R                  S\4U 4S jjr        SS\	R                  S\\	R                     S\\   S	\\	R                     S
\\	R                     S\\   S\\   S\\   S\\	R                     S\\	R"                  \\\	R"                  \	R"                  4      4   4S jjrSrU =r$ )Zamba2HybridLayeri7  shared_transformerlinearr  c                 6   > [         TU ]  XU5        U ?Xl        g r/   )r0   r1   shared_transfr  )r8   r  r  r  r;   s       r<   r1   Zamba2HybridLayer.__init__8  s!     	+U;"4r>   rL   r  r   r   causal_maskr   r  	use_cacher   r   c
           
          U R                  UUUUUUU	S9n
U
S   nU(       a  U
S   nU R                  U5      nU R                  UUUUUUU	S9n
U(       a  U
S   W4U
SS -   n
U
$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
    hidden activations to form the input of the shared transformer layer.
    layer_idx (`int`): layer number.
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`Zamba2HybridDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
)r  r   r   r   r  r   r   r"   )transformer_hidden_statesr   r   r  r  r   r   N)r  r  mamba_decoder)r8   rL   r  r   r   r  r   r  r  r   layer_outputsr  r  s                r<   rT   Zamba2HybridLayer.forward?  s    @ //#9&)/ 3 0 
 %2!$4! -a 0$(KK0I$J!**&?))/ 3 + 
 *1-/@AMRSRTDUUMr>   )r  )NNNNNFFN)rV   rW   rX   rY   r  r   r   r  r1   r3   r   r   ro   rd   r  r   r   r  rT   rZ   r[   r\   s   @r<   r  r  7  s   5"=5GIyy5Yp5 :>#'15.2=A,1$):>>||> !) 6> C=	>
 !.> ell+> !!9:> $D>> D>> &e&6&67> 
u  (51B1BEDUDU1U+V"WW	X> >r>   r  c                   X   ^  \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSrSrU 4S jrS	rU =r$ )
Zamba2PreTrainedModeli  rf   modelTr  r  past_key_valuesc                   > [         TU ]  U5        [        U[        5      (       Ga  [        R
                  " [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      [        R                  " U R                  R                  5      -
  -  [        R                  " U R                  R                  5      -   5      R                  U R                  R                  S9nU[        R                  " [        R                  " U* 5      * 5      -   nUR                   R"                  R%                  U5        [        R&                  " SUR(                  S-   5      nUR*                  R"                  R%                  [        R                  " U5      5        UR,                  R"                  R/                  S5        g g )N)minr"   g      ?)r0   _init_weights
isinstancer   r3   r"  randrf   rv   mathr  r   r   r   time_step_floorexpm1r   datar'  r   r   r  r  fill_)r8   moduler/  inv_dtr  r;   s        r<   r  #Zamba2PreTrainedModel._init_weights  s=   f%f.//

4;;44588DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN%%f-Q 0 01 45ALL##EIIaL1HHMM$ 0r>   r`   )rV   rW   rX   rY   r#   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_flex_attn_supports_sdpa_is_statefulr  rZ   r[   r\   s   @r<   r  r    sG    &*#68QR"3NL% %r>   r  c                   $   \ rS rSrSrS\4S jrS r          SS\\	R                     S\\	R                     S	\\	R                     S
\\   S\\	R                     S\\   S\\   S\\   S\\   S\\	R                     S\\\4   4S jjrSrg)Zamba2Modeli  zX
Model consisting of *config.num_hidden_layers* layers.

Args:
    config: Zamba2Config
rf   c           	         [         R                  X5        Xl        UR                  U l        UR
                  U l        [        R                  " UR
                  UR                  U R                  5      U l	        [        UR                  5       Vs/ sH  n[        XS9PM     nn/ n/ nUR                  U l        [        UR                  5       H  nUR                  U   S:X  a  UR                  [!        XS95        M0  UR                  U   S:X  d  ME  UR                  [        R"                  " U R                  R                  U R                  R                  SS95        UR                  [!        XS95        M     [%        U5      n[%        U5      n['        U5      nU R)                  X5U5      n[        R*                  " U5      U l        UR.                  U l        [1        UR                  UR2                  S9U l        UR6                  (       a6  UR8                  (       a  [:        R=                  S5        [?        U5      U l         SU l!        U RE                  5         g s  snf )	N)r   r  r   rk   Fr   r  ze`use_long_context` set to `True`: using rescaled `rope_theta` and extended `max_position_embeddings`.)#r  r1   rf   pad_token_idpadding_idx
vocab_sizer   	Embeddingr9   embed_tokensr}   r   r  rm   r~   r   r  r   iterr   
get_layersr   layersr   r^   r  final_layernormr   use_long_contextr  r	  r   
rotary_embgradient_checkpointing	post_init)r8   rf   kblocksmamba_layerslinear_layersr   r  s           r<   r1   Zamba2Model.__init__  s   &&t4!.. ++LL):):F<N<NPTP`P`aKPQWQfQfKghKga-fAKgh!'!9!9v//0A''*g5##$;F$PQ))!,8$$RYYt{{/F/FH_H_fk%lm##$;F$PQ 1 L)]+vEmmF+$*$?$?!,V-?-?VEXEXY&&##{ 4F;DO&+# 	7 is   Ic           
         / n/ U l         SU l        [        U R                  5       GH!  u  pVUS:X  Ga  U R                  S:X  a  XPl        [	        U5      nU R
                  R                  [        U R
                  R                  5      -  S:  Gam  SU S3n[        R                  " US-   S-   S-   S	-   S
-   5      n	U R                   R                  U	5        Sn
U R                   Ht  nUS:X  af  XR
                  R                  -  UR                  :X  a@  [        R                  " S[        U
5      -   S-   5      nU R                   R                  U5        U
S-  n
Mv     U R
                  R                  (       a  Sn
U R                   Ht  nUS:X  af  XR
                  R                  -  UR                  :X  a@  [        R                  " S[        U
5      -   S-   5      nU R                   R                  U5        U
S-  n
Mv     UR                  [        U[	        U5      [	        U5      5      5        GM  UR                  [	        U5      5        GM$     U$ )Nr   rk   r"   z	^layers\.z\.shared_transformer\.z(?:z3self_attn\.(?:q_proj|k_proj|v_proj|o_proj)\.weight|z1feed_forward\.(?:gate_up_proj|down_proj)\.weight|z,(?:input_layernorm|pre_ff_layernorm)\.weightz)$z>^shared_transformer\.feed_forward\.gate_up_proj_adapter_list\.z\.(?:0|1)\.weight$zg^shared_transformer\.self_attn\.(?:linear_q_adapter_list|linear_k_adapter_list|linear_v_adapter_list)\.)_tied_weights_keysfirst_transformer_layer_idr   rm   nextrf   r   r   r   recompiler   r   r   r   r  )r8   r  r  r  r  layer_id
layer_typeblockprefix_patternmain_keys_pattern
adapter_id_layer_typeadapter_patternattn_adapter_patterns                 r<   r  Zamba2Model.get_layers  s3   "$*+'$-d.D.D$E HX%22a76>3V;;--DKK4P4P0QQTUU(1(;Q%RN(*

& !PQ OO J	J
   )% ++223DE!"J'+'='=&(2zKKD^D^7^bgbpbp7p.0jj a"%j/!2"7!8/O
 !33::?K"a
 (> {{??%&
+/+A+AK*h6:HbHb;bfkftft;t79zz%q&)*o%6 '<%<8" 4 !% 7 7 > >?S T&!OJ ,B /tM7JDQ]L^_`d<01S %FT r>   N	input_idsr   position_idsr  inputs_embedsr  r  output_hidden_statesreturn_dictr   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	US L US L-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nUn[        R                  " U5      nU(       aM  UcJ  Ub  UR                  S   OUR                  S   n[        U R                   XR                  U R                   S9nU
cM  Ub  UR#                  U R$                  S9OSn[        R&                  " XUR                  S   -   UR                   S9n
Uc  U
R)                  S5      nU R+                  X%U
5      nU R                   R,                  (       a  U R/                  X5      nOS nU(       a  S	OS nU(       a  S	OS n[1        U R2                  5       H  u  nnU(       a  UU4-  nU R                  (       a6  U R                  (       a%  U R5                  UR6                  UUUUUUUUU5
      nOU" UUUUUUUUUS
9	nUS   nU(       d  Mv  US   c  M~  UUS   4-  nM     U R9                  U5      nU(       a  UU4-  nUb  UR:                  (       d  SUl        [=        UU(       a  UOS UUS9nU	(       a  U$ UR?                  5       $ )NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   )rB   rh   r  r"   rl   r`   )r  r   r   r  r   r  r  r   T)last_hidden_stater  rL   
attentions) rf   r  r  r  use_return_dict
ValueErrorr  r   r  r	  r  r3   r@  rG   rd   rB   rh   r   r  r   rA  _update_causal_maskr   r  r   r  _gradient_checkpointing_func__call__r  rn   r   to_tuple)r8   r  r   r  r  r  r  r  r  r  r   rL   r  rg   past_seen_tokensr  r   all_hidden_statesall_self_attnsr   layerr  r}  s                          r<   rT   Zamba2Model.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<s  &&4==Yj I  --i8M%!&]!; 0/8/D+-J]J]^_J`J6t{{JV`V`imitituO! #.  ..9X9X.Y 
 #\\ ]5H5H5K"KTaThThN )33A6L..~n] ;;##"&//-"N"&"6BD0d )$++ 6Iu#!m%55!**t}} $ A ANN!*"#%'! !&!+A'#1 +#2&7'(;
! *!,M   #/"}Q'7&99NE !7H ,,];  -!11&/Q/Q15O.(+/8Od+%	
 %v;&//*;;r>   )r   r  rf   r  r  r  r  r  rm   r  r  r  )
NNNNNNNNNN)rV   rW   rX   rY   r   r#   r1   r  r   r3   r   r   rd   r  r  r   r   r   rT   rZ   r`   r>   r<   r  r    s   "| "H.d 151537>B59$(,0/3&*59v<E,,-v< !.v< u//0	v<
 "":;v<   1 12v< D>v< $D>v< 'tnv< d^v< !!1!12v< 
u--	.v< v<r>   r  c                       \ rS rSrSrg)Zamba2ForCausalLMir  r`   Nra   r`   r>   r<   r  r  r  rb   r>   r  c                       \ rS rSrSrg)Zamba2ForSequenceClassificationiv  r`   Nra   r`   r>   r<   r  r  v  rb   r>   r  )r  r  r  r  )Mr  r  	itertoolsr   typingr   r   r   r3   torch.utils.checkpointr   activationsr	   modeling_flash_attention_utilsr
   modeling_outputsr   modeling_utilsr   r   processing_utilsr   utilsr   utils.import_utilsr   r   llama.modeling_llamar   r   mamba2.modeling_mamba2r   r   r   zamba.modeling_zambar   r   r   r   r   r   r   r   r    r!   configuration_zamba2r#   +mamba_ssm.ops.triton.selective_state_updater$   !mamba_ssm.ops.triton.ssd_combinedr%   r&   causal_conv1dr(   r)   r%  r  _CONFIG_FOR_DOC
get_loggerrV   r  Moduler,   r^   rd   r   r   r   rp  r  r  r  r  r  r  r  __all__r`   r>   r<   <module>r     s     	  , ,    ! B 7 F & N Y Y   / RmmZjW57WDD-7**46FH\]^  '			H	%; ;*	L 	D36 D3N	0 	j)n j)ZkOryy kO\'		 'T;"< ;|Z4 ZF( FR%O %:R<*3 R<j	( 		&D 	r>   