
"""PyTorch Zamba model."""

import math
from typing import Any, Callable, Optional, Union

import torch
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...modeling_attn_mask_utils import AttentionMaskConverter
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    SequenceClassifierOutputWithPast,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, logging
from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
from .configuration_zamba import ZambaConfig


if is_mamba_ssm_available():
    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
else:
    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None

if is_causal_conv1d_available():
    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
else:
    causal_conv1d_update, causal_conv1d_fn = None, None

is_fast_path_available = all(
    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
)

logger = logging.get_logger(__name__)


class ZambaRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        ZambaRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)

    def extra_repr(self):
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
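
# Shape note for `repeat_kv` (the numbers below are illustrative, not taken from any Zamba config): with
# batch=2, num_key_value_heads=4, n_rep=2, seqlen=5 and head_dim=8, an input of shape (2, 4, 5, 8) becomes
# (2, 8, 5, 8); each key/value head is duplicated so grouped-query attention can share one KV head across
# n_rep query heads.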
S rS\S\\R                  \R                  4   4S	 jr SS
\R                  S\R                  S\S\\\\4      S\\R                  \R                  4   4
S jjrS\R*                  4S jrSS\\   S\4S jjrS\\\R                     \\R                     4   4S jr\SS\\\\R4                           SS4S jj5       rSrg)ZambaHybridDynamicCache`   a|  
A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
(which has a constant shape regardless of seq_len).

This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
NFc                 $   X0l         SU l        UR                  U l        SU l        UR                  UR
                  -  U l        UR                  U l        UR                  U l
        UR                  U l        / U l        / U l        / U l        0 U l        0 U l        0 U l        [%        UR&                  5       H  nU =R                  [(        R*                  " X R                  U R                  XCS9/-  sl        UU R                  U R                  U R                  -  U R                  4nU =R                  [(        R*                  " XdUS9/-  sl        U R                  U   S:X  d  M  U R                  R-                  U5        M     [%        UR&                  5       Vs/ sH  n[(        R.                  " / /U-  US9PM     snU l        [%        UR&                  5       Vs/ sH  n[(        R.                  " / /U-  US9PM     snU l        g s  snf s  snf )NFdevicer:   hybridra   )r:   is_compileablelayers_block_typehas_previous_statemamba_expandr0   intermediate_sizemamba_d_statessm_state_sizemamba_d_convconv_kernel_sizen_mamba_headsconv_states
ssm_statestransformer_layers_modules_parameters_buffersrangenum_hidden_layersr+   zerosappendtensor	key_cachevalue_cache)r/   config
batch_sizer:   ra   icache_shape_s           r3   r)    ZambaHybridDynamicCache.__init__r   s   
#!'!9!9"'!'!4!4v7I7I!I$22 & 3 3#11"$v//0AJ(>(>@U@U^dr!  ""&&$*<*<<##	K OOKe TUUO%%a(H4''..q1 1 SXX^XpXpRqrRqQ%,,tj'8HRqrTYZ`ZrZrTstTsqELL"
):6JTst sts   "H"Hc                 ,    [        U R                  5      $ N)lenry   rH   s    r3   __len__ZambaHybridDynamicCache.__len__   s    4>>""r5   	layer_idxrS   c                 >    U R                   U   U R                  U   4$ r   )ry   rz   r/   r   s     r3   __getitem__#ZambaHybridDynamicCache.__getitem__   s!    ~~i($*:*:9*EEEr5   
key_statesvalue_statescache_kwargsc                 |   U R                   U   R                  S   S:X  a  XR                   U'   X R                  U'   Ob[        R                  " U R                   U   U/SS9U R                   U'   [        R                  " U R                  U   U/SS9U R                  U'   U R                   U   U R                  U   4$ )Nr8   r   r7   dim)ry   rG   rz   r+   cat)r/   r   r   r   r   s        r3   updateZambaHybridDynamicCache.update   s     >>)$**2.!3(2NN9%*6Y'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEEr5   beam_idxc                    [        [        U R                  5      5       GHT  nU R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   GMW     g)zDReorders the cache for beam search, given the selected beam indices.r   N)	rt   r   ry   ra   index_selectr;   rz   rn   ro   )r/   r   r   ra   s       r3   reorder_cache%ZambaHybridDynamicCache.reorder_cache   s=   s4>>23I^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI& 4r5   c                     XR                   ;  a  U R                   S   OUn[        U R                  5      U::  a  gU R                  U   R                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )rp   r   ry   rG   r   s     r3   get_seq_length&ZambaHybridDynamicCache.get_seq_length   sP     3<CZCZ2ZD++A.`i	t~~)+~~i(..r22r5   c                     [        S5      eNz@ZambaHybridDynamicCache does not have a legacy cache equivalent.NotImplementedErrorrH   s    r3   to_legacy_cache'ZambaHybridDynamicCache.to_legacy_cache   s    !"deer5   past_key_valuesr   c                     [        S5      er   r   )clsr   s     r3   from_legacy_cache)ZambaHybridDynamicCache.from_legacy_cache   s    !"deer5   )rs   rq   rr   rl   rn   r:   rf   rh   rd   ry   re   rm   rj   ro   rp   rz   r   )r   )rK   rL   rM   rN   __doc__ry   rz   rd   r+   float16r)   r   intrF   Tensorr   r   dictstrr   r   
LongTensorr   r   r   classmethodFloatTensorr   rO    r5   r3   r]   r]   `   sM    IKN16t u@#FS FU5<<3M-N F 26FLLF llF 	F
 tCH~.F 

def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights

class ZambaAttention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
    and "Generating Long Sequences with Sparse Transformers".

    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
    (see fig. 2 in https://huggingface.co/papers/2405.16712).
    Additionally, replaced
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
    """

    def __init__(self, config: ZambaConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.attention_hidden_size = config.attention_hidden_size
        self.head_dim = config.attention_head_dim
        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.scaling = (self.head_dim / 2) ** -0.5
        self.is_causal = True
        self.attention_dropout = config.attention_dropout

        self.q_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        if past_key_value is not None:
            key_states, value_states = past_key_value.update(key_states, value_states, layer_idx)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
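
# Worked detail of the non-standard softmax scaling above: because the attention input is the concatenation of
# two hidden states (attention_hidden_size = 2 * hidden_size), `scaling` is (head_dim / 2) ** -0.5 rather than
# the usual head_dim ** -0.5. For example, head_dim = 128 gives scaling = 1/sqrt(64) = 1/8 instead of 1/sqrt(128).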
SS\	4S jjrSS\	4S	 jjrS
rU =r$ )ZambaMambaMixeri)  u!  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)

This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
- Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
`self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
r{   c           	        > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  UR                  -  U l
        UR                  U l        UR                  U l        U R                  U R                  -  U l        UR                  U l        UR"                  U l        [&        R(                  " U R                  U R                  U R                   U R                  U R                  U R                  S-
  S9U l        UR,                  U l        [0        UR,                     U l        UR4                  U l        [&        R8                  " U R                  U R                  S-  U R$                  S9U l        [&        R<                  " [>        R@                  " U R                  U R                  U R                  S-  -   U R                  5      5      U l!        [&        R<                  " [>        R@                  " U R                  U R                  U R                  5      S-
  S-  U R                  S-  -  5      U l"        [&        R<                  " [>        R@                  " U R                  U R                  5      5      U l#        [>        RH                  " SU R                  S-   [>        RJ                  S9S S S 24   nURM                  U R                  S5      RO                  5       n[&        R<                  " [>        RP                  " U5      RS                  U R                  U R                  S5      5      U l*        [&        R<                  " [>        RV                  " U R                  U R                  5      5      U l,        [&        R8                  " U R                  U R                  U R$                  S9U l-        [\        (       d  [^        Ra                  S5        g g )	Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingr7   r   g      ?r:   r8   ap  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)1r(   r)   r{   r   r0   ri   rj   rk   rl   rg   rh   mamba_dt_ranktime_step_rankrm   mamba_head_dimmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1dhidden_mamba_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projr*   r+   rv   x_proj_weightdt_proj_weightdt_proj_biasaranger<   rU   r   logrV   A_logr,   Dout_projis_fast_path_availableloggerwarning_once)r/   r{   r   Ar2   s       r3   r)   ZambaMambaMixer.__init__6  s
   "!--$22 & 3 3!'!4!4v7I7I!I$22#11"448J8JJ#33..ii..//##--))))A-
 !11&112 & 8 8 yy!1!143I3IA3MTXTaTab  \\KK""##d&9&9A&==##
 !ll[[++T-@-@$BUBUVY\\!!3&'

 LLT5G5GI\I\)]^ LLD//!35==I$PQ'RHHT++R0;;=\\%))A,"6"6t7I7I4K^K^`b"cd
ejj););T=P=PQR		$"8"8$:J:JQUQ^Q^_%%^ &r5   r@   cache_paramsc                    UR                   u  pEnUS L=(       a    UR                  =(       a    US:H  nU R                  U5      R                  SS5      nUR	                  USSU5      R                  SSS9u  pUR                  S5      R                  5       nU	R                  S5      n	U	R                  X@R                  SU5      R                  SS5      n	U R                  R                  R	                  U R                  R                  R                  S5      U R                  R                  R                  S5      5      n
U(       ae  [        UR                  S5      UR                  U R                     U
U R                  R                   U R"                  5      nUR%                  S5      nOUb1  [&        R(                  " US:H  5      (       d  XR%                  S5      -  nUbc  [*        R,                  R/                  XR0                  UR                   S   -
  S45      nUR                  U R                     R3                  U5        [5        XU R                  R                   U R"                  S9nUb1  [&        R(                  " US:H  5      (       d  XR%                  S5      -  nUR                  SU R                  U R6                  U5      R                  SS5      nU R8                  S S 2S S S 2S S 24   U-  R                  SS5      n[&        R:                  " XR<                  U R>                  U R>                  /SS9u  pnU R@                  S S 2S 4   UR                  SS5      -  n[&        RB                  " U RD                  RG                  5       5      * nU RH                  b  U RH                  RG                  5       OS n[&        RJ                  " USU4URL                  URN                  S9nU(       a  [Q        U R                  5       H  n[S        URT                  U R                     S S 2U4   UUS	S4   UUS	S4   UU   UUS S 2S4   UUS S 2S4   U RV                  U   U	US	S4   UU   S
S9
R%                  S5      n[&        RX                  " UU4SS9nM     GO<[&        RJ                  " USU R6                  U R>                  4URL                  URN                  S9n[Q        U R                  5       H  n[[        UU   UU   UU   UU   R                  SS5      UU   R                  SS5      U RV                  U   RG                  5       U	U   UU   S
S
S9
u  nn[&        RX                  " UU4SS9R                  5       n[&        RX                  " UUR%                  S5      4SS9nM     Ub+  Ub(  URT                  U R                     R3                  U5        U R]                  UR                  SS5      5      nU$ )Nr   r7   r8   r   r   )r   r   r`   .T)dt_softplus)delta_softplusreturn_last_state)/rG   rf   r   r   r   chunksqueezer   rV   rm   r   r-   sizer"   rn   r   r   r   	unsqueezer+   allr   r   padrl   copy_r!   r   r   splitr   rj   r   expr  floatr   emptyra   r:   rt   r    ro   r  r   r   r  )r/   r@   r  r   r|   seq_lenr   use_precomputed_statesprojected_statesgateconv_weightsrn   ssm_parameters	time_stepBCdiscrete_time_stepr	  time_proj_biasscan_outputsnscan_outputs_	ssm_state
ssm_state_contextualized_statess                            r3   cuda_kernels_forward$ZambaMambaMixer.cuda_kernels_forwards  sn    "/!4!4
Q!-T!9!nl>]>]!nbimnbn  <<6@@AF.33JAwOUUVW]^U_%--a0;;=||A||J(:(:BHRRSTVWX {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M)%))Na<O2P2P -0H0H0K K' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]$++JZJZgkgvgvwM)%))Na<O2P2P -0H0H0K K
 &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++00$2E2EtGZGZ[ac
	a "00D9I<O<OPRTV<WWYYtzz'')** 7;6G6G6S**002Y]{{J7#;MDXDX`m`s`st!4--. 6 ++DNN;AqDA!!S!),&q#qy1aDaAgJaAgJFF1ICO"1% $! )B-   %yy,)FAN /  Q 3 3T5H5HI$++#))I
 4--.,=!!$&q)aDaDNN1a(aDNN1a(FF1IOO%G"1%#'&*-)z  %yy,)FANYY[!IIy*2F2Fq2I&JPQR	 / $)A''7==iH !%l.D.DQ.J K$$r5   c           
      R   UR                   u  pEnUR                  nU R                  U5      R                  SS5      nUR	                  USSU5      R                  SSS9u  pU	R                  S5      R                  5       n	U
R                  S5      n
U
R                  X@R                  SU5      R                  SS5      n
[        U[        5      nU(       Ga  UR                  U R                     R                   S   U:X  Ga  U R                  (       a(  UR                  U R                     R                  5       nOUR                  U R                     nUR!                  U	R"                  5      nUR$                  (       Ga  US:X  Ga  UR&                  U R                     R                   S   U:X  a  UR&                  U R                     n[(        R*                  " USSS9nU	S S 2S S 2S4   US S 2S S 2S4'   XR&                  U R                  '   [(        R,                  " XR.                  R0                  S S 2SS S 24   -  SS9n	U R2                  (       a  XR.                  R4                  -  n	U R7                  U	5      R!                  U5      R9                  S5      n	GOUbH  [(        R:                  " US:H  5      (       d*  XS S 2U	R                   S   * S 24   R9                  S5      -  n	[<        R>                  RA                  XRB                  U	R                   S   -
  S45      nXR&                  U R                  '   U R7                  U R/                  U	5      SS U24   5      n	UbH  [(        R:                  " US:H  5      (       d*  XS S 2U	R                   S   * S 24   R9                  S5      -  n	O[(        RD                  " X@R                  U RF                  U RH                  4U	R"                  US9nUb1  [(        R:                  " US:H  5      (       d  XR9                  S5      -  n	U R7                  U R/                  U	5      SS U24   5      n	Ub1  [(        R:                  " US:H  5      (       d  XR9                  S5      -  n	U	R                  SU R                  U RF                  U5      R                  SS5      n	U RJ                  S S 2S S S 2S S 24   U	-  R                  SS	5      n[(        RL                  " XRN                  U RH                  U RH                  /SS9u  nnnU RP                  S S 2S 4   UR                  SS	5      -  U RR                  S S 2S S S 2S 4   -   n[<        R>                  RU                  U5      n[(        RV                  " U RX                  R[                  5       5      * n[(        RV                  " US S 2S S S 2S S S 24   US S 2S S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S S 2S 4   US S 2S S 2S S S 2S S 24   R[                  5       -  nUU	S S 2S S 2S S 2S S 2S 4   R[                  5       -  n/ n[]        U5       H  nUS S 2S S 2S S 2US S 24   R                  SS5      U-  US S 2S S 2S S 2US S 24   R                  SS5      -   n[(        R^                  " UR                  SS5      R!                  U5      US S 2S S 2US S 24   R9                  S5      5      nURa                  US S 2S S 2S S 2S4   5        M     [(        Rb                  " USS9nUXRd                  S S 2S S S 2S 4   -  -   nUU R7                  U
5      -  nU(       a  XR                  U R                  '   U Rg                  UR                  SS5      R                  USU5      R                  SS5      5      nU$ )
Nr   r7   r8   r   r   )shiftsdims.r`   r   )4rG   r:   r   r   r   r  r  r   rV   rm   
isinstancer]   ro   r   r   cloner;   ra   rf   rn   r+   rollsumr   r-   r   r   r   r  r  r   r   r  rl   rv   r   rj   r   r  r   r   r   softplusr  r  r  rt   r   rw   stackr  r  )r/   input_statesr  r   r|   r  r   r:   r  r@   r  	use_cacher)  
conv_stater   r!  r"  r#  r$  r	  
discrete_A
discrete_BdeltaB_ur&  r}   scan_outputr+  s                              r3   slow_forwardZambaMambaMixer.slow_forward  s   !-!3!3
Q""<<5??1E.33JAwOUUVW]^U_%--a0;;=||A||J(:(:BHRRSTVWX|-DE	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I ///qL ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O!-eiiRS@S6T6T$11}GZGZ[]G^F^F`C`4a4k4klm4n$nM]]..}?T?TWdWjWjkmWn?npq>rs
;E((8 $])CC'M)R S!-eiiRS@S6T6T$11}GZGZ[]G^F^F`C`4a4k4klm4n$nM//1D1DdFYFYZ$++I
 )%))Na<O2P2P -0H0H0K K HHT[[%?XgX%NOM)%))Na<O2P2P -0H0H0K K &--b$2D2DdFYFY[bcmmnoqrs,,Qa];mKVVWY[]^++00$2E2EtGZGZ[ac
	1a #11!T':Y=P=PQSUW=XX\`\m\mtQ]
 
  ]]334FG YYtzz'')**YYqD!T1!458J1aQRTUW[K[8\\]
'1aD(89AaD!Q>N<O<U<U<WW
aAq$.> ? E E GGwA"1aAq=1;;AqAIMPXYZ\]_`bcefYfPgPqPqrsuvPwwI,,y':':1a'@'C'CE'JAaQRTUWXjMLcLcdfLghKAq!QJ 78   kk,B7!]VVAtQ<L5M%MN!DHHTN26?##DNN3 !%!!!Q'//
BHRRSTVWX!
 %$r5   c                     U R                   (       aJ  [        (       a$  SU R                  R                  R                  ;  a  [        S5      eU R                  XUS9$ U R                  XUS9$ )NcudazFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config.)r   )r   r  r   ra   type
ValueErrorr,  r>  )r/   r@   r  r   s       r3   rC   ZambaMambaMixer.forward1  sl      ))V4;M;M;T;T;Y;Y-Y i 
 ,,]Yg,hh  ^ \\r5   )r  r  r   r   r{   r   rl   r   r   r0   r   rh   r   r   rm   r  rj   r   r   r   r   r   r#   )rK   rL   rM   rN   r   r   r)   r+   r   r]   r,  r>  rC   rO   rP   rQ   s   @r3   r   r   )  s^    
;{ ;| im_%"\\_%9P_%B[%7N [%z	]3J 	] 	]r5   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )ZambaMLPi>  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [

class ZambaMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


class ZambaAttentionDecoderLayer(nn.Module):
    def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None):
        super().__init__()
        self.self_attn = ZambaAttention(config, layer_idx)
        self.feed_forward = ZambaMLP(config)
        self.input_layernorm = ZambaRMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps)
        self.pre_ff_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: torch.Tensor,
        layer_idx: int,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
                concatenated tensor is then used as input of the pre-attention RMSNorm
                (see fig. 2 in https://huggingface.co/papers/2405.16712).
            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1)
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            layer_idx=layer_idx,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            **kwargs,
        )
        # feed-forward (MLP)
        hidden_states = self.pre_ff_layernorm(hidden_states)
        hidden_states = self.feed_forward(hidden_states)

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        return outputs


class ZambaMambaDecoderLayer(nn.Module):
    def __init__(self, config: ZambaConfig, layer_idx: int):
        super().__init__()
        self.mamba = ZambaMambaMixer(config=config, layer_idx=layer_idx)
        self.input_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        causal_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        transformer_hidden_states: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        residual = hidden_states

        # `transformer_hidden_states` is the output from shared transformer + linear layer
        # (see fig. 2 in https://huggingface.co/papers/2405.16712); it is added to the mamba layer input.
        hidden_states = (
            hidden_states + transformer_hidden_states if transformer_hidden_states is not None else hidden_states
        )
        hidden_states = self.input_layernorm(hidden_states)
        hidden_states = self.mamba(
            hidden_states=hidden_states,
            cache_params=past_key_value,
            attention_mask=attention_mask,
        )

        self_attn_weights = None

        # residual connection after mamba
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (self_attn_weights,)
        if use_cache:
            outputs += (past_key_value,)
        return outputs


class ZambaHybridLayer(nn.Module):
    def __init__(self, shared_transf: ZambaAttentionDecoderLayer, linear: nn.Linear, mamba: ZambaMambaDecoderLayer):
        super().__init__()
        self.shared_transf = shared_transf
        self.linear = linear
        self.mamba_decoder = mamba

    def forward(
        self,
        hidden_states: torch.Tensor,
        original_hidden_states: Optional[torch.Tensor] = None,
        layer_idx: Optional[int] = None,
        attention_mask: Optional[torch.Tensor] = None,
        causal_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[ZambaHybridDynamicCache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
                hidden activations to form the input of the shared transformer layer.
            layer_idx (`int`): layer number.
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
        """
        layer_outputs = self.shared_transf(
            hidden_states,
            original_hidden_states=original_hidden_states,
            layer_idx=layer_idx,
            attention_mask=causal_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        transformer_hidden_states = layer_outputs[0]

        if output_attentions:
            self_attn_weights = layer_outputs[1]

        transformer_hidden_states = self.linear(transformer_hidden_states)

        layer_outputs = self.mamba_decoder(
            hidden_states,
            transformer_hidden_states=transformer_hidden_states,
            attention_mask=attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
        )

        if output_attentions:
            layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:]

        return layer_outputs


@auto_docstring
class ZambaPreTrainedModel(PreTrainedModel):
    config: ZambaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["ZambaAttentionDecoderLayer", "ZambaMambaDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = False
    _is_stateful = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, ZambaRMSNorm):
            module.weight.data.fill_(1.0)
        elif isinstance(module, ZambaMambaMixer):
            module.x_proj_weight.data.normal_(mean=0.0, std=std)
            dt_init_std = self.config.mamba_dt_rank**-0.5
            nn.init.uniform_(module.dt_proj_weight, -dt_init_std, dt_init_std)

            mamba_head_dim = self.config.mamba_expand * self.config.hidden_size // self.config.n_mamba_heads
            dt = torch.exp(
                torch.rand(self.config.n_mamba_heads, mamba_head_dim)
                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
                + math.log(self.config.time_step_min)
            ).clamp(min=self.config.time_step_floor)
            # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
            inv_dt = dt + torch.log(-torch.expm1(-dt))
            module.dt_proj_bias.data.copy_(inv_dt)

            A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
            A = A.expand(module.intermediate_size, -1).contiguous()
            module.A_log.data.copy_(torch.log(A).reshape(module.n_mamba_heads, module.mamba_head_dim, -1))
            module.D.data.fill_(1.0)


@auto_docstring
class ZambaModel(ZambaPreTrainedModel):
    """
    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`].

    Args:
        config: ZambaConfig
    """

    def __init__(self, config: ZambaConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
        block = ZambaAttentionDecoderLayer(config)
        mamba_layers = []
        linear_layers = []
        self.layers_block_type = config.layers_block_type
        for i in range(config.num_hidden_layers):
            if config.layers_block_type[i] == "mamba":
                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
            elif config.layers_block_type[i] == "hybrid":
                linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False))
                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
        mamba_layers = iter(mamba_layers)
        linear_layers = iter(linear_layers)
        layers = []
        self._tied_weights_keys = []
        for layer_id, layer_type in enumerate(self.layers_block_type):
            if layer_type == "hybrid":
                prefix_name = f"layers.{layer_id}."
                tied_keys = [
                    "shared_transf.self_attn.q_proj.weight",
                    "shared_transf.self_attn.k_proj.weight",
                    "shared_transf.self_attn.v_proj.weight",
                    "shared_transf.self_attn.o_proj.weight",
                    "shared_transf.feed_forward.gate_proj.weight",
                    "shared_transf.feed_forward.up_proj.weight",
                    "shared_transf.feed_forward.down_proj.weight",
                    "shared_transf.input_layernorm.weight",
                    "shared_transf.pre_ff_layernorm.weight",
                ]
                self._tied_weights_keys = [*self._tied_weights_keys, *[prefix_name + key for key in tied_keys]]
                layers.append(ZambaHybridLayer(block, next(linear_layers), next(mamba_layers)))
            else:
                layers.append(next(mamba_layers))
        self.layers = nn.ModuleList(layers)

        self._attn_implementation = config._attn_implementation
        self.final_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[tuple, BaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError(
                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
            )

        if self.gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
            )
            use_cache = False

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)
        hidden_states = inputs_embeds

        original_hidden_states = torch.clone(inputs_embeds)
        # original_hidden_states: word embedding output that will be concatenated with hidden activations
        # to form the input of the shared transformer layer

        if use_cache and past_key_values is None:
            logger.warning_once(
                "Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was "
                "provided, so no cache will be returned."
            )

        if cache_position is None:
            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None

        for layer_idx, layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer.__call__,
                    hidden_states,
                    original_hidden_states,
                    layer_idx,
                    attention_mask,
                    causal_mask,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                )
            else:
                layer_outputs = layer(
                    hidden_states,
                    original_hidden_states=original_hidden_states,
                    layer_idx=layer_idx,
                    attention_mask=attention_mask,
                    causal_mask=causal_mask,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                    cache_position=cache_position,
                )
            hidden_states = layer_outputs[0]

            if output_attentions:
                if layer_outputs[1] is not None:
                    all_self_attns += (layer_outputs[1],)

        hidden_states = self.final_layernorm(hidden_states)

        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if past_key_values and not past_key_values.has_previous_state:
            past_key_values.has_previous_state = True

        output = BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )
        return output if return_dict else output.to_tuple()

    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
        if self.config._attn_implementation == "flash_attention_2":
            if attention_mask is not None and 0.0 in attention_mask:
                return attention_mask
            return None

        dtype, device = input_tensor.dtype, input_tensor.device
        min_dtype = torch.finfo(dtype).min
        sequence_length = input_tensor.shape[1]
        target_length = cache_position[-1] + 1

        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
        if sequence_length != 1:
            causal_mask = torch.triu(causal_mask, diagonal=1)
        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
        if attention_mask is not None:
            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
            if attention_mask.dim() == 2:
                mask_length = attention_mask.shape[-1]
                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)

        if (
            self.config._attn_implementation == "sdpa"
            and attention_mask is not None
            and attention_mask.device.type in ["cuda", "xpu", "npu"]
        ):
            # Attend to all tokens in fully masked rows in the SDPA case; this is required by
            # F.scaled_dot_product_attention's memory-efficient attention path, e.g. with left padding.
            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)

        return causal_mask


class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
    def __init__(self, config: ZambaConfig):
        super().__init__(config)
        self.model = ZambaModel(config)
        self._tied_weights_keys = ["lm_head.weight", *self.model._tied_weights_keys]
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def set_decoder(self, decoder):
        self.model = decoder

    def get_decoder(self):
        return self.model

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[ZambaHybridDynamicCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, ZambaForCausalLM

        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        use_cache=True,
        **kwargs,
    ):
        # Overwritten -- this model uses its own cache type, `ZambaHybridDynamicCache`

        empty_past_kv = past_key_values is None

        # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        # Exception 1: when passing input_embeds, input_ids may be missing entries
        # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        if not empty_past_kv:
            if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]:  # Exception 1
                input_ids = input_ids[:, -cache_position.shape[0] :]
            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
                input_ids = input_ids[:, cache_position]
        else:
            past_key_values = ZambaHybridDynamicCache(
                self.config, input_ids.shape[0], dtype=self.dtype, device=self.device
            )

        if attention_mask is not None and position_ids is None:
            # create position_ids on the fly for batch generation
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            if not empty_past_kv:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
        if inputs_embeds is not None and empty_past_kv:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids.contiguous()}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": use_cache,
                "attention_mask": attention_mask,
                "num_logits_to_keep": self.config.num_logits_to_keep,
                "cache_position": cache_position,
            }
        )
        return model_inputs
    The Zamba Model with a sequence classification head on top (linear layer).

    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-2) do.

    Since it does classification on the last token, it needs to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
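
    For example (an added worked example, not part of the original docstring): with right padding,
    `pad_token_id = 0`, and a batch row of input ids `[5, 7, 9, 0, 0]`, the classification logits are
    read from position 2, the last non-pad token.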
    """
)
class ZambaForSequenceClassification(ZambaPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.model = ZambaModel(config)
        self._tied_weights_keys = self.model._tied_weights_keys
        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        self.model.embed_tokens = value

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
        r"""
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
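
Example (an added sketch, not in the original file; the base checkpoint carries no trained
classification head, so `num_labels` is set explicitly and the scores are untrained):

```python
>>> import torch
>>> from transformers import AutoTokenizer, ZambaForSequenceClassification

>>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")
>>> model = ZambaForSequenceClassification.from_pretrained("Zyphra/Zamba-7B-v1", num_labels=2)

>>> inputs = tokenizer("This film was great!", return_tensors="pt")
>>> with torch.no_grad():
...     logits = model(**inputs).logits
>>> predicted_class_id = int(logits.argmax(dim=-1))
```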
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        transformer_outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = transformer_outputs[0]
        logits = self.score(hidden_states)

        if input_ids is not None:
            batch_size = input_ids.shape[0]
        else:
            batch_size = inputs_embeds.shape[0]

        if self.config.pad_token_id is None and batch_size != 1:
            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
        if self.config.pad_token_id is None:
            last_non_pad_token = -1
        elif input_ids is not None:
            # To handle both left- and right-padding, take the rightmost token that is not the pad token
            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
        else:
            last_non_pad_token = -1
            logger.warning_once(
                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
            )

        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(pooled_logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(pooled_logits, labels)
        if not return_dict:
            output = (pooled_logits,) + transformer_outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return SequenceClassifierOutputWithPast(
            loss=loss,
            logits=pooled_logits,
            past_key_values=transformer_outputs.past_key_values,
            hidden_states=transformer_outputs.hidden_states,
            attentions=transformer_outputs.attentions,
        )


__all__ = ["ZambaForCausalLM", "ZambaForSequenceClassification", "ZambaModel", "ZambaPreTrainedModel"]