
    <hA                       S r SSKrSSKJrJrJr  SSKrSSKJs  J	r
  SSKrSSKJr  SSKJr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJrJr  SSKJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'  SSK(J)r)J*r*  SSK+J,r,  \" 5       (       a  SSKJ-r-  \*" 5       (       a  SSK.J/r/J0r0  SSK1J2r2  OSu  r2r0r/\)" 5       (       a	  SSK3J4r4J5r5  OSu  r5r4\6" \2\0\4\5\/45      r7\'Rp                  " \95      r:   S@S\\Rv                  \<\Rv                     S4   S\\=   S\\Rv                     S\\Rv                  \=4   4S jjr> " S S\R~                  5      r@S \Rv                  S!\=S\Rv                  4S" jrA " S# S$\5      rB " S% S&\R~                  5      rC " S' S(\C5      rD " S) S*\C5      rE\C\D\ES+.rF " S, S-\R~                  5      rG " S. S/\R~                  5      rH " S0 S1\R~                  5      rI " S2 S3\5      rJ " S4 S5\5      rK\% " S6 S7\ 5      5       rL\J\KS8.rM\% " S9 S:\L5      5       rN " S; S<\L\5      rO " S= S>\\L5      rP/ S?QrQg)AzPyTorch Jamba model.    N)AnyOptionalUnion)nn   )ACT2FN)CacheDynamicCacheDynamicLayer)GenerationMixin)AttentionMaskConverter)!flash_attn_supports_top_left_maskis_flash_attn_available) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)PreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)is_causal_conv1d_availableis_mamba_ssm_available   )JambaConfig)_flash_attention_forward)mamba_inner_fnselective_scan_fn)selective_state_update)NNN)causal_conv1d_fncausal_conv1d_updateNNrouter_logitsnum_expertsattention_maskreturnc                 d   U b  [        U [        5      (       d  g[        U [        5      (       aB  U S   R                  n[        R                  " U  Vs/ sH  oUR                  U5      PM     snSS9n[        R                  R                  R                  WSS9n[        R                  " XrSS9u  p[        R                  R                  R                  X5      n
Uc:  [        R                  " U
R                  5       SS9n[        R                  " USS9nGO"UR                  u  pUR                  S   X-  -  nUSSS2SS2SS4   R                  XXU45      R                  SX!5      R                  W5      n[        R                   " U
R                  5       U-  SS9[        R                   " USS9-  nUSSS2SS2S4   R                  XXR                  S   45      R                  SUR                  S   5      R                  U5      n[        R                   " UU-  SS9[        R                   " USS9-  nUR                  S   [#        UR                  R$                  5      -  n[        R                   " USS2UUUR                  S   -   24   UR'                  S5      -  5      nUU-  $ s  snf )a|  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    router_logits:
        Logits from the `router`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [batch_size X sequence_length, num_experts].
    num_experts:
        Number of experts
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   dimr   )
isinstancetupledevicetorchcattor   
functionalsoftmaxtopkone_hotmeanfloatshapeexpandreshapesumintindex	unsqueeze)r%   r&   top_kr'   compute_devicelayer_routerconcatenated_router_logitsrouting_weights_selected_expertsexpert_masktokens_per_expertrouter_prob_per_expert
batch_sizesequence_lengthnum_hidden_layersexpert_attention_mask router_per_expert_attention_maskrankoverall_losss                       `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/jamba/modeling_jamba.pyload_balancing_loss_funcrR   G   s   : J}e$D$D-''&q)00%*YYANO__^,OUV&
" hh))112LRT1UO**_DA((%%--.>LK!JJ{'8'8':B "'O!C&4&:&:#
6<<Q?JD`a 4AtT12V&OKXYWR,R	 	 "IIk&7&7&9<Q&QWXY\a\e\e!q]
 
 4At+,V&OEZEZ[\E]^_WR..q12R	 	) "'?=]+]cd!ehmhqhq,!i
 "
   #c/*@*@*F*F&GGD99!TD?+@+@+C$CCCDG]GgGghiGjjL +%%c Ps   J-c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )JambaRMSNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
JambaRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parameterr0   onesweightvariance_epsilon)selfhidden_sizeeps	__class__s      rQ   rX   JambaRMSNorm.__init__   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   r,   T)keepdim)	dtyper2   r0   float32powr7   rsqrtr\   r[   )r]   hidden_statesinput_dtypevariances       rQ   forwardJambaRMSNorm.forward   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::rb   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r.   r[   r9   r\   r]   s    rQ   
extra_reprJambaRMSNorm.extra_repr   s*    ))*+6$2G2G1HIIrb   )r\   r[   )gư>)	__name__
__module____qualname____firstlineno__rX   rm   rq   __static_attributes____classcell__r`   s   @rQ   rT   rT      s    $;J Jrb   rT   rj   n_repc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)r9   r:   r;   )rj   rz   batchnum_key_value_headsslenhead_dims         rQ   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTrb   c                     ^  \ rS rSrSrSrSrSr\R                  S4U 4S jjr
 SS\R                  S\R                  S\S	\\\\4      S
\\R                  \R                  4   4
S jjrS\R&                  4S jrSS\\   S
\4S jjrS
\\\R                     \\R                     4   4S jr\SS\\\\R0                           S
S4S jj5       rSrU =r$ ) HybridMambaAttentionDynamicCache   a|  
A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
(which has a constant shape regardless of seq_len).

This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
NFc                   > [         T
U ]  [        S9  X0l        UR                  U l        SU l        UR                  UR                  -  nUR                  nUR                  n/ U l
        / U l        / U l        [        UR                  5       H  nU R                  U   S:X  aV  U =R                  [        R                   " X%XtUS9/-  sl
        U =R                  [        R                   " X%XdUS9/-  sl        Ml  U =R                  [        R"                  " / /U-  US9/-  sl
        U =R                  [        R"                  " / /U-  US9/-  sl        U R                  R%                  U5        M     [        UR                  5       V	s/ sH  n	[        R"                  " / /U-  US9PM     sn	U l        [        UR                  5       V	s/ sH  n	[        R"                  " / /U-  US9PM     sn	U l        g s  sn	f s  sn	f )N)layer_classesFmambar/   rf   r/   )rW   rX   r   rf   layers_block_typehas_previous_statemamba_expandr^   mamba_d_statemamba_d_convconv_states
ssm_statestransformer_layersrangerL   r0   zerostensorappend	key_cachevalue_cache)r]   configrJ   rf   r/   intermediate_sizessm_state_sizeconv_kernel_sizeirE   r`   s             rQ   rX   )HybridMambaAttentionDynamicCache.__init__   s   |4
!'!9!9"'"//&2D2DD--!.."$v//0A%%a(G3  KK
?Ofkl%   KK
~dij$    U\\2$2CF%S$TT ELL"
1B6$R#SS''..q1 1 SXX^XpXpRqrRqQ%,,tj'8HRqrTYZ`ZrZrTstTsqELL"
):6JTst sts   "G4	"G9
key_statesvalue_states	layer_idxcache_kwargsr(   c                 |   U R                   U   R                  S   S:X  a  XR                   U'   X R                  U'   Ob[        R                  " U R                   U   U/SS9U R                   U'   [        R                  " U R                  U   U/SS9U R                  U'   U R                   U   U R                  U   4$ )Nr,   r   rd   r*   )r   r9   r   r0   r1   )r]   r   r   r   r   s        rQ   update'HybridMambaAttentionDynamicCache.update   s     >>)$**2.!3(2NN9%*6Y'(-		4>>)3Lj2Y_`(aDNN9%*/))T5E5Ei5PR^4_ef*gDY'~~i($*:*:9*EEErb   beam_idxc                    [        [        U R                  5      5       GHT  nU R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   GMW     g)zDReorders the cache for beam search, given the selected beam indices.r   N)	r   lenr   r/   index_selectr2   r   r   r   )r]   r   r   r/   s       rQ   reorder_cache.HybridMambaAttentionDynamicCache.reorder_cache   s=   s4>>23I^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI& 4rb   c                     XR                   ;  a  U R                   S   OUn[        U R                  5      U::  a  gU R                  U   R                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   )r   r   r   r9   )r]   r   s     rQ   get_seq_length/HybridMambaAttentionDynamicCache.get_seq_length  sP     3<CZCZ2ZD++A.`i	t~~)+~~i(..r22rb   c                     [        S5      eNzIHybridMambaAttentionDynamicCache does not have a legacy cache equivalent.NotImplementedErrorrp   s    rQ   to_legacy_cache0HybridMambaAttentionDynamicCache.to_legacy_cache  s    !"mnnrb   past_key_valuesr
   c                     [        S5      er   r   )clsr   s     rQ   from_legacy_cache2HybridMambaAttentionDynamicCache.from_legacy_cache  s    !"mnnrb   )r   rf   r   r   r   r   r   r   N)r   )rs   rt   ru   rv   __doc__r   r   is_compileabler0   float16rX   Tensorr=   r   dictstrr   r.   r   
LongTensorr   r   r   classmethodFloatTensorr   rw   rx   ry   s   @rQ   r   r      s!    IKN16t u@ 26FLLF llF 	F
 tCH~.F 
u||U\\)	*F"ie&6&6 i3 3c 3ouU\\':E%,,<O'O!P o ouUEVEV?W9X0Y oes o orb   r   c                   >  ^  \ rS rSrSrSS\S\\   4U 4S jjjr      SS\	R                  S\\	R                     S\\	R                     S	\\   S
\S\S\\	R                     S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )JambaAttentioni  z
Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
and "Generating Long Sequences with Sparse Transformers".
r   r   c                 "  > [         TU ]  5         Xl        X l        Uc-  [        R                  SU R                  R                   S35        UR                  U l        UR                  U l
        U R                  U R                  -  U l        UR                  U l        U R                  U R                  -  U l        SU l        UR                  U l        U R                  U R                  -  U R                  :w  a&  [!        SU R                   SU R                   S35      e["        R$                  " U R                  U R                  U R                  -  SS9U l        ["        R$                  " U R                  U R                  U R                  -  SS9U l        ["        R$                  " U R                  U R                  U R                  -  SS9U l        ["        R$                  " U R                  U R                  -  U R                  SS9U l        g )	NzInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.Tz?hidden_size must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).Fbias)rW   rX   r   r   loggerwarning_oncer`   rs   r^   num_attention_heads	num_headsr   r}   num_key_value_groups	is_causalattention_dropout
ValueErrorr   Linearq_projk_projv_projo_proj)r]   r   r   r`   s      rQ   rX   JambaAttention.__init__"  s   " !8!8 9 :, , "--33((DNN:#)#=#= $(NNd6N6N$N!!'!9!9MMDNN*t/?/??QRVRbRbQc$T^^$4B8  ii 0 0$..4==2PW\]ii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii >@P@PW\]rb   rj   r'   position_idspast_key_valueoutput_attentions	use_cachecache_positionr(   c                    UR                  5       u  pn
U R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUb  UR                  XU R                  5      u  p[        XR                  5      n[        XR                  5      n[        R                  " XR                  SS5      5      [        R                   " U R                  5      -  nUb"  US S 2S S 2S S 2S UR"                  S   24   nX-   n[$        R&                  R)                  US[        R*                  S9R-                  UR.                  5      n[$        R&                  R1                  XR2                  U R4                  S9n[        R                  " X5      nUR                  5       XR
                  XR                  4:w  a5  [7        SXR
                  XR                  4 S	UR                  5        35      eUR                  SS5      R9                  5       nUR;                  XU R<                  5      nU R?                  U5      nU(       d  S nUX4$ )
Nr   rd   r   r   r,   r+   rf   )ptrainingz `attn_output` should be of size z	, but is ) sizer   r   r   viewr   r   	transposer}   r   r   r   r   r0   matmulmathsqrtr9   r   r3   r4   rg   r2   rf   dropoutr   r   r   
contiguousr;   r^   r   )r]   rj   r'   r   r   r   r   r   bszq_lenrE   query_statesr   r   attn_weightscausal_maskattn_outputs                    rQ   rm   JambaAttention.forward?  s    &**,A{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm%'5'<'<ZW[WeWe'f$J z+D+DE
 /H/HI||L2F2Fq!2LMPTPYPYZ^ZgZgPhh%(Aq2HJ4D4DR4H2H)HIK'5L }},,\r,WZZ[g[m[mn}},,\=S=S^b^k^k,lll<>#~~umm!LL2CP]P]3^2_ `$$&') 
 "++Aq1<<>!))#d6F6FGkk+. LL88rb   )r   r   r   r^   r   r   r   r   r   r}   r   r   r   r   NNNFFN)rs   rt   ru   rv   r   r   r   r=   rX   r0   r   r   r   boolr.   rm   rw   rx   ry   s   @rQ   r   r     s    
^{ ^x} ^ ^@ 2637EI"'5949||49 !.49 u//0	49
 !!AB49  49 49 !!1!1249 
u||Xell3XeELL>Q5RR	S49 49rb   r   c                      ^  \ rS rSrSrU 4S jr      SS\R                  S\\R                     S\\R                     S\\
   S\S	\S
\\R                     4S jjrSrU =r$ )JambaFlashAttention2iw  a6  
Jamba flash attention module. This module inherits from `JambaAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
c                 D   > [         TU ]  " U0 UD6  [        5       U l        g r   )rW   rX   r   _flash_attn_uses_top_left_mask)r]   argskwargsr`   s      rQ   rX   JambaFlashAttention2.__init__~  s#    $)&)
 /P.Q+rb   rj   r'   r   r   r   r   r   c                 p   UR                  5       u  pnU R                  U5      nU R                  U5      nU R                  U5      nUR	                  XU R
                  U R                  5      nUR	                  XU R                  U R                  5      R                  SS5      nUR	                  XU R                  U R                  5      R                  SS5      nUb  UR                  XU R                  5      u  p[        XR                  5      n[        XR                  5      nU R                  (       d  SOU R                  nUR                  nUR                   R"                  S:w  a  UR                   R"                  OSnU[$        R&                  :X  a  [$        R(                  " 5       (       aA  [+        [$        S5      (       a  [$        R,                  " U5      O[$        R.                  " 5       nOR[+        U R0                  S5      (       a  U R0                  R2                  nO U R                  R4                  R                  n[6        R9                  SU S	35        UR;                  U5      nUR;                  U5      nUR;                  U5      nUR                  SS5      nUR                  SS5      n[=        UUUUU
U[?        U R0                  S
S 5      U R@                  U RB                  S9	nURE                  XU RF                  5      RI                  5       nU RK                  U5      nU(       d  S nUWU4$ )Nr   rd           mpscpuget_autocast_dtype_pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .sliding_window)r   r   r   use_top_left_mask)&r   r   r   r   r   r   r   r}   r   r   r   r   r   r   r   rf   r/   typer0   rg   is_autocast_enabledhasattrr   get_autocast_gpu_dtyper   r   r[   r   r   r2   r   getattrr   r   r;   r^   r   r   )r]   rj   r'   r   r   r   r   r   r   r   r   rE   r   r   r   dropout_raterk   device_typetarget_dtyper   r   s                        rQ   rm   JambaFlashAttention2.forward  s    &**,A{{=1[[/
{{=1
 $((T^^T]]S__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm%'5'<'<ZW[WeWe'f$J z+D+DE
 /H/HI"&--sT5K5K
 #((2>2E2E2J2Je2Sl))..Y^%--'((** u&:;; ,,[9557  &?@@#{{BB#{{1177 >$ (??<8L#|4J'??<8L  ))!Q/
#--a3. "4;;0@$Gnn"AA

 "))#d6F6FGRRTkk+. LL.88rb   )r   r   )rs   rt   ru   rv   r   rX   r0   r   r   r   r   r   rm   rw   rx   ry   s   @rQ   r   r   w  s    R 2637EI"'59R9||R9 !.R9 u//0	R9
 !!ABR9  R9 R9 !!1!12R9 R9rb   r   c                   "  ^  \ rS rSrSr      SS\R                  S\\R                     S\\R                     S\\	   S\
S\
S	\\R                     S
\\R                  \\R                     \\\R                        4   4U 4S jjjrSrU =r$ )JambaSdpaAttentioni  z
Jamba attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
`JambaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
SDPA API.
rj   r'   r   r   r   r   r   r(   c           	      (  > U(       a'  [         R                  S5        [        TU ]  UUUUUUS9$ UR	                  5       u  pn
U R                  U5      nU R                  U5      nU R                  U5      nUR                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nUR                  XU R                  U R                  5      R                  SS5      nUb  UR                  XU R                  5      u  p[        XR                   5      n[        XR                   5      nUnUb  US S 2S S 2S S 2S UR"                  S   24   nUR$                  R&                  S:X  a3  Ub0  UR)                  5       nUR)                  5       nUR)                  5       nU R*                  =(       a    US L =(       a    U	S:  n[,        R.                  R0                  R3                  UUUUU R4                  (       a  U R6                  OSUS9nUR                  SS5      R)                  5       nUR                  XU R8                  5      nU R;                  U5      nUS U4$ )	Na  JambaModel is using JambaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)rj   r'   r   r   r   r   r   rd   r   cudar   )	attn_mask	dropout_pr   )r   r   rW   rm   r   r   r   r   r   r   r   r   r}   r   r   r   r   r9   r/   r   r   r   r0   r   r3   scaled_dot_product_attentionr   r   r^   r   )r]   rj   r'   r   r   r   r   r   r   r   rE   r   r   r   r   r   r   r`   s                    rQ   rm   JambaSdpaAttention.forward  sk    [ 7?+-)-"3# #   &**,A{{=1[[/
{{=1#((T^^T]]S]]^_abc__S1I1I4==Yccdeghi
#((T5M5Mt}}]gghiklm%'5'<'<ZW[WeWe'f$Jz+D+DE
 /H/HI$%%aA/E1A1A"1E/E&EFK ##v-.2L'224L#..0J'224L
 NNH{d':Huqy	hh))FF!04d,,3 G 
 "++Aq1<<>!&&s43C3CDkk+.D.00rb    r   )rs   rt   ru   rv   r   r0   r   r   r   r   r   r.   rm   rw   rx   ry   s   @rQ   r
  r
    s     2637EI"'59G1||G1 !.G1 u//0	G1
 !!ABG1  G1 G1 !!1!12G1 
u||Xell3XeELL>Q5RR	SG1 G1rb   r
  )eagerflash_attention_2sdpac                      ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	S\
\R                     4S jjrSS\	S\
\R                     4S	 jjr  SS\	S\
\R                     4S
 jjrSrU =r$ )JambaMambaMixeri6  uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
r   c           	        > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        UR                  U l        UR                  UR                  -  U l
        UR                  U l        UR                  U l        UR                  U l        ["        R$                  " U R                  U R                  U R                  U R                  U R                  U R                  S-
  S9U l        UR(                  U l        [,        UR(                     U l        UR0                  U l        ["        R4                  " U R                  U R                  S-  U R                   S9U l        ["        R4                  " U R                  U R                  U R                  S-  -   SS9U l        ["        R4                  " U R                  U R                  SS9U l        [<        R>                  " SU R                  S-   5      S S S 24   nURA                  U R                  S5      RC                  5       n["        RD                  " [<        RF                  " U5      5      U l$        ["        RD                  " [<        RJ                  " U R                  5      5      U l&        ["        R4                  " U R                  U R                  U R                   S9U l'        [Q        U R                  URR                  S9U l*        [Q        U R                  URR                  S9U l+        [Q        U R                  URR                  S9U l,        [Z        (       d  [\        R_                  S	5        g g )
Nr   )in_channelsout_channelsr   kernel_sizegroupspaddingrd   r   FTr,   r_   ap  The fast path is not available because on of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config)0rW   rX   r   r   r^   r   r   r   r   r   r   mamba_dt_ranktime_step_rankmamba_conv_biasuse_conv_biasmamba_proj_biasuse_biasr   Conv1dconv1d
hidden_act
activationr   actuse_mamba_kernelsuse_fast_kernelsr   in_projx_projdt_projr0   aranger:   r   rY   logA_logrZ   Dout_projrT   rms_norm_epsdt_layernormb_layernormc_layernormis_fast_path_availabler   r   )r]   r   r   Ar`   s       rQ   rX   JambaMambaMixer.__init__>  s   "!--$22 & 3 3!'!4!4v7I7I!I$22#33..ii..//##--))))A-
 !++&++, & 8 8 yy!1!143I3IA3MTXTaTabii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!34T1W=HHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQUQ^Q^_()<)<&BUBUV'(;(;ATATU'(;(;ATATU%%^ &rb   rj   cache_paramsr'   c                 l
   UR                   u  pEnUS L=(       a|    UR                  =(       ai    US:H  =(       a]    UR                  U R                     R                   S   UR                  U R                     R                   S   s=:H  =(       a    U:H  Os  nU R                  U5      R                  SS5      nUR                  SSS9u  pUb  XR                  S5      -  nU R                  R                  R                  U R                  R                  R                  S5      U R                  R                  R                  S5      5      n
U(       ae  [        UR                  S5      UR                  U R                     U
U R                  R                  U R                   5      nUR                  S5      nOUbc  ["        R$                  R'                  XR(                  UR                   S   -
  S45      nUR                  U R                     R+                  U5        [-        XU R                  R                  U R                   S9nUb  XR                  S5      -  nU R/                  UR                  SS5      5      n[0        R2                  " XR4                  U R6                  U R6                  /SS9u  pnU R9                  U5      nU R;                  U5      nU R=                  U5      nU R>                  R                  R@                  n[0        RB                  " 5          [0        RD                  " U R>                  R                  R@                  5      U R>                  R                  l         S S S 5        U R?                  U5      R                  SS5      n[0        RB                  " 5          UU R>                  R                  l         S S S 5        [0        RF                  " U RH                  RK                  5       5      * nUb  URK                  5       OS nU(       aZ  [M        UR                  U R                     US   US   UUS S 2S4   US S 2S4   U RN                  U	S   USS	9
R                  S5      nO{[Q        UUUUR                  SS5      UR                  SS5      U RN                  RK                  5       U	USSS
9
u  nnUb+  Ub(  UR                  U R                     R+                  U5        U RS                  UR                  SS5      5      nU$ ! , (       d  f       GN= f! , (       d  f       GNd= f)Nr   r   rd   r*   r,   )r'  ).r   T)dt_softplus)delta_softplusreturn_last_state)*r9   r   r   r   r   r+  r   chunkr?   r%  r[   r   r   r#   squeezer   r'  r   r3   padr   copy_r"   r,  r0   splitr  r   r4  r5  r6  r-  datano_grad
zeros_likeexpr0  r8   r!   r1  r    r2  )r]   rj   r:  r'   rJ   seq_lenrE   use_precomputed_statesprojected_statesgateconv_weightsr   ssm_parameters	time_stepBCtime_proj_biasdiscrete_time_stepr8  scan_outputs	ssm_statecontextualized_statess                         rQ   cuda_kernels_forward$JambaMambaMixer.cuda_kernels_forwardr  sg    "/!4!4
Q$ //1 ((8>>qA&&t~~6<<Q? 	 	  <<6@@AF /44QA4>%),D,DQ,GGM {{))..t{{/A/A/F/Fq/I4;;K]K]KbKbcdKef!0%%b)((8  M *33B7M' mm//@U@UXeXkXklnXo@oqr?st((8>>{K,]$++JZJZgkgvgvwM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	a %%i0	QQ **//]]_%*%5%5dll6G6G6L6L%MDLL" !\\)4>>q!D]]_%3DLL"  YYtzz'')**3A3M--/SW!1''7f%"6*!Q$!Q$V  im  '8"Aq!Aq!#"&'#L) $)A''7==iH !%l.D.DQ.J K$$S _ _s   AT+T$
T!$
T3c           	         UR                   u  pEnUR                  nU R                  U5      R                  SS5      nUR	                  SSS9u  pUb  XR                  S5      -  n	[        U[        5      nU(       GaB  UR                  U R                     R                   S   U:X  Ga  U R                  (       a(  UR                  U R                     R                  5       nOUR                  U R                     nUR                  U	R                  5      nUR                  (       Ga  US:X  Ga  UR                  U R                     R                   S   U:X  a  UR                  U R                     n[         R"                  " USSS9nU	S S 2S S 2S4   US S 2S S 2S4'   XR                  U R                  '   [         R$                  " XR&                  R(                  S S 2SS S 24   -  SS9n	U R*                  (       a  XR&                  R,                  -  n	U R/                  U	5      R                  U5      R                  S5      n	O[0        R2                  R5                  U	U R6                  U	R                   S   -
  S45      nXR                  U R                  '   U R/                  U R'                  U	5      SS U24   5      n	O][         R8                  " X@R:                  U R<                  4U	R                  US9nU R/                  U R'                  U	5      SS U24   5      n	Ub  XR                  S5      -  n	U R?                  U	R                  SS5      5      n[         R@                  " XRB                  U R<                  U R<                  /SS9u  nnnU RE                  U5      nU RG                  U5      nU RI                  U5      nU RK                  U5      n[0        R2                  RM                  U5      R                  SS5      n[         RN                  " U RP                  RS                  5       5      * n[         RN                  " US S S 2S S S 24   US S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S 4   US S 2S S S 2S S 24   RS                  5       -  nUU	S S 2S S 2S S 2S 4   RS                  5       -  n/ n[U        U5       H  nUS S 2S S 2US S 24   U-  US S 2S S 2US S 24   -   n[         RV                  " UR                  U5      US S 2US S 24   R                  S5      5      nURY                  US S 2S S 2S4   5        M     [         RZ                  " USS9nUXR\                  S S S 2S 4   -  -   nUU R/                  U
5      -  nU(       a  XR                  U R                  '   U R_                  UR                  SS5      5      nU$ )	Nr   rd   r*   r   r,   )shiftsdims.r   )0r9   rf   r+  r   r?  r?   r-   r   r   r   r   cloner2   r/   r   r   r0   rollr<   r%  r[   r!  r   r(  r   r3   rA  r   r   r   r   r,  rC  r  r4  r5  r6  r-  softplusrG  r0  r8   r   r   r   stackr1  r2  )r]   input_statesr:  r'   rJ   rH  rE   rf   rJ  rj   rK  r   rT  
conv_staterM  rN  rO  rP  rR  r8  
discrete_A
discrete_BdeltaB_urS  r   scan_outputrU  s                              rQ   slow_forwardJambaMambaMixer.slow_forward  s   !-!3!3
Q""<<5??1E.44QA4>%),D,DQ,GGM|-MN	00@FFqIZW}}(33DNNCIIK	(33DNNC	!]%9%9:I...7a< ,,T^^<BB1ES)55dnnE
"ZZ
2BG
'4Q1W'=
1a8$;E((8 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O]]..!**]-@-@-DDaH
 <F((8 $])CC'M)R S33T5H5HI$++5I !HHT[[%?XgX%NOM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	1a %%i0	QQ!\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DDwA"1aA:.:XaAqj=QQI,,y||E':AaAgJ<P<PQS<TUKAq!G 45   kk,B7!]VVD!TM5J%JK"TXXd^36?##DNN3 !%k.C.CAq.I J$$rb   c                     U R                   (       aV  [        (       a.  SU R                  R                  R                  R
                  ;  a  [        S5      eU R                  XU5      $ U R                  XU5      $ )Nr  zsFast Mamba kernels are not available. Make sure to they are installed and that the mamba module is on a CUDA device)	r*  r7  r,  r[   r/   r   r   rV  re  )r]   rj   r:  r'   s       rQ   rm   JambaMambaMixer.forward2  sk       ))V4;;;M;M;T;T;Y;Y-Y  J  ,,].YY  nMMrb   )r0  r1  r(  r'  r5  r6  r   r%  r   r4  r-  r^   r+  r   r   r2  r   r  r#  r!  r*  r,  r$   )rs   rt   ru   rv   r   r   rX   r0   r   r   r   r   rV  re  rm   rw   rx   ry   s   @rQ   r  r  6  s    2{ 2n :>59	h%||h% 7h% !!1!12	h%VR%7W R%pxy~  zJ  zJ  qK R%p :>59	N 7N !!1!12	N Nrb   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )JambaMLPiB  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g NFr   )rW   rX   r   r^   r   r   r   	gate_projup_proj	down_projr   r&  act_fnr]   r   r`   s     rQ   rX   JambaMLP.__init__C  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../rb   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )ro  rp  rm  rn  )r]   xro  s      rQ   rm   JambaMLP.forwardM  s6    NN4;;t~~a/@#ADLLQRO#ST	rb   )rp  r   ro  rm  r^   r   rn  )rs   rt   ru   rv   rX   rm   rw   rx   ry   s   @rQ   rj  rj  B  s    0 rb   rj  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	\R                  \R                  4   4S jr
SrU =r$ )	JambaSparseMoeBlockiS  a  
This implementation is
strictly equivalent to standard MoE with full capacity (no
dropped tokens). It's faster since it formulates MoE operations
in terms of block-sparse operations to accommodate imbalanced
assignments of tokens to experts, whereas standard MoE either
(1) drop tokens at the cost of reduced performance or (2) set
capacity factor to number of experts and thus waste computation
and memory on padding.
r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        R                  " U R                  U R                  SS9U l        [        R                  " [        U R                  5       Vs/ sH  n[        U5      PM     sn5      U l        g s  snf rl  )rW   rX   r^   
hidden_dimr   ffn_dimr&   num_experts_per_tokr@   r   r   router
ModuleListr   rj  experts)r]   r   rE   r`   s      rQ   rX   JambaSparseMoeBlock.__init___  s     ,,//!--//
ii1A1AN}}dFVFV@W%X@W1hv&6@W%XY%Xs   *Crj   r(   c                    UR                   u  p#nUR                  SU5      nU R                  U5      n[        R                  " US[
        R                  S9n[
        R                  " X`R                  SS9u  pgUR                  UR                  5      n[
        R                  " X#-  U4UR                  UR                  S9n[
        R                  R                  R                  XpR                   S9R#                  SSS5      n	[%        U R                   5       H  n
U R&                  U
   n[
        R(                  " X   5      u  pUR                   S   S:X  a  MA  US	U4   R+                  SU5      nU" U5      XmUS	4   -  nUR-                  SXR                  UR                  5      5        M     UR+                  X#U5      nX4$ )
 r,   r   r   r*   )rf   r/   )num_classesrd   r   N)r9   r   r|  Fr4   r0   r8   r5   r@   r2   rf   r   r/   r   r3   r6   r&   permuter   r~  wherer;   
index_add_)r]   rj   rJ   rK   ry  r%   rD   rF   final_hidden_statesrG   
expert_idxexpert_layeridxtop_xcurrent_statecurrent_hidden_statess                   rQ   rm   JambaSparseMoeBlock.forwardi  s   2?2E2E/
Z%**2z:M2))MqL,1JJ

XZ,[)),,]-@-@A#kk):6m>Q>QZgZnZn
 hh))112BP`P`1aiijkmnpqr   0 01J<<
3L[%<=JC{{1~"
 *$+6>>r:NM$0$?/Y\^bRbBc$c!  **1e5M5MmNaNa5bc 2  299*Wab"11rb   )r~  rz  ry  r&   r|  r@   )rs   rt   ru   rv   r   r   rX   r0   r   r.   rm   rw   rx   ry   s   @rQ   rw  rw  S  sD    	Z{ Z&2U\\ &2eELL%,,<V6W &2 &2rb   rw  c                   D  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\   S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )JambaAttentionDecoderLayeri  r   r   c                 F  > [         TU ]  5         UR                  U   n[        UR                     " X5      U l        US:  a  [        O[        nU" U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr   r  )rW   rX   layers_num_expertsJAMBA_ATTENTION_CLASSES_attn_implementation	self_attnrw  rj  feed_forwardrT   r^   r3  input_layernormpre_ff_layernormr]   r   r   r&   ffn_layer_classr`   s        rQ   rX   #JambaAttentionDecoderLayer.__init__  s    //	:01L1LMf`1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yrb   rj   r'   r   r   r   output_router_logitsr   r   r(   c	           
      <   Un	U R                  U5      nU R                  UUUUUUUS9u  pnX-   nUn	U R                  U5      nU R                  U5      n[	        U[
        5      (       a  Uu  pOUSpX-   nU4nU(       a  X4-  nU(       a  X4-  nU(       a  X4-  nU$ )b  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
        `(batch, sequence_length)` where padding elements are indicated by 0.
    past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_router_logits (`bool`, *optional*):
        Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
        should not be returned during inference.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence.
)rj   r'   r   r   r   r   r   N)r  r  r  r  r-   r.   )r]   rj   r'   r   r   r   r  r   r   residualself_attn_weightspresent_key_value
ff_outputsr%   outputss                  rQ   rm   "JambaAttentionDecoderLayer.forward  s    < !,,];>Bnn')%)/) ?M ?
;*; !0 !--m<&&}5
j%((+5(M=+5t= 0 "++G++G''Grb   )r  r  r  r  NNNFFFNrs   rt   ru   rv   r   r=   rX   r0   r   r   r   r   r   r.   r   rm   rw   rx   ry   s   @rQ   r  r    s    Z{ Zs Z 2637EI,1/4$)59D||D !.D u//0	D
 !!ABD $D>D 'tnD D>D !!1!12D 
u  (51B1BEDUDU1U+V"WW	XD Drb   r  c                   D  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\   S\	\R                     S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )JambaMambaDecoderLayeri  r   r   c                 &  > [         TU ]  5         UR                  U   n[        XS9U l        US:  a  [
        O[        nU" U5      U l        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )N)r   r   r   r  )rW   rX   r  r  r   rw  rj  r  rT   r^   r3  r  r  r  s        rQ   rX   JambaMambaDecoderLayer.__init__  s{    //	:$FH
1<q-h+F3+F,>,>FDWDWX ,V-?-?VEXEX Yrb   rj   r'   r   r   r   r  r   r   r(   c	                 2   Un	U R                  U5      nU R                  UUUS9nSn
X-   nUn	U R                  U5      nU R                  U5      n[	        U[
        5      (       a  Uu  pOUSpX-   nU4nU(       a  X4-  nU(       a  X4-  nU(       a  X4-  nU$ )r  )rj   r:  r'   N)r  r   r  r  r-   r.   )r]   rj   r'   r   r   r   r  r   r   r  r  r  r%   r  s                 rQ   rm   JambaMambaDecoderLayer.forward  s    < !,,];

'') # 

 ! !0 !--m<&&}5
j%((+5(M=+5t= 0 "++G((G''Grb   )r  r  r   r  r  r  ry   s   @rQ   r  r    s    Z{ Zs Z 2637EI,1/4$)59A||A !.A u//0	A
 !!ABA $D>A 'tnA D>A !!1!12A 
u  (51B1BEDUDU1U+V"WW	XA Arb   r  c                   F    \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrSrS rS	rg
)JambaPreTrainedModeli3  r   modelTr  r  r   c                 J   U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR                  R                  R                  SUS9  UR                  b2  UR                  R                  UR                     R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g [        U[        5      (       a  [         R"                  " SUR$                  S-   5      S S S 24   nUR'                  UR(                  S5      R+                  5       nUR,                  R                  R/                  [         R0                  " U5      5        UR2                  R                  R                  S5        g g )Nr   )r7   stdg      ?r   r,   )r   initializer_ranger-   r   r   r$  r[   rD  normal_r   zero_	Embeddingpadding_idxrT   fill_r  r0   r.  r   r:   r   r   r0  rB  r/  r1  )r]   moduler  r8  s       rQ   _init_weights"JambaPreTrainedModel._init_weights?  s}   kk++fryy"))455MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--MM$$S)00Q 5 5 9:47CA1126AACALL##EIIaL1HHMM$	 1rb   r  N)rs   rt   ru   rv   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_is_statefulr  rw   r  rb   rQ   r  r  3  s;    &*#57OP"3NL%rb   r  )	attentionr   c                   L  ^  \ rS rSrSrS\4U 4S jjr\\          SS\	\
R                     S\	\
R                     S\	\
R                     S\	\   S	\	\
R                     S
\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\   S\4S jj5       5       rS rS rSrU =r$ )
JambaModeliV  z
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`JambaDecoderLayer`]

Args:
    config: JambaConfig
r   c                 <  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        / n[        UR                  5       H.  n[        UR                  U      nUR                  U" XS95        M0     [
        R                  " U5      U l        UR                   U l        [#        UR                  UR$                  S9U l        SU l        U R+                  5         g )N)r   r  F)rW   rX   pad_token_idr  
vocab_sizer   r  r^   embed_tokensr   rL   ALL_DECODER_LAYER_TYPESr   r   r}  layersr  rT   r3  final_layernormgradient_checkpointing	post_init)r]   r   decoder_layersr   layer_classr`   s        rQ   rX   JambaModel.__init___  s     !.. ++LL):):F<N<NPTP`P`av//0A1&2J2J12MNK!!+f"BC 1 mmN3$*$?$?!+F,>,>FDWDWX&+#rb   	input_idsr'   r   r   inputs_embedsr   r   output_hidden_statesr  r   r   r(   c                    Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nUnU(       a  Uc  [        R                  S5        U
c,  [        R                  " UR                  S   UR                  S9n
Uc  U
R                  S5      nU R!                  X%U
5      nU R#                  X*5      nU(       a  SOS nU(       a  SOS nU	(       a  SOS nU R$                   Hj  n['        U[(        5      (       a  UOUnU(       a  X4-  nU" UUUUUU	UU
S	9nUS   nU(       a  US   b	  UUS   4-  nU	(       d  MY  US
   c  Ma  UUS
   4-  nMl     U R+                  U5      nU(       a  X4-  nU(       a  UR,                  (       d  SUl        U(       d  S OUn[/        UUUUUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzJamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was provided, so no cache will be returned.r   r   r   r  )r'   r   r   r   r  r   r   r,   T)last_hidden_stater   rj   
attentionsr%   )r   r   r  r  r   r   r  r   r   r   r  r0   r.  r9   r/   r?   _update_causal_mask_update_mamba_maskr  r-   r  r  r   r   )r]   r  r'   r   r   r  r   r   r  r  r   r   rj   r   
mamba_maskall_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputs
next_caches                         rQ   rm   JambaModel.forwardr  sh     2C1N-TXT_T_TqTq$8$D $++JjJj 	 %9$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%0:
 !"\\-*=*=a*@I]I]^N)33A6L..~n],,^L
"6BD0d"6BD![[M'1-AW'X'X^iJ#!%55!)))."3%9#-	M *!,M  #/"}Q'7&99N## $0%-*;)==%9 )< ,,];  !11?#E#E15O.!*T
%+&+%+
 	
rb   c                    U R                   R                  S:X  a  Ub  SU;   a  U$ g UR                  UR                  pT[        R
                  " U5      R                  nUR                  S   nUS   S-   n[        R                  " Xx4XdUS9n	US:w  a  [        R                  " U	SS9n	U	[        R                  " XS9UR                  SS5      :  -  n	U	S S S S 2S S 24   R                  UR                  S   SSS5      n	Ub  U	R                  5       n	UR                  5       S	:X  ac  UR                  S   n
U	S
S U
24   R                  S5      US S 2S S S S 24   R                  S5      -  nU	S
S U
24   R!                  X5      U	S
S U
24'   U R                   R                  S:X  a3  Ub0  UR                  R"                  S;   a  [$        R&                  " X5      n	U	$ )Nr  r   r   r,   )
fill_valuerf   r/   )diagonalr   r   rd   .r  )r  xpunpu)r   r  rf   r/   r0   finfominr9   fulltriur.  r;   r:   r[  r+   eqmasked_fillr   r   _unmask_unattended)r]   r'   input_tensorr   rf   r/   	min_dtyperK   target_lengthr   mask_lengthpadding_masks               rQ   r  JambaModel._update_causal_mask  s   ;;++/BB)c^.C%%$**L,?,?vKK&**	&,,Q/&r*Q.jj/!Aimsta**[1=Ku||MANDZDZ[]_`Daaa!$a"23::<;M;Ma;PRSUWY[\%%++-K!!#q(,2226*3+<=@@EWXZ^`dfgWgHhHkHkloHpp1<S,;,=N1O1[1[\h1tC+-. KK,,6*%%**.DD
 1CCK[Krb   c                 b    UnUS   S:  d!  Ub   [         R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
r   Nr   )r0   all)r]   r'   r   r  s       rQ   r  JambaModel._update_mamba_mask  s:     $
!q ^%?EIIn`aNaDbDbJrb   )r  r  r  r  r  r  r  )
NNNNNNNNNN)rs   rt   ru   rv   r   r   rX   r   r   r   r0   r   r   r   r   r   r   r   r   rm   r  r  rw   rx   ry   s   @rQ   r  r  V  s6   { &  151537FJ59$(,0/3/359e
E,,-e
 !.e
 u//0	e

 ""BCe
   1 12e
 D>e
 $D>e
 'tne
 'tne
 !!1!12e
 +,e
 
 e
  e
N!F	 	rb   r  c                     ^  \ rS rSrS/rS\4U 4S jjrS rS r\	\
            SS\\R                     S\\R                     S	\\R                     S
\\   S\\R                      S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\\R                  4   S\\   S\4S jj5       5       r       SS jrSrU =r$ )JambaForCausalLMi  zlm_head.weightr   c                 J  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        UR                  U l	        UR                  U l
        UR                  U l        U R                  5         g rl  )rW   rX   r  r  r  r   r   r^   lm_headrouter_aux_loss_coefr&   r{  r  rq  s     rQ   rX   JambaForCausalLM.__init__  s}     '
 ++yy!3!3V5F5FUS$*$?$?!!--#)#=#= rb   c                     Xl         g r   r  )r]   decoders     rQ   set_decoderJambaForCausalLM.set_decoder  s    
rb   c                     U R                   $ r   r  rp   s    rQ   get_decoderJambaForCausalLM.get_decoder  s    zzrb   r  r'   r   r   r  labelsr   r   r  r  r   logits_to_keepr   r(   c                    Ub  UOU R                   R                  nU
b  U
OU R                   R                  n
U	b  U	OU R                   R                  n	U R	                  UUUUUUUU	U
US9
nUR
                  n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb  U R                  " UX`R                  40 UD6nSnU
(       aZ  [        UR                  U R                  U R                  U5      nUb+  UU R                   UR#                  UR$                  5      -  -  n['        UUUUR(                  UR*                  UR,                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, JambaForCausalLM

>>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
>>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)
r  r'   r   r   r  r   r   r  r  r   )lossaux_losslogitsr   rj   r  r%   )r   r   r  r  r  r  r-   r=   slicer  loss_functionr  rR   r%   r&   r{  r  r2   r/   r   r   rj   r  )r]   r  r'   r   r   r  r  r   r   r  r  r   r  r   r  rj   slice_indicesr  r	  r
  s                       rQ   rm   JambaForCausalLM.forward  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	
 %9$D $++JjJj 	
 +/**)%+'/!5!5) +5 +
  118B>SV8W8W~ot4]kmA}a,?@A%%ffooPPD/%%  ((	H !11HKK4LLL(#33!//))!//
 	
rb   c	           
         US L n
U
(       d]  Uc  US   UR                   S   :  a  US S 2UR                   S   * S 24   nOaUR                   S   UR                   S   :w  a	  US S 2U4   nO7[        U R                  UR                   S   U R                  U R                  S9nUbZ  UcW  UR                  5       R                  S5      S-
  nUR                  US:H  S5        U
(       d  US S 2UR                   S   * S 24   nUb  U
(       a  SU0nOSUR                  5       0nUR                  UUUUUU R                  R                  US.5        U$ )Nr,   r   r   r   r  r  )r   r   r   r'   r  r  r   )r9   r   r   rf   r/   longcumsummasked_fill_r   r   num_logits_to_keep)r]   r  r   r'   r  r  r   r   r   r   empty_past_kvmodel_inputss               rQ   prepare_inputs_for_generation.JambaForCausalLM.prepare_inputs_for_generation|  se    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0(<"&++"@"@"0
	
 rb   )r  r  r&   r{  r  r  )NNNNNNNNNNNr   )NNNFNNT)rs   rt   ru   rv   _tied_weights_keysr   rX   r  r  r   r   r   r0   r   r   r   r   r   r   r=   r   r   r   rm   r  rw   rx   ry   s   @rQ   r  r    s   *+	{ 	  151537FJ59-1$(,0/3/35934Y
E,,-Y
 !.Y
 u//0	Y

 ""BCY
   1 12Y
 ))*Y
 D>Y
 $D>Y
 'tnY
 'tnY
 !!1!12Y
 c5<</0Y
 +,Y
 
#Y
  Y
| ": :rb   r  c                       \ rS rSrSrg)JambaForSequenceClassificationi  r  N)rs   rt   ru   rv   rw   r  rb   rQ   r  r    s    ^arb   r  )r  r  r  r  )Nrd   N)Rr   r   typingr   r   r   r0   torch.nn.functionalr   r3   r  torch.utils.checkpointactivationsr   cache_utilsr	   r
   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   r   modeling_layersr   r   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.import_utilsr   r   configuration_jambar   r   &mamba_ssm.ops.selective_scan_interfacer   r    +mamba_ssm.ops.triton.selective_state_updater!   causal_conv1dr"   r#   r  r7  
get_loggerrs   r   r   r.   r=   rR   ModulerT   r   r   r   r   r
  r  r  rj  rw  r  r  r  r  r  r  r  __all__r  rb   rQ   <module>r1     s  (   ' '      ! < < ) > h R - & R R T , J XR@P=-~DD-7**.0@BVXfg 
 
		H	% "&
-1	T&uU\\':D@AT&#T& U\\*	T&
 5<<T&pJ299 J*	UU\\ 	U# 	U%,, 	UXou XoxW9RYY W9va9> a9JO1 O1f - HNbii HNXryy "<2")) <2~O!; OdL7 L^ %? % %< )CMcd  p% p phk+_ k\ b%EG[ a grb   