
    <h                        S r SSKrSSKJr  SSKJrJrJr  SSKrSSK	rSSKJ
r
  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJrJrJr  SSKJrJrJr  SSKJ r   \RB                  " \"5      r#\" 5       (       a  SSK$J%r%  OSr%\" 5       (       a  SSK&J'r'J(r(  SSK)J*r*  OSu  r*r(r'\" 5       (       a	  SSK+J,r,J-r-  OSu  r-r, " S S5      r. " S S\
R^                  5      r0 " S S\
R^                  5      r1 " S S\5      r2\ " S S \5      5       r3\\" S!S"9 " S# S$\5      5       5       r4\\" S%S"9 " S& S'\5      5       5       r5\ " S( S)\35      5       r6\" S*S"9 " S+ S,\3\5      5       r7/ S-Qr8g).zPyTorch MAMBA model.    N)	dataclass)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)PretrainedConfig)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNN)causal_conv1d_fncausal_conv1d_update)NNc                      \ rS rSrSrSr\R                  S4S\S\	S\R                  S\\R                  \S4   4S	 jjrS
\	S\R                  S\R                   S\R                  4S jrS
\	S\R                  4S jrS rSrg)
MambaCache;   a@  
Cache for mamba model which does not have attention mechanism and key value states.

Arguments:
    config (`PretrainedConfig):
        The configuration file defining the shape-related attributes required to initialize the static cache.
    max_batch_size (`int`):
        The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a smaller batch size is used.
    dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
        The default `dtype` to use when initializing the layer.
    device (`torch.device` or `str`, *optional*):
        The device on which the cache should be initialized. Should be the same as the layer.

Example:

    ```python
    >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

    >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
    >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

    >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

    >>> # Prepare a cache class and pass it to model's forward
    >>> past_key_values = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
    >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True)
    >>> outputs.past_key_values
    MambaCache()
    ```
TNconfigmax_batch_sizedtypedevicec           	         X l         X0l        UR                  U l        UR                  U l        UR
                  U l        / U l        / U l        Ub  [        R                  " U5      OS n[        UR                  5       H  n[        R                  " U R                   U R                  U R                  UU R                  S9n[        R                  " U R                   U R                  U R                  UU R                  S9n[        R                  R                  U5        [        R                  R                  U5        U R                  R!                  U5        U R                  R!                  U5        M     g )Nr$   r#   )r"   _dtypeintermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statestorchr$   rangenum_hidden_layerszeros_dynamomark_static_addressappend)selfr!   r"   r#   r$   _
conv_state	ssm_states           `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/mamba/modeling_mamba.py__init__MambaCache.__init__^   s&    -!'!9!9$// & 2 2/1.0)/);f%v//0A',{{##&&%%kk(J ',kk##&&##kk'I MM--j9MM--i8##J/OO""9-' 1    	layer_idxnew_conv_statecache_positionreturnc                    U R                   U   R                  UR                  :w  a5  U R                   U   R                  UR                  5      U R                   U'   U R                   U   nUR                  SU R                  S-
  5      nUR                  SSS9nUR                  UR                  UR                  S9US S 2S S 2U4'   U R                   U   R                  5         U R                   U==   U-  ss'   U R                   U   $ )Nr   r   )shiftsdimsr&   )r-   r$   toclampr,   rollr#   zero_)r6   r>   r?   r@   r8   s        r:   update_conv_stateMambaCache.update_conv_state   s    
 I&--1F1FF*.*:*:9*E*H*HI^I^*_DY'%%i0
'--a1F1F1JK__BR_8
+9+<+<JDUDU]g]m]m+<+n
1a'(#))+#z1#	**r=   new_ssm_statec                     U R                   U   R                  5         U R                   U==   UR                  U R                   U   R                  5      -  ss'   U R                   U   $ N)r.   rI   rF   r$   )r6   r>   rL   s      r:   update_ssm_stateMambaCache.update_ssm_state   sT    	"((*	"m&6&6ty7Q7X7X&YY"y))r=   c                     [        [        U R                  5      5       H=  nU R                  U   R                  5         U R                  U   R                  5         M?     g rN   )r0   lenr-   rI   r.   )r6   r>   s     r:   resetMambaCache.reset   sH    s4#3#345IY'--/OOI&,,. 6r=   )r'   r,   r-   r(   r"   r*   r.   )__name__
__module____qualname____firstlineno____doc__is_compileabler/   float16r   intr#   r   r$   strr;   Tensor
LongTensorrJ   rO   rS   __static_attributes__ r=   r:   r   r   ;   s    > N #]]15#. #. #. {{	#.
 ellC-.#.J++.3ll+LQL\L\+	+"*# *ell *
/r=   r   c            
       ^  ^  \ rS rSrSrS\S\4U 4S jjrS r   SS\	R                  S\\   S	\\	R                     S
\\	R                     4S jjrSS\\   S	\\	R                     S
\\	R                     4S jjr   SS\\   S	\\	R                     S
\\	R                     4S jjrSrU =r$ )
MambaMixer   uo  
Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
and is why Mamba is called **selective** state spaces)
r!   r>   c           	        > [         TU ]  5         Xl        UR                  U l        UR                  U l        UR                  U l        UR                  U l        [        UR                  5      U l
        X l        UR                  U l        [        R                  " U R                  U R                  UR                  UR                  U R                  UR                  S-
  S9U l        UR                   U l        [$        UR                      U l        UR(                  U l        [        R*                  " U R                  U R                  S-  UR,                  S9U l        [        R*                  " U R                  U R                  U R
                  S-  -   SS9U l        [        R*                  " U R                  U R                  SS9U l        [4        R6                  " SU R
                  S-   [4        R8                  S9S S S 24   nUR;                  U R                  S5      R=                  5       n[        R>                  " [4        R@                  " U5      5      U l!        [        R>                  " [4        RD                  " U R                  5      5      U l#        [        R*                  " U R                  U R                  UR,                  S9U l$        UR,                  U l        U RK                  5         g )	Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   rh   FTr#   rC   )&superr;   r!   hidden_sizer)   r*   r+   r,   r(   r\   time_step_rankr>   use_conv_biasr   Conv1dconv1d
hidden_act
activationr
   actuse_mambapyLinearuse_biasin_projx_projdt_projr/   arangefloat32expand
contiguous	ParameterlogA_logonesDout_projwarn_slow_implementation)r6   r!   r>   A	__class__s       r:   r;   MambaMixer.__init__   s-   !--$// & 2 2!'!9!9!&"7"78"#11ii..//%%**))&&*
 !++&++,!-- yy!1!143I3IA3MTZTcTcdii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!35==I$PQ'RHHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQWQ`Q`a%%'r=   c                    [        [        [        [        [        [
        45      nU(       dW  U R                  (       a0  [        5       (       a  [        R                  S5        g [        S5      e[        R                  S5        g g )Na7  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation and https://github.com/Dao-AILab/causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)allr   r   r   r   r   rx   r   loggerwarning_onceImportError)r6   is_fast_path_availables     r:   r   #MambaMixer.warn_slow_implementation   sz    !$#%68HJ^`no"
 &'))''F & Z  ##J &r=   hidden_statescache_paramsr@   attention_maskc                 	   U R                  U5      R                  SS5      nU R                  (       Ga.  UGc*  [        UU R                  R
                  U R                  (       a  U R                  R                  OS U R                  R
                  U R                  R
                  U R                  R
                  U R                  (       a$  U R                  R                  R                  5       OS [        R                  " U R                  R                  5       5      * S S U R                   R                  5       U R                  R                  R                  5       SS9nU$ UR#                  SSS9u  pUb  XR%                  S5      -  nU R                  R
                  R'                  U R                  R
                  R)                  S5      U R                  R
                  R)                  S5      5      nUbn  US   S:  ae  [+        UR-                  S5      UR.                  U R0                     UU R                  R                  U R2                  5      nUR%                  S5      nOUbW  [4        R6                  R9                  XR:                  UR<                  S   -
  S45      n	UR?                  U R0                  X5        [A        XU R                  R                  U R2                  S9nUb  XR%                  S5      -  nU R                  UR                  SS5      5      n
[        RB                  " XRD                  U RF                  U RF                  /SS9u  pnU R                  R
                  UR                  SS5      -  n[        R                  " U R                  R                  5       5      * n[I        U R                  S	5      (       a$  U R                  R                  R                  5       OS nUbc  US   S:  aZ  [K        URL                  U R0                     US
   US
   UUS S 2S4   US S 2S4   U R                   US
   USS9
R%                  S5      nOo[O        UUUUR                  SS5      UR                  SS5      U R                   R                  5       UUSSS9
u  nnUb  Ub  URQ                  U R0                  U5        U R                  UR                  SS5      5      nU$ )Nr   rl   T)
delta_biasdelta_softplusdimr   rC   )rv   rh   ).r   )dt_softplus)r   return_last_state))r{   	transposetrainingr   rt   weightrr   rh   r|   r}   r   rz   floatr/   expr   r   chunk	unsqueezeviewsizer   squeezer-   r>   rv   r   
functionalpadr,   shaperJ   r   splitrq   r*   hasattrr   r.   r   rO   )r6   r   r   r@   r   projected_statescontextualized_statesgateconv_weightsr-   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsr9   s                      r:   cuda_kernels_forwardMambaMixer.cuda_kernels_forward   s+     <<6@@AF===\1$2 ""$($6$6  D""##$$.2mm""((*4::++-..<<,,224#%!p %$O #3"8"8"8"BM) -0H0H0K K  ;;--224;;3E3E3J3J13Mt{{OaOaOfOfghOijL'N1,=,A 4!))"- ,,T^^< KK$$OO! !. 7 7 ;+"$--"3"3%(=(=@S@STV@W(WYZ'[#K !224>>;_ 0!1A1Adoo! ) -0H0H0K K "[[)@)@A)FGN#kk!4!4d6I6I4K^K^ _egOI! "&!4!4y7J7J1a7P!P4::++-..A:A$,,PV:W:WT\\..446]aN'N1,=,A5 ++DNN;!&)&v.adGadGFFL" $  )B-  +<!&KK1%KK1%FFLLN"#'&*+'i (\-E 11$..)L %)MM,2H2HA2N$O!$$r=   c           	      ^   UR                   u  pVnUR                  nU R                  U5      R                  SS5      n	U	R	                  SSS9u  pUb  XR                  S5      -  n
UGb  UR                  U R                     R                  5       nUR                  U
R                  5      nUR                   S   U R                  :X  a  [        R                  R                  U
U R                  U
R                   S   -
  S45      nUR                  U R                  X5        U R!                  U R#                  U
5      SS U24   5      n
GO6UR                  U R                  X5      nUR                  U R"                  R$                  R                  5      n[&        R(                  " XR"                  R$                  S S 2SS S 24   -  SS9n
U R*                  (       a  XR"                  R,                  -  n
U R!                  U
5      R                  U5      R                  S5      n
O][&        R.                  " XPR0                  U R2                  4U
R                  US9nU R!                  U R#                  U
5      SS U24   5      n
Ub  XR                  S5      -  n
U R5                  U
R                  SS5      5      n[&        R6                  " XR8                  U R2                  U R2                  /SS9u  nnnU R;                  U5      n[        R                  R=                  U5      R                  SS5      n[&        R>                  " U R@                  RC                  5       5      * n[&        R>                  " US S S 2S S S 24   US S 2S S 2S S 2S 4   -  5      nUS S 2S S 2S S 2S 4   US S 2S S S 2S S 24   RC                  5       -  nUU
S S 2S S 2S S 2S 4   RC                  5       -  nU RD                  (       a  U RF                  (       a  Uc  [I        UR                  SS5      UR                  SS5      5      nUUR                  S5      -  RK                  S5      R                  SS5      nUXRL                  S S S 2S 4   -  -   nUU R!                  U5      -  nO/ n[O        U5       H  nUS S 2S S 2US S 24   U-  US S 2S S 2US S 24   -   n[&        RP                  " UR                  U5      US S 2US S 24   R                  S5      5      nURS                  US S 2S S 2S4   5        M     [&        RT                  " USS9nUXRL                  S S S 2S 4   -  -   nUU R!                  U5      -  nUb(  UR                  U R                     RW                  U5        U RY                  UR                  SS5      5      nU$ )	Nr   rl   r   r   rC   .r&   r	   )-r   r#   r{   r   r   r   r.   r>   clonerF   r$   r,   r   r   r   rJ   rw   rt   r   r/   sumrr   rh   r2   r(   r*   r|   r   rq   r}   softplusr   r   r   rx   r   r   r   r   r0   matmulr5   stackcopy_r   )r6   input_statesr   r@   r   
batch_sizeseq_lenr7   r#   r   r   r   r9   r8   r   r   r   r   r   r   
discrete_A
discrete_BdeltaB_uhsscan_outputr   ir   s                               r:   slow_forwardMambaMixer.slow_forwardP  s   !-!3!3
Q""<<5??1E.44QA4>%),D,DQ,GGM #$//?EEGI!]%9%9:I ##A&$*?*??]]..!**]-@-@-DDaH

 ..t~~zZ $])CC'M)R S);;DNNMj
']]4;;+=+=+D+DE
 %		*{{7I7I!QPQ'7R*RXZ [%%![[%5%55M $ 7 : :5 A K KB O33T5H5HI$++5I !HHT[[%?XgX%NOM%),D,DQ,GGM ]%<%<Q%BC++00$2E2EtGZGZ[ac
	1a "\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DD ,2Fz++Aq183E3Ea3KLBB/88;EEaKK%tQ}8M(MMK%6KL7^&q!Qz2Y>!QPQST*AUU	#ll9<<+>!Q'
@T@TUW@XY##K1a$89 $  ++l;K%a9N)NOK&$7K'''7==iH !%k.C.CAq.I J$$r=   c                 B   [        [        [        [        [        [
        45      nU(       ac  SU R                  R                  R                  R                  ;   a5  [        R                  R                  5       (       d  U R                  XX45      $ U R                  XX45      $ )Ncuda)r   r   r   r   r   r   r|   r   r$   typer/   r3   is_compilingr   r   )r6   r   r   r@   r   r   s         r:   forwardMambaMixer.forward  s~     "%#%68HJ^`no"
 "f0B0B0I0I0N0N&NW\WdWdWqWqWsWs,,].ii  n]]r=   )r   r   rw   rv   r!   rt   r,   r}   rp   r{   r(   r>   r   r*   rq   rz   rr   rx   r|   r   )rU   rV   rW   rX   rY   r   r\   r;   r   r/   r^   r   r   r_   r   r   r   r`   __classcell__r   s   @r:   rc   rc      s!   )({ )(s )(V4 .25959c%||c% z*c% !!1!12	c%
 !!1!12c%LO%x
7K O%aijojzjza{ O%  S[  \a  \l  \l  Sm O%j .25959^ z*^ !!1!12	^
 !!1!12^ ^r=   rc   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )MambaRMSNormi  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z<
MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
N)ro   r;   r   r   r/   r   r   variance_epsilon)r6   rp   epsr   s      r:   r;   MambaRMSNorm.__init__  s/     	ll5::k#:; #r=   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nrl   rC   T)keepdim)	r#   rF   r/   r   powmeanrsqrtr   r   )r6   r   input_dtypevariances       r:   r   MambaRMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r=   c                 R    U R                   R                  S    SU R                   3$ )Nr   z, eps=)r   r   r   r6   s    r:   
extra_reprMambaRMSNorm.extra_repr  s*    ++##A&'vd.C.C-DEEr=   )r   r   )gư>)	rU   rV   rW   rX   r;   r   r   r`   r   r   s   @r:   r   r     s    $;F Fr=   r   c                      ^  \ rS rSrU 4S jr   SS\\   S\\R                     S\\R                     4S jjr	Sr
U =r$ )	
MambaBlocki  c                    > [         TU ]  5         Xl        X l        UR                  U l        [        UR                  UR                  S9U l        [        XS9U l
        g )Nr   r>   )ro   r;   r!   r>   residual_in_fp32r   rp   layer_norm_epsilonnormrc   mixer)r6   r!   r>   r   s      r:   r;   MambaBlock.__init__  sL    " & 7 7 !3!39R9RS	<
r=   r   r@   r   c                 
   UnU R                  UR                  U R                   R                  R                  S95      nU R                  (       a  UR                  [
        R                  5      nU R                  XX4S9nXQ-   nU$ )Nrn   r   r@   r   )r   rF   r   r#   r   r/   r   r   )r6   r   r   r@   r   residuals         r:   r   MambaBlock.forward  sx     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^ # 
 !0r=   )r!   r>   r   r   r   r   )rU   rV   rW   rX   r;   r   r   r/   r_   r   r`   r   r   s   @r:   r   r     sV    = .25959 z* !!1!12	
 !!1!12 r=   r   c                   :    \ rS rSr% \\S'   SrSS/rSrSr	S r
Srg	)
MambaPreTrainedModeli  r!   backboner   rc   Tc                 r   U R                   R                  n[        U[        5      (       GaP  [        R
                  " SUR                  S-   [        R                  S9SSS24   nUR                  UR                  S5      R                  5       nUR                  R                  [        R                  " U5      5        SUR                  l        SUR                  l        UR                  R                   R#                  S5        U R                   R$                  S-  U R                   R&                  -  nU R                   R(                  S:X  a5  [*        R,                  R/                  UR0                  R2                  U5        OPU R                   R(                  S	:X  a6  [*        R,                  R5                  UR0                  R2                  U* U5        [        R6                  " [        R8                  " U R                   R                  5      [:        R                  " U R                   R<                  5      [:        R                  " U R                   R>                  5      -
  -  [:        R                  " U R                   R>                  5      -   5      RA                  U R                   RB                  S
9nU[        R                  " [        RD                  " U* 5      * 5      -   nUR0                  RF                  R                  U5        SUR0                  RF                  l$        [*        R,                  RK                  URL                  R2                  [:        RN                  " S5      S9  URL                  RF                  bY  [Q        URL                  RF                  SS5      (       d3  [*        R,                  RS                  URL                  RF                  5        [*        R,                  RK                  URT                  R2                  [:        RN                  " S5      S9  U R                   RV                  (       aC  URT                  R2                  nU[:        RN                  " U R                   RX                  5      -  n[        U[*        RZ                  5      (       a  [Q        UR2                  SS5      (       d(  [*        R,                  R]                  UR2                  US9  URF                  bG  [Q        URF                  SS5      (       d*  [*        R,                  RS                  URF                  5        ggg[        U[^        5      (       a&  UR2                  R                   R#                  S5        g[        U[*        R`                  5      (       a)  [*        R,                  R]                  UR2                  US9  gg)zInitialize the weights.r   rn   NrC   Tg      ?g      constantrandom)min   )a
_no_reinitF)std)1r!   initializer_range
isinstancerc   r/   r~   r*   r   r   r(   r   r   r   r   _no_weight_decayr   datafill_rq   time_step_scaletime_step_init_schemer   init	constant_r}   r   uniform_r   randmathtime_step_maxtime_step_minrG   time_step_floorexpm1rh   r   kaiming_uniform_rt   sqrtgetattrzeros_r   rescale_prenorm_residualr1   ry   normal_r   	Embedding)r6   moduler   r   dt_init_stddtinv_dtps           r:   _init_weights"MambaPreTrainedModel._init_weights  s   kk++fj)) Q 5 5 9OPTVWPWXA1126AACALLuyy|,,0FLL)(,FHH%HHMM$++44d:T[[=X=XXK{{00J>!!&.."7"7E22h>  !6!6kR

4;;88988DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN%%f--1FNN*GG$$V]]%9%9TYYq\$J}}!!-v}}11<GGGGNN6==#5#56GG$$V__%;%;tyy|$L{{33 OO**TYYt{{<<==fbii((6==,>>37{{&v{{L%@@GGNN6;;/ A ' --MM$$S)--GGOOFMMsO3 .r=   ra   N)rU   rV   rW   rX   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr  r`   ra   r=   r:   r   r     s)    "%|4&*#L:4r=   r   z,
    Class for the MAMBA model outputs.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\   \	S'   Sr\\\R                        \	S'   Srg)MambaOutputi&  a%  
cache_params (`MambaCache`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.

    Includes both the State space model state matrices after the selective scan, and the Convolutional states
Nlast_hidden_stater   r   ra   )rU   rV   rW   rX   rY   r$  r   r/   FloatTensorr  r   r   r   tupler`   ra   r=   r:   r#  r#  &  sH     6:x 1 129)-L(:&-8<M8E%"3"345<r=   r#  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\   \	S'   Sr\\\R                        \	S'   Srg)	MambaCausalLMOutputi:  az  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
cache_params (`MambaCache`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.

    Includes both the State space model state matrices after the selective scan, and the Convolutional states
Nlosslogitsr   r   ra   )rU   rV   rW   rX   rY   r)  r   r/   r%  r  r*  r   r   r   r&  r`   ra   r=   r:   r(  r(  :  s\    
 )-D(5$$
%,*.FHU&&'.)-L(:&-8<M8E%"3"345<r=   r(  c                     ^  \ rS rSrU 4S jrS rS rS r\        SS\	\
R                     S\	\
R                     S\	\   S	\	\   S
\	\   S\	\   S\	\
R                     S\	\
R                     S\\\4   4S jj5       rSrU =r$ )
MambaModeliS  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ sH  n[        XS9PM     sn5      U l        SU l        [        UR
                  UR                  S9U l        U R!                  U R"                  5        U R%                  5         g s  snf )Nr   Fr   )ro   r;   r   r  
vocab_sizerp   
embeddings
ModuleListr0   r1   r   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)r6   r!   idxr   s      r:   r;   MambaModel.__init__U  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$rRq3Z%FRq$rs&+#"6#5#56;T;TU//? %ss   (Cc                 l    U H.  nSU;   d  M  UR                  U5      XR                  SS5      '     g    g )Nz
embedding.zembeddings.)popreplace)r6   
state_dictprefixargsks        r:   r5  MambaModel.load_hooka  s4    Aq EO^^TUEV
99\=AB r=   c                     U R                   $ rN   r/  r   s    r:   get_input_embeddingsMambaModel.get_input_embeddingsg  s    r=   c                     Xl         g rN   rB  r6   new_embeddingss     r:   set_input_embeddingsMambaModel.set_input_embeddingsj  s    (r=   	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictr@   r   rA   c	                    Ub  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R                  nUSL USL-  (       a  [        S5      eUc  U R                  U5      nU R                  (       a  U R                  (       a	  U(       a  SnU(       a  Ucn  [        U R                   UR                  S5      UR                  UR                  S9n[        R                  " SU R                   R                  UR                  S9nOUc  [        S5      eOSnUn	U(       a  SOSn
U R                   H  nU" U	UUUS	9n	U(       d  M  X4-   n
M     U R!                  U	5      n	U(       a  X4-   n
U(       d  [#        S
 XU
4 5       5      $ [%        U	U(       a  UU
S9$ SU
S9$ )ay  
cache_params (`MambaCache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
NFz:You must specify exactly one of input_ids or inputs_embedsr   r&   r$   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyra   r   c              3   ,   #    U H  oc  M  Uv   M     g 7frN   ra   ).0vs     r:   	<genexpr>%MambaModel.forward.<locals>.<genexpr>  s     f$Tq$Ts   	)r$  r   r   )r!   rM  r   rL  use_return_dict
ValueErrorr/  r2  r   r   r$   r#   r/   r~   r+   r1  r3  r&  r#  )r6   rJ  rK  r   rL  rM  rN  r@   r   r   all_hidden_statesmixer_blocks               r:   r   MambaModel.forwardm  s   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#)KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !; 	 (  L%"6BD;;K')--	M $#$58H$H! ' M2 14D Df]BS$Tfff+)2+
 	
8<+
 	
r=   )r/  r2  r1  r3  )NNNNNNNN)rU   rV   rW   rX   r;   r5  rC  rH  r   r   r/   r_   r   boolr   r&  r#  r   r`   r   r   s   @r:   r,  r,  S  s    
)  1548-1$(/3&*5959L
E,,-L
   0 01L
 z*	L

 D>L
 'tnL
 d^L
 !!1!12L
 !!1!12L
 
uk!	"L
 L
r=   r,  z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                     ^  \ rS rSrS/rU 4S jrS rS r SS\S\	\
\4   S\S	\	\
\4   4S
 jjr     SS\\   S\\R"                     S\\R"                     4S jjr\         SS\\R"                     S\\R"                     S\\R(                     S\\   S\\R"                     S\\   S\\   S\\   S\\R,                     S	\\\4   4S jj5       rSrU =r$ )MambaForCausalLMi  zlm_head.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFrm   )
ro   r;   r,  r   r   ry   rp   r.  lm_headr6  )r6   r!   r   s     r:   r;   MambaForCausalLM.__init__  sF     "6*yy!3!3V5F5FUSr=   c                 6    U R                   R                  5       $ rN   )r   rC  r   s    r:   rC  %MambaForCausalLM.get_input_embeddings  s    }}1133r=   c                 8    U R                   R                  U5      $ rN   )r   rH  rF  s     r:   rH  %MambaForCausalLM.set_input_embeddings  s    }}11.AAr=   outputsmodel_kwargsnum_new_tokensrA   c                    UR                  SS 5      US'   UR                  SS5      (       a  SU;   a  US   b  US   SS  U-   US'   SU;   a<  US   n[        R                  " XUR                  UR                  S   S45      /SS	9US'   U$ )
Nr   rL  Tr@   rC   r   r   r   r   )getr/   catnew_onesr   )r6   re  rf  rg  kwargsr   s         r:   #_update_model_kwargs_for_generation4MambaForCausalLM._update_model_kwargs_for_generation  s     (/{{>4'H^$[$// L0-.:-9:J-KBC-PSa-aL)*|+)*:;N-2YY!8!8.:N:Nq:QST9U!VW]_.L)* r=   r   r@   r   c                 2   SUR                  5       0nU(       a  Uc  [        R                  " SU R                  R                  R
                  UR                  S9nUb  SU0nUR                  S5      n	OUR                  S5      n	[        U R                  R                  XR                  U R                  S9nU(       a4  US   S:  a+  US S 2S4   R                  S5      R                  5       US'   S nU(       d  Ub  SU0nUR                  UUUUS.5        U$ )NrJ  r   rP  rK  r&   rC   )r   rL  r@   r   )r   r/   r~   r   r!   r+   r$   r   r   r#   r   update)
r6   rJ  rK  rL  r   r@   r   rl  model_inputsr"   s
             r:   prepare_inputs_for_generation.MambaForCausalLM.prepare_inputs_for_generation  s    $Y%9%9%;<-
 #\\!T]]-A-A-M-MV_VfVfgN( /?!.!3!3A!6!*!2%dmm&:&:NS^S^fjfpfpqL*Q.(1!R%(8(B(B2(F(Q(Q(SL%!N]6+];L ,&"0"0		
 r=   rJ  rK  labelsrM  rN  rL  c
                    Ub  UOU R                   R                  nU R                  UUUUUUU	US9nUS   nU R                  UR	                  U R                  R
                  R                  5      5      R                  5       nSnUb  UR	                  UR                  5      nUSSS2SS24   R                  5       nUSSS24   R                  5       n[        5       nU" UR                  SUR                  S5      5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
cache_params (`MambaCache`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
N)r   rK  rM  rN  rL  r@   r   r   .rC   r   )r)  r*  r   r   )r!   rV  r   r_  rF   r   r#   r   r$   r   r   r   r   r(  r   r   )r6   rJ  r   rK  r   rt  rM  rN  rL  r@   rl  mamba_outputsr   r*  r)  shift_logitsshift_labelsloss_fctoutputs                      r:   r   MambaForCausalLM.forward  s_   2 &1%<k$++B]B]%'!5#)) & 	
 &a(m..t||/B/B/H/HIJPPRYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
r=   )r   r_  )r   )NNNNN)	NNNNNNNNN)rU   rV   rW   rX   _tied_weights_keysr;   rC  rH  r   dictr]   r   r\   rm  r   r   r/   r_   rr  r   r%  r[  r^   r   r&  r(  r   r`   r   r   s   @r:   r]  r]    s    ++4B YZ"26sCx.RU	c3h, -15959(
 z*( !!1!12( !!1!12(T  155959-1-1/3&*$(15<
E,,-<
 !!1!12<
   1 12	<

 z*<
 ))*<
 'tn<
 d^<
 D><
 !.<
 
u))	*<
 <
r=   r]  )r]  r,  r   r   )9rY   r	  dataclassesr   typingr   r   r   r/   torch.utils.checkpointr   torch.nnr   activationsr
   configuration_utilsr   
generationr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   configuration_mambar   
get_loggerrU   r   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   causal_conv1dr   r   r   Modulerc   r   r   r   r#  r(  r,  r]  __all__ra   r=   r:   <module>r     s     ! ' '    % ! 3 ) 9 - 
 k j , 
		H	%#EXR@P=-~DD-7**b/ b/JN^ N^bF299 F(+ 8 A4? A4 A4H 
=+ = = 
=+ = =& f
% f
 f
R J
+_ J
J
Z Sr=   