
    PhÜ                        d Z ddlZddlmZ ddlmZmZmZ ddlZddlm	Z	 ddl
mZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZ ddlmZmZmZmZ ddlm Z   ejB                  e"      Z# e       rddl$m%Z% ndZ% e       rddl&m'Z'm(Z( ddl)m*Z* nd\  Z*Z(Z'da+d Z, G d d      Z- G d de	j\                        Z/ G d de	j\                        Z0 G d de      Z1e G d de             Z2e ed !       G d" d#e                    Z3e ed$!       G d% d&e                    Z4e G d' d(e2             Z5 ed)!       G d* d+e2e             Z6g d,Z7y)-zPyTorch MAMBA model.    N)	dataclass)AnyOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)PretrainedConfig)GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringlogging)is_causal_conv1d_availableis_kernels_availableis_mamba_ssm_availableis_mambapy_available   )MambaConfig)pscan)mamba_inner_fnselective_scan_fn)selective_state_updateNNNc                      t         t         S t               r,ddlm}   | d      }|j                  |j
                  fa t         S t               rddlm}m} ||fa t         S da t         S )Nr   )
get_kernelzkernels-community/causal-conv1d)causal_conv1d_fncausal_conv1d_update)NN)_causal_conv1d_cacher   kernelsr   r    r   r   causal_conv1d)r   _causal_conv1d_kernelr   r    s       b/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/models/mamba/modeling_mamba.py_lazy_load_causal_conv1dr&   <   sp    '##& *+L M 5 J JLaLrLrs   
$	%H 46FG    ,    c                       e Zd ZdZdZej                  dfdededej                  de
ej                  edf   fdZd	ed
ej                  dej                  dej                  fdZd	edej                  fdZd Zy)
MambaCachea.  
    Cache for mamba model which does not have attention mechanism and key value states.

    Arguments:
        config (`PretrainedConfig):
            The configuration file defining the shape-related attributes required to initialize the static cache.
        max_batch_size (`int`):
            The maximum batch size with which the model will be used. Note that a new instance must be instantiated if
            a smaller batch size is used.
        dtype (`torch.dtype`, *optional*, defaults to `torch.float16`):
            The default `dtype` to use when initializing the layer.
        device (`torch.device` or `str`, *optional*):
            The device on which the cache should be initialized. Should be the same as the layer.

    Example:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache

        >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
        >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")

        >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt")

        >>> # Prepare a cache class and pass it to model's forward
        >>> cache_params = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype)
        >>> cache_position = torch.arange(len(inputs["input_ids"][0]), device=model.device)  # sequence length
        >>> outputs = model(**inputs, cache_params=cache_params, cache_position=cache_position, use_cache=True)
        >>> outputs.cache_params
        ```
    TNconfigmax_batch_sizedtypedevicec                    || _         || _        |j                  | _        |j                  | _        |j
                  | _        g | _        g | _        |t        j                  |      nd }t        |j                        D ]  }t        j                  | j                   | j                  | j                  || j                        }t        j                  | j                   | j                  | j                  || j                        }t        j                  j                  |       t        j                  j                  |       | j                  j!                  |       | j                  j!                  |        y )Nr-   r,   )r+   _dtypeintermediate_size
state_sizessm_state_sizeconv_kernelconv_kernel_sizeconv_states
ssm_statestorchr-   rangenum_hidden_layerszeros_dynamomark_static_addressappend)selfr*   r+   r,   r-   _
conv_state	ssm_states           r%   __init__zMambaCache.__init__t   s*    -!'!9!9$// & 2 2/1.0)/);f%v//0A',{{##&&%%kk(J ',kk##&&##kk'I MM--j9MM--i8##J/OO""9-' 1r'   	layer_idxnew_conv_statecache_positionreturnc                 "   | j                   |   j                  |j                  k7  r5| j                   |   j                  |j                        | j                   |<   | j                   |   }|j                  d| j                  dz
        }|j                  dd      }|j                  |j                  |j                        |d d d d |f<   | j                   |   j                          | j                   |xx   |z  cc<   | j                   |   S )Nr   r   )shiftsdimsr/   )r6   r-   toclampr5   rollr,   zero_)r?   rD   rE   rF   rA   s        r%   update_conv_statezMambaCache.update_conv_state   s    
 I&--1F1FF*.*:*:9*E*H*HI^I^*_DY'%%i0
'--a1F1F1JK__BR_8
+9+<+<JDUDU]g]m]m+<+n
1a'(#))+#z1#	**r'   new_ssm_statec                     | j                   |   j                          | j                   |xx   |j                  | j                   |   j                        z  cc<   | j                   |   S N)r7   rO   rL   r-   )r?   rD   rQ   s      r%   update_ssm_statezMambaCache.update_ssm_state   sT    	"((*	"m&6&6ty7Q7X7X&YY"y))r'   c                     t        t        | j                              D ]<  }| j                  |   j                          | j                  |   j                          > y rS   )r9   lenr6   rO   r7   )r?   rD   s     r%   resetzMambaCache.reset   sH    s4#3#345IY'--/OOI&,,. 6r'   )__name__
__module____qualname____doc__is_compileabler8   float16r   intr,   r   r-   strrC   Tensor
LongTensorrP   rT   rW    r'   r%   r)   r)   O   s    B N #]]15#. #. #. {{	#.
 ellC-.#.J++.3ll+LQL\L\+	+"*# *ell *
/r'   r)   c            
       F    e Zd ZdZdedef fdZd Z	 	 	 ddej                  de
e   de
ej                     d	e
ej                     fd
Zdde
e   de
ej                     d	e
ej                     fdZ	 	 	 dde
e   de
ej                     d	e
ej                     fdZ xZS )
MambaMixeru  
    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
    and is why Mamba is called **selective** state spaces)
    r*   rD   c           	         t         |           || _        |j                  | _        |j                  | _        |j                  | _        |j                  | _        t        |j                        | _
        || _        |j                  | _        t        j                  | j                  | j                  |j                  |j                  | j                  |j                  dz
        | _        |j                   | _        t$        |j                      | _        |j(                  | _        t        j*                  | j                  | j                  dz  |j,                        | _        t        j*                  | j                  | j                  | j
                  dz  z   d      | _        t        j*                  | j                  | j                  d      | _        t5        j6                  d| j
                  dz   t4        j8                        d d d f   }|j;                  | j                  d      j=                         }t        j>                  t5        j@                  |            | _!        t        j>                  t5        jD                  | j                              | _#        t        j*                  | j                  | j                  |j,                        | _$        |j,                  | _        | jK                          y )	Nr   )in_channelsout_channelsbiaskernel_sizegroupspadding   rh   FTr,   rI   )&superrC   r*   hidden_sizer2   r3   r4   r5   r1   r^   time_step_rankrD   use_conv_biasr   Conv1dconv1d
hidden_act
activationr
   actuse_mambapyLinearuse_biasin_projx_projdt_projr8   arangefloat32expand
contiguous	ParameterlogA_logonesDout_projwarn_slow_implementation)r?   r*   rD   A	__class__s       r%   rC   zMambaMixer.__init__   s1   !--$// & 2 2!'!9!9!&"7"78"#11ii..//%%**))&&*
 !++&++,!-- yy!1!143I3IA3MTZTcTcdii 6 68K8KdNaNadeNe8elqryy!4!4d6L6LSWX LLD//!35==I$PQ'RHHT++R0;;=\\%))A,/
ejj)?)?@A		$"8"8$:J:JQWQ`Q`a%%'r'   c                     t               \  }}t        t        t        ||t        f      }|sM| j
                  r+t               rt        j                  d       y t        d      t        j                  d       y y )Na  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the mamba.py backend. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1dzuse_mambapy is set to True but the mambapy package is not installed. To install it follow https://github.com/alxndrTL/mamba.py.a  The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)` is None. Falling back to the sequential implementation of Mamba, as use_mambapy is set to False. To install follow https://github.com/state-spaces/mamba/#installation for mamba-ssm and install the kernels library using `pip install kernels` or https://github.com/Dao-AILab/causal-conv1d for causal-conv1d. For the mamba.py backend, follow https://github.com/alxndrTL/mamba.py.)
r&   allr   r   r   rx   r   loggerwarning_onceImportError)r?   r    r   is_fast_path_availables       r%   r   z#MambaMixer.warn_slow_implementation   s    1I1K..!$#%68HJ^`no"
 &')''S & Z  ##W &r'   hidden_statescache_paramsrF   attention_maskc                 	   | j                  |      j                  dd      }| j                  r%|"t        || j                  j
                  | j                  r| j                  j                  nd | j                  j
                  | j                  j
                  | j                  j
                  | j                  r$| j                  j                  j                         nd t        j                  | j                  j                                d d | j                   j                         | j                  j                  j                         d      }|S t#               \  }}|j%                  dd      \  }}	|||j'                  d      z  }| j                  j
                  j)                  | j                  j
                  j+                  d      | j                  j
                  j+                  d            }
|j|d   dkD  rb ||j-                  d      |j.                  | j0                     |
| j                  j                  | j2                        }|j'                  d      }n|Yt4        j6                  j9                  || j:                  |j<                  d   z
  df      }|j?                  | j0                  ||        |||
| j                  j                  | j2                        }|||j'                  d      z  }| j                  |j                  dd            }t        j@                  || jB                  | jD                  | jD                  gd      \  }}}| j                  j
                  |j                  dd      z  }t        j                  | j                  j                                }tG        | j                  d	      r$| j                  j                  j                         nd }|e|d   dkD  r]tI        |jJ                  | j0                     |d
   |d
   ||d d df   |d d df   | j                   |	d
   |d
      j'                  d      }nptM        ||||j                  dd      |j                  dd      | j                   j                         |	|dd
      \  }}|||jO                  | j0                  |       | j                  |j                  dd            }|S )Nr   rl   T)
delta_biasdelta_softplusdimr   rI   )rv   rh   ).r   )dt_softplus)r   return_last_state)(r{   	transposetrainingr   rt   weightrr   rh   r|   r}   r   rz   floatr8   expr   r   r&   chunk	unsqueezeviewsizesqueezer6   rD   rv   r   
functionalpadr5   shaperP   splitrq   r3   hasattrr   r7   r   rT   )r?   r   r   rF   r   projected_statescontextualized_statesr    r   gateconv_weightsr6   ssm_parameters	time_stepBCdiscrete_time_stepr   time_proj_biasscan_outputsrB   s                        r%   cuda_kernels_forwardzMambaMixer.cuda_kernels_forward  sG     <<6@@AF==\1$2 ""$($6$6  D""##$$.2mm""((*4::++-..<<,,224#%!r %$Q 6N5O2 "2"2"8"8"8"BM4) -0H0H0K K  ;;--224;;3E3E3J3J13Mt{{OaOaOfOfghOijL'N1,=,A 4!))"- ,,T^^< KK$$OO! !. 7 7 ;+"$--"3"3%(=(=@S@STV@W(WYZ'[#K !224>>;P^_ 0!<1A1Adoo! ) -0H0H0K K "[[)@)@A)FGN#kk!4!4d6I6I4K^K^ _egOIq! "&!4!4y7J7J1a7P!P4::++-..A:A$,,PV:WT\\..446]aN'N1,=,A5 ++DNN;!&)&v.adGadGFFL" $  )B-  +<!&KK1%KK1%FFLLN"#'&*+'i (\-E 11$..)L %)MM,2H2HA2N$O!$$r'   c           	      X   |j                   \  }}}|j                  }| j                  |      j                  dd      }	|	j	                  dd      \  }
}||
|j                  d      z  }
||j                  | j                     j                         }|j                  |
j                        }|j                   d   | j                  k(  rt        j                  j                  |
| j                  |
j                   d   z
  df      }|j                  | j                  ||       | j!                  | j#                  |
      dd |f         }
n9|j                  | j                  |
|      }|j                  | j"                  j$                  j                        }t'        j(                  || j"                  j$                  d d dd d f   z  d      }
| j*                  r|
| j"                  j,                  z  }
| j!                  |
      j                  |      j                  d      }
n`t'        j.                  || j0                  | j2                  f|
j                  |      }| j!                  | j#                  |
      dd |f         }
||
|j                  d      z  }
| j5                  |
j                  dd            }t'        j6                  || j8                  | j2                  | j2                  gd      \  }}}| j;                  |      }t        j                  j=                  |      j                  dd      }t'        j>                  | j@                  jC                                }t'        j>                  |d d d d d d f   |d d d d d d d f   z        }|d d d d d d d f   |d d d d d d d f   jC                         z  }||
d d d d d d d f   jC                         z  }| jD                  r| jF                  r|tI        |j                  dd      |j                  dd            }||j                  d      z  jK                  d      j                  dd      }||
| jL                  d d d d f   z  z   }|| j!                  |      z  }ng }tO        |      D ]}  }|d d d d |d d f   |z  |d d d d |d d f   z   }t'        jP                  |j                  |      |d d |d d f   j                  d            }|jS                  |d d d d df           t'        jT                  |d      }||
| jL                  d d d d f   z  z   }|| j!                  |      z  }|(|j                  | j                     jW                  |       | jY                  |j                  dd            }|S )	Nr   rl   r   r   rI   .r/   r	   )-r   r,   r{   r   r   r   r7   rD   clonerL   r-   r5   r   r   r   rP   rw   rt   r   r8   sumrr   rh   r;   r1   r3   r|   r   rq   r}   softplusr   r   r   rx   r   r   r   r   r9   matmulr>   stackcopy_r   )r?   input_statesr   rF   r   
batch_sizeseq_lenr@   r,   r   r   r   rB   rA   r   r   r   r   r   r   
discrete_A
discrete_BdeltaB_uhsscan_outputr   ir   s                               r%   slow_forwardzMambaMixer.slow_forwardh  s   !-!3!3
GQ""<<5??1E.44QA4>t%)N,D,DQ,GGM #$//?EEGI!]%9%9:I ##A&$*?*??]]..!**]-@-@-DDaH

 ..t~~z>Z $])CC'M)R S);;DNNM[ij
']]4;;+=+=+D+DE
 %		*t{{7I7I!QPQ'7R*RXZ [%%!T[[%5%55M $ 7 : :5 A K KB OT33T5H5HI$++5I !HHT[[%?XgX%NOM%)N,D,DQ,GGM ]%<%<Q%BC++T00$2E2EtGZGZ[ac
	1a "\\)4]]334FGQQRSUVW YYtzz'')**YYqq$!125G1aQU5VVW
'1a61dAq=9I9O9O9QQ
aAtm < B B DD ,2Fz++Aq183E3Ea3KLBB/88;EEaKK%tQ}8M(MMK%6KL7^&q!Qz2Y>!QPQST*AUU	#ll9<<+>!Q'
@T@TUW@XY##K1a$89 $  ++l;K%a9N)NOK&$7K'''7==iH !%k.C.CAq.I J$$r'   c                 <   t               \  }}t        t        t        ||t        f      }|r^d| j
                  j                  j                  j                  v r2t        j                  j                         s| j                  ||||      S | j                  ||||      S )Ncuda)r&   r   r   r   r   r|   r   r-   typer8   r<   is_compilingr   r   )r?   r   r   rF   r   r    r   r   s           r%   forwardzMambaMixer.forward  s     2J1K..!$#%68HJ^`no"
 "f0B0B0I0I0N0N&NW\WdWdWqWqWs,,]L.Zhii  nn]]r'   r   )rX   rY   rZ   r[   r   r^   rC   r   r8   r`   r   r)   ra   r   r   r   __classcell__r   s   @r%   rd   rd      s   )({ )(s )(V6 .25959d%||d% z*d% !!1!12	d%
 !!1!12d%NO%x
7K O%aijojzjza{ O%  S[  \a  \l  \l  Sm O%j .25959^ z*^ !!1!12	^
 !!1!12^r'   rd   c                   ,     e Zd Zd fd	Zd Zd Z xZS )MambaRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zL
        MambaRMSNorm is equivalent to T5LayerNorm and LlamaRMSNorm
        N)ro   rC   r   r   r8   r   r   variance_epsilon)r?   rp   epsr   s      r%   rC   zMambaRMSNorm.__init__  s1     	ll5::k#:; #r'   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nrl   rI   T)keepdim)	r,   rL   r8   r   powmeanrsqrtr   r   )r?   r   input_dtypevariances       r%   r   zMambaRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r'   c                 R    | j                   j                  d    d| j                   S )Nr   z, eps=)r   r   r   r?   s    r%   
extra_reprzMambaRMSNorm.extra_repr  s*    ++##A&'vd.C.C-DEEr'   )gư>)rX   rY   rZ   rC   r   r   r   r   s   @r%   r   r     s    $;Fr'   r   c                   t     e Zd Z fdZ	 	 	 ddee   deej                     deej                     fdZ xZ	S )
MambaBlockc                     t         |           || _        || _        |j                  | _        t        |j                  |j                        | _        t        ||      | _
        y )Nr   rD   )ro   rC   r*   rD   residual_in_fp32r   rp   layer_norm_epsilonnormrd   mixer)r?   r*   rD   r   s      r%   rC   zMambaBlock.__init__  sR    " & 7 7 !3!39R9RS	)<
r'   r   rF   r   c                    |}| j                  |j                  | j                   j                  j                              }| j                  r|j                  t
        j                        }| j                  ||||      }||z   }|S )Nrn   r   rF   r   )r   rL   r   r,   r   r8   r   r   )r?   r   r   rF   r   residuals         r%   r   zMambaBlock.forward  s     !		-"2"29I9I9O9O"2"PQ  {{5==1H

^dr # 
 !=0r'   r   )
rX   rY   rZ   rC   r   r)   r8   ra   r   r   r   s   @r%   r   r     sQ    = .25959 z* !!1!12	
 !!1!12r'   r   c                   2    e Zd ZU eed<   dZddgZdZdZd Z	y)MambaPreTrainedModelr*   backboner   rd   Tc                 
   | j                   j                  }t        |t              r!t	        j
                  d|j                  dz   t        j                        dddf   }|j                  |j                  d      j                         }|j                  j                  t	        j                  |             |j                  j                  j!                  d       | j                   j"                  dz  | j                   j$                  z  }| j                   j&                  dk(  r5t(        j*                  j-                  |j.                  j0                  |       nO| j                   j&                  dk(  r6t(        j*                  j3                  |j.                  j0                  | |       t	        j4                  t	        j6                  | j                   j                        t9        j                  | j                   j:                        t9        j                  | j                   j<                        z
  z  t9        j                  | j                   j<                        z         j?                  | j                   j@                  	      }|t	        j                  t	        jB                  |              z   }|j.                  jD                  j                  |       d
|j.                  jD                  _#        t(        j*                  jI                  |jJ                  j0                  t9        jL                  d             |jJ                  jD                  TtO        |jJ                  jD                  dd      s3t(        j*                  jQ                  |jJ                  jD                         t(        j*                  jI                  |jR                  j0                  t9        jL                  d             | j                   jT                  rB|jR                  j0                  }|t9        jL                  | j                   jV                        z  }t        |t(        jX                        rtO        |j0                  dd      s+t(        j*                  j[                  |j0                  |       |jD                  BtO        |jD                  dd      s*t(        j*                  jQ                  |jD                         yyyt        |t\              r&|j0                  j                  j!                  d       yt        |t(        j^                        r,t(        j*                  j[                  |j0                  |       yy)zInitialize the weights.r   rn   NrI   g      ?g      constantrandom)minT   )a
_no_reinitF)std)0r*   initializer_range
isinstancerd   r8   r~   r3   r   r   r1   r   r   r   r   r   datafill_rq   time_step_scaletime_step_init_schemer   init	constant_r}   r   uniform_r   randmathtime_step_maxtime_step_minrM   time_step_floorexpm1rh   r   kaiming_uniform_rt   sqrtgetattrzeros_r   rescale_prenorm_residualr:   ry   normal_r   	Embedding)r?   moduler   r   dt_init_stddtinv_dtps           r%   _init_weightsz"MambaPreTrainedModel._init_weights  s   kk++fj) Q 5 5 9OPTVWPWXA1126AACALLuyy|,HHMM$++44d:T[[=X=XXK{{00J>!!&.."7"7E22h>  !6!6kR

4;;88988DKK556$++B[B[9\\^((4;;4456 e33e4	  %))U[["%5$566FNN%%f--1FNN*GG$$V]]%9%9TYYq\$J}}!!-v}}11<GGGNN6==#5#56GG$$V__%;%;tyy|$L{{33 OO**TYYt{{<<==fbii(6==,>37{{&v{{L%@GGNN6;;/ A ' -MM$$S)-GGOOFMMsO3 .r'   N)
rX   rY   rZ   r   __annotations__base_model_prefix_no_split_modulessupports_gradient_checkpointing_is_statefulr  rb   r'   r%   r   r     s)    "%|4&*#L84r'   r   z,
    Class for the MAMBA model outputs.
    )custom_introc                   |    e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZeeej                        ed<   y)MambaOutputa9  
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlast_hidden_stater   r   )rX   rY   rZ   r[   r  r   r8   FloatTensorr  r   r)   r   tuplerb   r'   r%   r  r  =  sH     6:x 1 129)-L(:&-8<M8E%"3"345<r'   r  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
ee   ed<   dZeeej                        ed<   y)MambaCausalLMOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    cache_params (`MambaCache`):
        The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
        avoid providing the old `input_ids`.

        Includes both the State space model state matrices after the selective scan, and the Convolutional states
    Nlosslogitsr   r   )rX   rY   rZ   r[   r  r   r8   r  r  r  r   r)   r   r  rb   r'   r%   r  r  Q  s\    
 )-D(5$$
%,*.FHU&&'.)-L(:&-8<M8E%"3"345<r'   r  c                       e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee   dee   d	ee   d
ee   dee	j                     dee	j                     deeef   fd       Z xZS )
MambaModelc           	         t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _        t        |j
                  |j                        | _        | j!                  | j"                         | j%                          y c c}w )Nr   Fr   )ro   rC   r   r	  
vocab_sizerp   
embeddings
ModuleListr9   r:   r   layersgradient_checkpointingr   r   norm_f"_register_load_state_dict_pre_hook	load_hook	post_init)r?   r*   idxr   s      r%   rC   zMambaModel.__init__l  s     ,,v'8'8&:L:LMmmRWX^XpXpRq$rRq3Z#%FRq$rs&+#"6#5#56;T;TU//? %ss   &Cc                 f    |D ],  }d|v s|j                  |      ||j                  dd      <    y  y )Nz
embedding.zembeddings.)popreplace)r?   
state_dictprefixargsks        r%   r)  zMambaModel.load_hookx  s6    Aq EO^^TUEV
199\=AB r'   c                     | j                   S rS   r#  r   s    r%   get_input_embeddingszMambaModel.get_input_embeddings~  s    r'   c                     || _         y rS   r4  r?   new_embeddingss     r%   set_input_embeddingszMambaModel.set_input_embeddings  s	    (r'   	input_idsinputs_embedsr   	use_cacheoutput_hidden_statesreturn_dictrF   r   rG   c	                 8   ||n| j                   j                  }||n#| j                  s| j                   j                  nd}||n| j                   j                  }|du |duz  rt        d      || j                  |      }| j                  r| j                  r|rd}|r|st        | j                   |j                  d      |j                  |j                        }t        j                  d| j                   j                  |j                        }n|t        d      d}|}	|rdnd}
| j                  D ]  } ||	|||	      }	|s|
|	fz   }
 | j!                  |	      }	|r|
|	fz   }
|st#        d
 |	||
fD              S t%        |	|r||
      S d|
      S )a  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        NFz:You must specify exactly one of input_ids or inputs_embedsr   r/   r-   zYou have to specify the `cache_position` manually when `use_cache=True` and `cache_params` is passed, you don't have to pass a `cache_params` if you are in prefilling stage because in that case it will be initialized for you automaticallyrb   r   c              3   &   K   | ]	  }||  y wrS   rb   ).0vs     r%   	<genexpr>z%MambaModel.forward.<locals>.<genexpr>  s     f$TqXYXe$Ts   )r  r   r   )r*   r=  r   r<  use_return_dict
ValueErrorr#  r&  r)   r   r-   r,   r8   r~   r4   r%  r'  r  r  )r?   r:  r;  r   r<  r=  r>  rF   r   r   all_hidden_statesmixer_blocks               r%   r   zMambaModel.forward  s   ( %9$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]-t";<YZZ  OOI6M&&4==YI#)KK!3!3A!6}?S?S[h[n[n  "'a1H1HQ^QeQe!f' !;   L%"6BD;;K')--	M $$58H$H! ' M2 1]4D Df]LBS$Tfff+)2+
 	
8<+
 	
r'   )NNNNNNNN)rX   rY   rZ   rC   r)  r5  r9  r   r   r8   ra   r)   boolr   r  r  r   r   r   s   @r%   r   r   j  s    
)  1548-1$(/3&*5959L
E,,-L
   0 01L
 z*	L

 D>L
 'tnL
 d^L
 !!1!12L
 !!1!12L
 
uk!	"L
 L
r'   r   z
    The MAMBA Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                       e Zd ZdgZ fdZd Zd Z	 ddedee	e
f   dedee	e
f   fd	Z	 	 	 	 	 dd
ee   deej                      deej                      fdZe	 	 	 	 	 	 	 	 	 ddeej                      deej                      deej&                     d
ee   deej                      dee   dee   dee   deej*                     deeef   fd       Z xZS )MambaForCausalLMzlm_head.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFrm   )
ro   rC   r   r   r   ry   rp   r"  lm_headr*  )r?   r*   r   s     r%   rC   zMambaForCausalLM.__init__  sF     "6*yy!3!3V5F5FUSr'   c                 6    | j                   j                         S rS   )r   r5  r   s    r%   r5  z%MambaForCausalLM.get_input_embeddings  s    }}1133r'   c                 8    | j                   j                  |      S rS   )r   r9  r7  s     r%   r9  z%MambaForCausalLM.set_input_embeddings  s    }}11.AAr'   outputsmodel_kwargsnum_new_tokensrG   c                    |j                  dd       |d<   |j                  dd      rd|v r|d   |d   dd  |z   |d<   d|v r?|d   }t        j                  ||j                  |j                  d   df      gd	      |d<   |S )
Nr   r<  TrF   rI   r   r   r   r   )getr8   catnew_onesr   )r?   rP  rQ  rR  kwargsr   s         r%   #_update_model_kwargs_for_generationz4MambaForCausalLM._update_model_kwargs_for_generation  s     (/{{>4'H^$[$/ L0-.:-9:J-KBC-PSa-aL)*|+)*:;N-2YY!8!8.:N:Nq:QST9U!VW]_.L)* r'   r   rF   r   c                 \   d|j                         i}|r|t        j                  d| j                  j                  j
                  |j                        }|d|i}|j                  d      }	n|j                  d      }	t        | j                  j                  |	| j                  | j                        }|r3|d   dkD  r+|d d df   j                  d      j                         |d<   d }|s|d|i}|j                  ||||d       |j                         D ]  \  }
}|
|vs|||
<    |S )Nr:  r   r@  r;  r/   rI   )r   r<  rF   r   )r   r8   r~   r   r*   r4   r-   r   r)   r,   r   updateitems)r?   r:  r;  r<  r   rF   r   rW  model_inputsr+   keyvalues               r%   prepare_inputs_for_generationz.MambaForCausalLM.prepare_inputs_for_generation  s<    $Y%9%9%;<-
 #\\!T]]-A-A-M-MV_VfVfgN( /?!.!3!3A!6!*!2%dmm&:&:NSWS^S^fjfpfpqL*Q.(1!R%(8(B(B2(F(Q(Q(SL%!N]6+];L ,&"0"0		
 !,,.JC,&$)S! ) r'   r:  r;  labelsr=  r>  r<  c
           
         ||n| j                   j                  }| j                  |||||||	|      }|d   }| j                  |j	                  | j                  j
                  j                              j                         }d}||j	                  |j                        }|dddddf   j                         }|dddf   j                         }t               } ||j                  d|j                  d            |j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )aS  
        cache_params (`MambaCache`, *optional*):
            If passed along, the model uses the previous state in all the blocks (which will give the output for the
            `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        use_cache (`bool`, *optional*):
            If set to `True`, the `cache_params` is returned and can be used to quickly generate the next logits.
        N)r   r;  r=  r>  r<  rF   r   r   .rI   r   )r  r  r   r   )r*   rE  r   rM  rL   r   r,   r   r-   r   r   r   r   r  r   r   )r?   r:  r   r;  r   r`  r=  r>  r<  rF   rW  mamba_outputsr   r  r  shift_logitsshift_labelsloss_fctoutputs                      r%   r   zMambaForCausalLM.forward-  s_   2 &1%<k$++B]B]%'!5#)) & 	
 &a(m..t||/B/B/H/HIJPPRYYv}}-F!#ssA+.99;L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`DYqr!22F)-)9TGf$EvE"&33'55	
 	
r'   )r   )NNNNN)	NNNNNNNNN)rX   rY   rZ   _tied_weights_keysrC   r5  r9  r   dictr_   r   r^   rX  r   r)   r8   ra   r_  r   r  rI  r`   r   r  r  r   r   r   s   @r%   rK  rK    s    ++4B YZ"26sCx.RU	c3h, -15959.
 z*. !!1!12. !!1!12.`  155959-1-1/3&*$(15<
E,,-<
 !!1!12<
   1 12	<

 z*<
 ))*<
 'tn<
 d^<
 D><
 !.<
 
u))	*<
 <
r'   rK  )rK  r   r   r)   )8r[   r   dataclassesr   typingr   r   r   r8   r   torch.nnr   activationsr
   configuration_utilsr   
generationr   modeling_layersr   modeling_utilsr   utilsr   r   r   utils.import_utilsr   r   r   r   configuration_mambar   
get_loggerrX   r   mambapy.pscanr   &mamba_ssm.ops.selective_scan_interfacer   r   +mamba_ssm.ops.triton.selective_state_updater   r!   r&   r)   Modulerd   r   r   r   r  r  r   rK  __all__rb   r'   r%   <module>rz     s     ! ' '   % ! 3 ) 9 - 
  - 
		H	%#EXR@P=-~  &d/ d/NQ^ Q^hF299 F(+ 8 ?4? ?4 ?4D 
=+ = = 
=+ = =& f
% f
 f
R P
+_ P
P
f Sr'   