
    <hQ                     b   S SK JrJrJrJr  S SKrS SKJs  Jr	  S SKJr  SSK
Jr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  \" 5       (       a	  S SK(J)r)J*r*  OSu  r)r*\)\*4r+\," \+5      r-\R\                  " \/5      r0 " S S\"5      r1 " S S\#5      r2 " S S\Rf                  5      r4 " S S\5      r5 " S S\5      r6 " S S\Rf                  5      r7 " S  S!\5      r8 " S" S#\!5      r9 " S$ S%\ 5      r: " S& S'\5      r;/ S(Qr<g))    )AnyCallableOptionalUnionN)nn   )DynamicCache)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPast)ALL_ATTENTION_FUNCTIONS)Unpack)TransformersKwargslogging)is_causal_conv1d_available   )apply_mask_to_padding_states)LlamaAttentionLlamaForCausalLM
LlamaModelLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNc                       \ rS rSrSrg)Lfm2RMSNorm7    N__name__
__module____qualname____firstlineno____static_attributes__r$       ]/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/lfm2/modular_lfm2.pyr"   r"   7       r+   r"   c                       \ rS rSrSrg)Lfm2RotaryEmbedding;   r$   Nr%   r$   r+   r,   r/   r/   ;   r-   r+   r/   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Lfm2MLP?   configc                   > [         TU ]  5         UR                  nUR                  (       aa  [	        SU-  S-  5      nUR
                  bC  [	        UR
                  U-  5      nUR                  X!R                  -   S-
  UR                  -  -  n[        R                  " UR                  USS9U l
        [        R                  " UR                  USS9U l        [        R                  " X!R                  SS9U l        g )Nr   r   r   Fbias)super__init__intermediate_sizeblock_auto_adjust_ff_dimintblock_ffn_dim_multiplierblock_multiple_ofr   Linearhidden_sizew1w3w2)selfr4   r:   	__class__s      r,   r9   Lfm2MLP.__init__@   s    "44** #A(9$9A$= >..:$'(G(GJ[([$\!$*$<$<&)A)AAAE&JbJbb%! ))F..0AN))F..0AN))-/A/ANr+   c                     U R                  [        R                  " U R                  U5      5      U R	                  U5      -  5      $ N)rC   FsilurA   rB   )rD   xs     r,   forwardLfm2MLP.forwardO   s/    wwqvvdggaj)DGGAJ677r+   )rA   rC   rB   )	r&   r'   r(   r)   r   r9   rL   r*   __classcell__rE   s   @r,   r2   r2   ?   s    Oz O8 8r+   r2   c                   |   \ rS rSrSrSrSrSrSr\	R                  S4S\S\S\	R                  S\\	R                  \S4   4S	 jjr SS
\	R$                  S\	R$                  S\S\\\\4      S\\	R$                  \	R$                  4   4
S jjrS\	R0                  4S jrSS\\   S\4S jjrS\	R$                  S\S\\\4   4S jrS\4S jrS\S\\	R$                  \	R$                  4   4S jrS\\\	R$                     \\	R$                     4   4S jr\SS\\\\	R@                           SS4S jj5       r!S r"Sr#g) Lfm2HybridConvCacheS   z
Attention and conv cache for Lfm2.

It stores the Key and Value states as a list of tensors, one for each layer.
Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
NFr4   max_batch_sizedtypedevicec           	      ,   / U l         / U l        X l        UR                  U l        U R                  R	                  S5      U l        UR                  U l        X0l        / U l        Ub  [        R                  " U5      OS n[        UR                  5       H}  n[        R                  " U R                  UR                  U R                  U R                  US9n[        R                  R!                  U5        U R                  R#                  U5        M     g )Nfull_attention)rT   rU   )	key_cachevalue_cacherS   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cachetorchrU   rangenum_hidden_layerszerosr@   _dynamomark_static_addressappend)rD   r4   rS   rT   rU   _
conv_states          r,   r9   Lfm2HybridConvCache.__init__b   s     ,!--%)%5%5%;%;<L%M""//.0)/);f%v//0A##""!!kkJ MM--j9OO"":. 1r+   
key_statesvalue_states	layer_idxcache_kwargsreturnc                 0   UGbu  [        U R                  5      U::  a  [        [        U R                  5      U5       Ha  nU R                  R                  [        R
                  " / 5      5        U R                  R                  [        R
                  " / 5      5        Mc     U R                  R                  U5        U R                  R                  U5        OU R                  U   R                  5       (       d  XR                  U'   X R                  U'   Ob[        R                  " U R                  U   U/SS9U R                  U'   [        R                  " U R                  U   U/SS9U R                  U'   U R                  U   U R                  U   4$ )a#  
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

Parameters:
    key_states (`torch.Tensor`):
        The new key states to cache.
    value_states (`torch.Tensor`):
        The new value states to cache.
    layer_idx (`int`):
        The index of the layer to cache the states for.
    cache_kwargs (`Dict[str, Any]`, `optional`):
        Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

Return:
    A tuple containing the updated key and value states.
dim)	lenrX   ra   rf   r`   tensorrY   numelcat)rD   rj   rk   rl   rm   rg   s         r,   updateLfm2HybridConvCache.update   s<   0 !4>>"i/s4>>2I>ANN))%,,r*:;$$++ELL,<= ? %%j1  ''5NN9-3355,6y).:  +,1IIt~~i7PR\6]ce,fy).3ii9I9I)9TVb8cik.l  +~~i($*:*:9*EEEr+   beam_idxc                 J   [        [        U R                  5      5       GH   nU R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   GM     g)zDReorders the cache for beam search, given the selected beam indices.r   N)ra   rs   rX   rU   index_selecttorY   r_   )rD   ry   rl   rU   s       r,   reorder_cache!Lfm2HybridConvCache.reorder_cache   s    s4>>23I^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI& 4r+   c                     U R                   U   S:w  a  U R                  OUn[        U R                  5      U::  d!  U R                  U   R	                  5       S:X  a  gU R                  U   R
                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.rW   r   rp   )rZ   r\   rs   rX   ru   shaperD   rl   s     r,   get_seq_length"Lfm2HybridConvCache.get_seq_length   sm     372B2B92MQa2aD..gp	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r+   cache_positionc                 R    SnUR                   S   nU R                  5       nXE-   nXc4$ )a  
Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
the given layer at `layer_idx`.
The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
for each layer.
r   )r   r   )rD   r   rl   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengths          r,   get_mask_sizes"Lfm2HybridConvCache.get_mask_sizes   s:      %++A...0 3	--r+   
max_lengthc                    US:  a  U R                  5       [        U5      -
  nU R                  5       U::  a  g[        [        U R                  5      5       Hs  nU R                  U   R                  5       (       d  M'  U R                  U   SSU2SS24   U R                  U'   U R                  U   SSU2SS24   U R                  U'   Mu     g)z"Crop the cache to the given lengthr   N.)r   absra   rs   rX   ru   rY   )rD   r   idxs      r,   cropLfm2HybridConvCache.crop   s    >,,.Z@J J.T^^,-C~~c"((**&*nnS&9#{
{A:M&Ns#(,(8(8(=c;J;PQ>Q(R  % .r+   c                 >    U R                   U   U R                  U   4$ rH   )rX   rY   r   s     r,   __getitem__Lfm2HybridConvCache.__getitem__   s!    ~~i($*:*:9*EEEr+   c                     [        S5      eNz<Lfm2HybridConvCache does not have a legacy cache equivalent.NotImplementedError)rD   s    r,   to_legacy_cache#Lfm2HybridConvCache.to_legacy_cache   s    !"`aar+   past_key_valuesr	   c                     [        S5      er   r   )clsr   s     r,   from_legacy_cache%Lfm2HybridConvCache.from_legacy_cache   s    !"`aar+   c                     [        [        U R                  5      5       H   nU R                  U   R                  5         M"     g rH   )ra   rs   r_   zero_r   s     r,   resetLfm2HybridConvCache.reset   s/    s4??34IOOI&,,. 5r+   )r^   r]   r_   r\   rX   rZ   rS   rY   rH   )r   )$r&   r'   r(   r)   __doc__rS   is_compileablerX   rY   r`   float32r   r<   rT   r   rU   strr9   Tensorr   dictr   tuplerw   
LongTensorr}   r   r   r   r   r   classmethodFloatTensorr   r   r*   r$   r+   r,   rQ   rQ   S   s    NNIK #]]15// / {{	/
 ellC-./D 26)FLL)F ll)F 	)F
 tCH~.)F 
u||U\\)	*)FV	ie&6&6 	i3 3c 3.U\\ .c .eTWY\T\o .Ss SFS FU5<<3M-N FbuU\\':E%,,<O'O!P b buUEVEV?W9X0Y bes b b/r+   rQ   c                   8  ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\   S	\
\R                     S
\	\R                  \
\R                     \
\	\R                        4   4S jjrSrU =r$ )Lfm2Attention   r4   rl   c                   > [         TU ]  X5        [        R                  " UR                  UR
                  U R                  -  SS9U l        [        R                  " UR                  UR                  U R                  -  SS9U l	        [        R                  " UR                  UR                  U R                  -  SS9U l
        [        R                  " UR
                  U R                  -  UR                  SS9U l        [        U R                  UR                  S9U l        [        U R                  UR                  S9U l        U ?U ?g )NFr6   eps)r8   r9   r   r?   r@   num_attention_headshead_dimq_projnum_key_value_headsk_projv_projout_projr"   norm_epsq_layernormk_layernormo_projattention_dropoutrD   r4   rl   rE   s      r,   r9   Lfm2Attention.__init__   s    +ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejk		&"<"<t}}"LfN`N`glm&t}}&//J&t}}&//JK"r+   hidden_statesposition_embeddingsattention_maskpast_key_valuer   rn   c                 *   UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R                  " U6 5      R                  SS5      n	U R                  U R                  U5      R                  " U6 5      R                  SS5      n
U R                  U5      R                  " U6 R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4SU R                   S.UD6u  nnUR"                  " / UQSP76 R%                  5       nU R'                  U5      nUU4$ )Nr   r   )sincosr   eagerg        )dropoutscaling)r   r   r   r   view	transposer   r   r   r   rw   rl   r   r4   _attn_implementationr   r   reshape
contiguousr   )rD   r   r   r   r   r   kwargsinput_shapehidden_shapequery_statesrj   rk   r   r   rm   attention_interfaceattn_outputattn_weightsoutputs                      r,   rL   Lfm2Attention.forward   s    $))#2.88b8$--8''M(B(G(G(VWaabcefg%%dkk-&@&E&E|&TU__`acde
{{=166EOOPQSTU&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
 LL	%
 	%
!\ "));;;;FFH{+|##r+   )r   r   r   r   r   r   r    )r&   r'   r(   r)   r   r<   r9   r`   r   r   r   rQ   r   rL   r*   rN   rO   s   @r,   r   r      s    	#z 	#c 	#  9=59'$||'$ #5<<#=>'$ !.	'$
 !!45'$ !!1!12'$ 
u||Xell3XeELL>Q5RR	S'$ '$r+   r   c            
         ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\	\
   S\	\R                     S\	\R                     4S	 jjr   SS\R                  S\	\
   S\	\R                     S\	\R                     4S
 jjr   SS\R                  S\	\
   S\	\R                     S\	\R                     4S jjrSrU =r$ )Lfm2ShortConvi  r4   rl   c           	      "  > [         TU ]  5         Xl        X l        UR                  U l        UR                  U l        [        R                  " UR                  UR                  U R
                  UR                  U R                  U R
                  S-
  S9U l        [        R                  " UR                  SUR                  -  U R                  S9U l        [        R                  " UR                  UR                  U R                  S9U l        g )Nr   )in_channelsout_channelskernel_sizegroupsr7   paddingr   r6   )r8   r9   r4   rl   r]   L_cache	conv_biasr7   r   Conv1dr@   convr?   in_projr   r   s      r,   r9   Lfm2ShortConv.__init__  s    
 	"**$$	II**++%%LL1$
	 yy!3!3Q9K9K5KRVR[R[\		&"4"4f6H6HtyyYr+   rK   r   r   r   c                    [        X5      nU R                  U5      R                  SS5      nUR                  SSS9u  pgnXa-  nU R                  R
                  R                  U R                  R
                  R                  S5      U R                  R
                  R                  S5      5      n	Ubd  US   S:  a[  [        UR                  S5      UR                  U R                     U	U R                  R                  S 5      n
U
R                  S5      n
OUbc  [        R                  R!                  XR"                  UR$                  S   -
  S45      nUR                  U R                     R'                  U5        [)        XU R                  R                  S S9n
Xz-  nU R+                  UR                  SS5      R-                  5       5      nU$ )Nr   rp   r   rq   r   r   )
activation)r   r   r   chunkr   weightr   sizer   squeezer_   rl   r7   	unsqueezer   
functionalpadr   r   copy_r   r   r   )rD   rK   r   r   r   BCxBCBxconv_weightsconv_outrh   ys                r,   cuda_kernels_forward"Lfm2ShortConv.cuda_kernels_forward4  s{    );ll1o''B/))A2)&aUyy'',,TYY-=-=-B-B1-EtyyGWGWG\G\]^G_`%.*;a*?+

2))$..9		H  ))"-H)]]..rLL288B<4OQR3ST
))$..9??
K'$))..UYZHLMM!++b"-88:;r+   c                 |   UR                   S   n[        X5      nU R                  U5      R                  SS5      nUR	                  SSS9u  pxnXq-  n	UGb2  US   S:  Ga(  UR
                  U R                     n
UR                  SU R                  S-
  5      nU
R                  SSS9n
U	R                  U
R                  U
R                  S9U
S S 2S S 2U4'   UR
                  U R                     R                  U
5        [        R                  " U
R                  U	R                  5      U R                   R"                  S S 2SS S 24   -  SS9nU R$                  (       a  XR                   R$                  -  nUR'                  S5      nO~Ubc  [(        R*                  R-                  XR                  U	R                   S   -
  S45      n
UR
                  U R                     R                  U
5        U R!                  U	5      S	S U24   nX-  nUR                  SS5      R/                  5       nU R1                  U5      nU$ )
Nr   r   rp   r   rq   r   )shiftsdims)rU   rT   .)r   r   r   r   r   r_   rl   clampr   rollr|   rU   rT   r   r`   sumr   r   r7   r   r   r   r   r   r   )rD   rK   r   r   r   seqlenr   r   r   r   rh   r   r   s                r,   slow_forwardLfm2ShortConv.slow_forwardV  s    (;ll1o''B/))A2)&aU%.*;a*?'224>>BJ+11!T\\A5EFN#<J/1uuJ<M<MU_UeUeu/fJq!^+,%%dnn5;;JGyyryy!9DII<L<LQPQSTW<U!U[]^HyyIINN*))"-H)]]..rLL288B<4OQR3ST
))$..9??
Kyy}S'6'\2HLKKB**,MM!r+   r   c                     [         (       aO  SUR                  R                  ;   a5  [        R                  R                  5       (       d  U R                  XX45      $ U R                  XX45      $ )Ncuda)is_fast_path_availablerU   typer`   rd   is_compilingr   r  )rD   r   r   r   r   s        r,   rL   Lfm2ShortConv.forward|  sX     "!f0D0D0I0I&IRWR_R_RlRlRnRn,,]Nkk  __r+   )r   r7   r4   r   r   rl   r   )NNN)r&   r'   r(   r)   r   r<   r9   r`   r   r   rQ   r   r   r  rL   r*   rN   rO   s   @r,   r   r     s&   ZZ Z2 9=5915 <<  !!45  !!1!12	 
 !. J 9=5915$<<$ !!45$ !!1!12	$
 !.$R 9=5915	`||	` !!45	` !!1!12		`
 !.	` 	`r+   r   c                   .  ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\R                     S	\
\	\R                        S
\
\R                     S\R                  4S jjrSrU =r$ )Lfm2DecoderLayeri  r4   rl   c                 `  > [         TU ]  5         UR                  U   S:H  U l        U R                  (       a  [	        X5      U l        O[        X5      U l        [        U5      U l	        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )NrW   r   )r8   r9   rZ   is_attention_layerr   	self_attnr   r   r2   feed_forwardr"   r@   r   operator_normffn_normr   s      r,   r9   Lfm2DecoderLayer.__init__  s    "("4"4Y"?CS"S""*6=DN%f8DI#FO(););Q#F$6$6FOOLr+   r   r   r   position_idsr   r   rn   c           
         UnU R                   (       a+  U R                  " SU R                  U5      UUUUUS.UD6u  pO!U R                  U R                  U5      UUUS9nX-   nXR	                  U R                  U5      5      -   nU$ )N)r   r   r   r  r   r   )r   r   r   r   r$   )r  r  r  r   r  r  )
rD   r   r   r   r  r   r   r   residualrg   s
             r,   rL   Lfm2DecoderLayer.forward  s     !""#~~  "00?$7-)--   M1 !II"00?---	 & M &0%(9(9$--:V(WWr+   )r   r  r  r  r  r  )NNNN)r&   r'   r(   r)   r   r<   r9   r`   r   r   r   r   rL   r*   rN   rO   s   @r,   r  r    s    
Mz 
Mc 
M  26378<59|| #5<<#=> !.	
 u//0 !u||!45 !!1!12 
 r+   r  c                       \ rS rSrSrSrg)Lfm2PreTrainedModeli  Fr$   N)r&   r'   r(   r)   _can_compile_fullgraphr*   r$   r+   r,   r  r    s    "r+   r  c                     ^  \ rS rSrS\4U 4S jjr       SS\\R                     S\\R                     S\\R                     S\\
   S\\R                     S	\\   S
\\R                     S\\   S\4S jjrSrU =r$ )	Lfm2Modeli  r4   c                    > [         TU ]  U5        [        U5      U l        [	        UR
                  UR                  S9U l        U ?U ?	g )Nr   )
r8   r9   r/   pos_embr"   r@   r   embedding_normnorm
rotary_emv)rD   r4   rE   s     r,   r9   Lfm2Model.__init__  s?     *62)&*<*<&//RIOr+   	input_idsr   r  r   inputs_embeds	use_cacher   r   rn   c           
         US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a;  Uc8  UR                  S   n	[        U R                  XR
                  U R                  S9nUcD  Ub  UR                  5       OSn
[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U R                  UUUUUS9nUnU R                  X5      nU R                  S U R                  R                    H  nU" U4UUUUUS.UD6nM     U R                  U5      n[!        UUS9$ )	Nz:You must specify exactly one of input_ids or inputs_embedsr   )r4   rS   rT   rU   r   )rU   )r4   input_embedsr   r   r   r  )r   r  r   r   r   )last_hidden_stater   )
ValueErrorembed_tokensr   rQ   r4   rT   rU   r   r`   aranger   r
   r  layersrb   r  r   )rD   r#  r   r  r   r$  r%  r   r   
batch_sizer   causal_maskr   r   decoder_layers                  r,   rL   Lfm2Model.forward  sz    -t";<YZZ  --i8M0&,,Q/J1{{:ZZX\XcXcO !CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"ll=G "[[)H4;;+H+HIM)*).-$7 M J ++M:&++
 	
r+   )r  r  )NNNNNNN)r&   r'   r(   r)   r   r9   r   r`   r   r   rQ   r   boolr   r   r   rL   r*   rN   rO   s   @r,   r  r    s    z  1515379=59$(59=
E,,-=
 !.=
 u//0	=

 ""56=
   1 12=
 D>=
 !!1!12=
 +,=
 
!=
 =
r+   r  c                       \ rS rSrSrg)Lfm2ForCausalLMi  r$   Nr%   r$   r+   r,   r3  r3    r-   r+   r3  )r3  r  r  )=typingr   r   r   r   r`   torch.nn.functionalr   r   rI   cache_utilsr	   masking_utilsr
   modeling_layersr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   utils.import_utilsr   bamba.modeling_bambar   llama.modeling_llamar   r   r   r   r   r   r   r   configuration_lfm2r   causal_conv1dr   r   kernel_modulesallr  
get_loggerr&   loggerr"   r/   Moduler2   rQ   r   r   r  r  r  r3  __all__r$   r+   r,   <module>rH     s(   2 1     ' / 9 7 5 & 0 < ?	 	 	 + DD-7** #$89^,  
		H	%	, 		. 	8bii 8(Q/, Q/h3$N 3$lh`BII h`V,1 ,^#. #E

 E
P	& 	 Br+   