
    <h}                        S SK JrJrJrJr  S SKrS SKJs  Jr	  S SKJr  SSK
JrJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJrJr  SSKJr  SSK J!r!J"r"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)  \'" 5       (       a	  S SK*J+r+J,r,  OSu  r+r,\" S5       " S S\RZ                  5      5       r. " S S\RZ                  5      r/ " S S\RZ                  5      r0 " S S\5      r1S r2S;S jr3S \Rh                  S!\5S"\Rh                  4S# jr6 S<S$\RZ                  S%\Rh                  S&\Rh                  S'\Rh                  S(\\Rh                     S)\7S*\7S+\\!   4S, jjr8 " S- S.\RZ                  5      r9S/ r:\+\,4r;\<" \;5      r= " S0 S1\RZ                  5      r> " S2 S3\5      r?\" " S4 S5\5      5       r@\" " S6 S7\@5      5       rA\" " S8 S9\@\5      5       rB/ S:QrCg)=    )AnyCallableOptionalUnionN)nn   )CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputWithPastCausalLMOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple)check_model_inputs)is_causal_conv1d_available   )
Lfm2Config)causal_conv1d_fncausal_conv1d_updateNNRMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )Lfm2RMSNorm/   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z*
Lfm2RMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/lfm2/modeling_lfm2.pyr&   Lfm2RMSNorm.__init__1   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor(   float32powmeanrsqrtr+   r*   )r,   hidden_statesinput_dtypevariances       r0   forwardLfm2RMSNorm.forward9   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r2   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler*   shaper+   r,   s    r0   
extra_reprLfm2RMSNorm.extra_repr@   s*    ))*+6$2G2G1HIIr2   )r+   r*   )gư>)	__name__
__module____qualname____firstlineno__r&   r@   rF   __static_attributes____classcell__r/   s   @r0   r"   r"   /   s    $;J Jr2   r"   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )Lfm2RotaryEmbeddingD   configc                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r%   r&   hasattr
isinstancerT   dictgetrU   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrR   r   rope_init_fnattention_scalingregister_bufferrX   original_inv_freq)r,   rR   devicerX   r/   s       r0   r&   Lfm2RotaryEmbedding.__init__E   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r2   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r5   r   mpscpuF)device_typeenabledr4   dim)r7   )rX   floatexpandrD   r8   re   r[   rV   strr(   autocast	transposecatcosrb   sinr7   )
r,   xposition_idsinv_freq_expandedposition_ids_expandedrj   freqsembrt   ru   s
             r0   r@   Lfm2RotaryEmbedding.forwardV   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)rb   rR   r_   rd   r`   ra   rU   N)rH   rI   rJ   rK   r   r&   r(   no_gradr   r@   rL   rM   rN   s   @r0   rP   rP   D   s6    /z / /" ]]_<  <r2   rP   c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )Lfm2MLPf   rR   c                   > [         TU ]  5         UR                  nUR                  (       aa  [	        SU-  S-  5      nUR
                  bC  [	        UR
                  U-  5      nUR                  X!R                  -   S-
  UR                  -  -  n[        R                  " UR                  USS9U l
        [        R                  " UR                  USS9U l        [        R                  " X!R                  SS9U l        g )Nr4   r   r   Fbias)r%   r&   intermediate_sizeblock_auto_adjust_ff_dimintblock_ffn_dim_multiplierblock_multiple_ofr   Linearr-   w1w3w2)r,   rR   r   r/   s      r0   r&   Lfm2MLP.__init__g   s    "44** #A(9$9A$= >..:$'(G(GJ[([$\!$*$<$<&)A)AAAE&JbJbb%! ))F..0AN))F..0AN))-/A/ANr2   c                     U R                  [        R                  " U R                  U5      5      U R	                  U5      -  5      $ r}   )r   Fsilur   r   )r,   rv   s     r0   r@   Lfm2MLP.forwardv   s/    wwqvvdggaj)DGGAJ677r2   )r   r   r   )	rH   rI   rJ   rK   r   r&   r@   rL   rM   rN   s   @r0   r   r   f   s    Oz O8 8r2   r   c                   |   \ rS rSrSrSrSrSrSr\	R                  S4S\S\S\	R                  S\\	R                  \S4   4S	 jjr SS
\	R$                  S\	R$                  S\S\\\\4      S\\	R$                  \	R$                  4   4
S jjrS\	R0                  4S jrSS\\   S\4S jjrS\	R$                  S\S\\\4   4S jrS\4S jrS\S\\	R$                  \	R$                  4   4S jrS\\\	R$                     \\	R$                     4   4S jr\SS\\\\	R@                           SS4S jj5       r!S r"Sr#g) Lfm2HybridConvCachez   z
Attention and conv cache for Lfm2.

It stores the Key and Value states as a list of tensors, one for each layer.
Attention layer cache shape: `[batch_size, num_heads, seq_len, head_dim]`.
Conv layer cache shape: `[batch_size, hidden_size, L_cache-1]`.
NFrR   max_batch_sizer7   re   c           	      ,   / U l         / U l        X l        UR                  U l        U R                  R	                  S5      U l        UR                  U l        X0l        / U l        Ub  [        R                  " U5      OS n[        UR                  5       H}  n[        R                  " U R                  UR                  U R                  U R                  US9n[        R                  R!                  U5        U R                  R#                  U5        M     g )Nfull_attention)r7   re   )	key_cachevalue_cacher   layer_typesindexfirst_attention_layerconv_L_cache_dtype
conv_cacher(   re   rangenum_hidden_layerszerosr-   _dynamomark_static_addressappend)r,   rR   r   r7   re   _
conv_states          r0   r&   Lfm2HybridConvCache.__init__   s     ,!--%)%5%5%;%;<L%M""//.0)/);f%v//0A##""!!kkJ MM--j9OO"":. 1r2   
key_statesvalue_states	layer_idxcache_kwargsreturnc                 0   UGbu  [        U R                  5      U::  a  [        [        U R                  5      U5       Ha  nU R                  R                  [        R
                  " / 5      5        U R                  R                  [        R
                  " / 5      5        Mc     U R                  R                  U5        U R                  R                  U5        OU R                  U   R                  5       (       d  XR                  U'   X R                  U'   Ob[        R                  " U R                  U   U/SS9U R                  U'   [        R                  " U R                  U   U/SS9U R                  U'   U R                  U   U R                  U   4$ )a#  
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

Parameters:
    key_states (`torch.Tensor`):
        The new key states to cache.
    value_states (`torch.Tensor`):
        The new value states to cache.
    layer_idx (`int`):
        The index of the layer to cache the states for.
    cache_kwargs (`Dict[str, Any]`, `optional`):
        Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

Return:
    A tuple containing the updated key and value states.
rl   )	lenr   r   r   r(   tensorr   numelrs   )r,   r   r   r   r   r   s         r0   updateLfm2HybridConvCache.update   s<   0 !4>>"i/s4>>2I>ANN))%,,r*:;$$++ELL,<= ? %%j1  ''5NN9-3355,6y).:  +,1IIt~~i7PR\6]ce,fy).3ii9I9I)9TVb8cik.l  +~~i($*:*:9*EEEr2   beam_idxc                 J   [        [        U R                  5      5       GH   nU R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   U R                  U   R                  nU R                  U   R	                  SUR                  U5      5      U R                  U'   GM     g)zDReorders the cache for beam search, given the selected beam indices.r   N)r   r   r   re   index_selectr8   r   r   )r,   r   r   re   s       r0   reorder_cache!Lfm2HybridConvCache.reorder_cache   s    s4>>23I^^I.55F(,y(A(N(NqRZR]R]^dRe(fDNN9%%%i077F*.*:*:9*E*R*RSTV^VaVabhVi*jDY'__Y/66F)-)C)P)PQRT\T_T_`fTg)hDOOI& 4r2   c                     U R                   U   S:w  a  U R                  OUn[        U R                  5      U::  d!  U R                  U   R	                  5       S:X  a  gU R                  U   R
                  S   $ )zYReturns the sequence length of the cached states. A layer index can be optionally passed.r   r   r   )r   r   r   r   r   rD   r,   r   s     r0   get_seq_length"Lfm2HybridConvCache.get_seq_length   sm     372B2B92MQa2aD..gp	t~~)+t~~i/H/N/N/PTU/U~~i(..r22r2   cache_positionc                 R    SnUR                   S   nU R                  5       nXE-   nXc4$ )a  
Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
the given layer at `layer_idx`.
The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns (i.e. sliding_window, chunk_size),
for each layer.
r   )rD   r   )r,   r   r   full_mask_kv_offsetquery_lengthpast_seen_tokens	kv_lengths          r0   get_mask_sizes"Lfm2HybridConvCache.get_mask_sizes   s:      %++A...0 3	--r2   
max_lengthc                    US:  a  U R                  5       [        U5      -
  nU R                  5       U::  a  g[        [        U R                  5      5       Hs  nU R                  U   R                  5       (       d  M'  U R                  U   SSU2SS24   U R                  U'   U R                  U   SSU2SS24   U R                  U'   Mu     g)z"Crop the cache to the given lengthr   N.)r   absr   r   r   r   r   )r,   r   idxs      r0   cropLfm2HybridConvCache.crop   s    >,,.Z@J J.T^^,-C~~c"((**&*nnS&9#{
{A:M&Ns#(,(8(8(=c;J;PQ>Q(R  % .r2   c                 >    U R                   U   U R                  U   4$ r}   )r   r   r   s     r0   __getitem__Lfm2HybridConvCache.__getitem__   s!    ~~i($*:*:9*EEEr2   c                     [        S5      eNz<Lfm2HybridConvCache does not have a legacy cache equivalent.NotImplementedErrorrE   s    r0   to_legacy_cache#Lfm2HybridConvCache.to_legacy_cache  s    !"`aar2   past_key_valuesr
   c                     [        S5      er   r   )clsr   s     r0   from_legacy_cache%Lfm2HybridConvCache.from_legacy_cache  s    !"`aar2   c                     [        [        U R                  5      5       H   nU R                  U   R                  5         M"     g r}   )r   r   r   zero_r   s     r0   resetLfm2HybridConvCache.reset  s/    s4??34IOOI&,,. 5r2   )r   r   r   r   r   r   r   r   r}   )r   )$rH   rI   rJ   rK   __doc__r   is_compileabler   r   r(   r9   r   r   r7   r   re   rp   r&   Tensorr   r\   r   rC   r   
LongTensorr   r   r   r   r   r   classmethodFloatTensorr   r   rL    r2   r0   r   r   z   s    NNIK #]]15// / {{	/
 ellC-./D 26)FLL)F ll)F 	)F
 tCH~.)F 
u||U\\)	*)FV	ie&6&6 	i3 3c 3.U\\ .c .eTWY\T\o .Ss SFS FU5<<3M-N FbuU\\':E%,,<O'O!P b buUEVEV?W9X0Y bes b b/r2   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr5   r4   rl   )rD   r(   rs   )rv   x1x2s      r0   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkrt   ru   rw   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embr     sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr2   r=   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rD   ro   reshape)r=   r   batchnum_key_value_headsslenhead_dims         r0   	repeat_kvr   0  s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr2   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr4   r   r   r5   )rm   r7   )ptrainingr   )r   num_key_value_groupsr(   matmulrr   rD   r   
functionalsoftmaxr9   r8   r7   r   r  
contiguous)r   r   r   r   r   r   r   r  r   r   attn_weightscausal_maskattn_outputs                r0   eager_attention_forwardr  <  s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r2   c                   <  ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\
\R                  \R                  4   S\\R                     S	\\   S
\\R                     S\
\R                  \\R                     \\
\R                        4   4S jjrSrU =r$ )Lfm2AttentioniV  z=Multi-headed attention from 'Attention Is All You Need' paperrR   r   c                 j  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        SU l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR
                  UR                  U R                  -  SS9U l        [        R                  " UR                  U R                  -  UR
                  SS9U l        [%        U R                  UR&                  S9U l        [%        U R                  UR&                  S9U l        g )Nr   g      TFr   r.   )r%   r&   rR   r   getattrr-   num_attention_headsr   r   r  r   	is_causalr   r   q_projk_projv_projout_projr"   norm_epsq_layernormk_layernormr,   rR   r   r/   s      r0   r&   Lfm2Attention.__init__Y  sH   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*ii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejk		&"<"<t}}"LfN`N`glm&t}}&//J&t}}&//Jr2   r=   position_embeddingsr   past_key_valuer   r   c                 *   UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R                  " U6 5      R                  SS5      n	U R                  U R                  U5      R                  " U6 5      R                  SS5      n
U R                  U5      R                  " U6 R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4SU R                   S.UD6u  nnUR"                  " / UQSP76 R%                  5       nU R'                  U5      nUU4$ )Nr5   r   r4   )ru   rt   r   eager        )r   r   )rD   r   r  r  viewrr   r  r  r  r   r   r   r  rR   _attn_implementationr   r   r   r	  r  )r,   r=   r  r   r  r   r  input_shapehidden_shapequery_statesr   r   rt   ru   r   attention_interfacer  r
  outputs                      r0   r@   Lfm2Attention.forwardh  s    $))#2.88b8$--8''M(B(G(G(VWaabcefg%%dkk-&@&E&E|&TU__`acde
{{=166EOOPQSTU&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
 LL	%
 	%
!\ "));;;;FFH{+|##r2   )rR   r   r  r  r  r   r  r  r  r  r   r  r   )rH   rI   rJ   rK   r   r   r   r&   r(   r   rC   r   r   r   r@   rL   rM   rN   s   @r0   r  r  V  s    GKz Kc K( 9=59'$||'$ #5<<#=>'$ !.	'$
 !!45'$ !!1!12'$ 
u||Xell3XeELL>Q5RR	S'$ '$r2   r  c                     UbO  UR                   S   S:  a<  UR                   S   S:  a)  U R                  nXSS2SS2S4   -  R                  U5      n U $ )ze
Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66
Nr   r   )rD   r7   r8   )r=   r   r7   s      r0   apply_mask_to_padding_statesr,    s_     !n&:&:1&=&AnFZFZ[\F]`aFa##&1d
)CCGGNr2   c            
         ^  \ rS rSrS\S\4U 4S jjr   SS\R                  S\	\
   S\	\R                     S\	\R                     4S	 jjr   SS\R                  S\	\
   S\	\R                     S\	\R                     4S
 jjr   SS\R                  S\	\
   S\	\R                     S\	\R                     4S jjrSrU =r$ )Lfm2ShortConvi  rR   r   c           	      "  > [         TU ]  5         Xl        X l        UR                  U l        UR                  U l        [        R                  " UR                  UR                  U R
                  UR                  U R                  U R
                  S-
  S9U l        [        R                  " UR                  SUR                  -  U R                  S9U l        [        R                  " UR                  UR                  U R                  S9U l        g )Nr   )in_channelsout_channelskernel_sizegroupsr   paddingr   r   )r%   r&   rR   r   r   L_cache	conv_biasr   r   Conv1dr-   convr   in_projr  r  s      r0   r&   Lfm2ShortConv.__init__  s    
 	"**$$	II**++%%LL1$
	 yy!3!3Q9K9K5KRVR[R[\		&"4"4f6H6HtyyYr2   rv   r  r   r   c                    [        X5      nU R                  U5      R                  SS5      nUR                  SSS9u  pgnXa-  nU R                  R
                  R                  U R                  R
                  R                  S5      U R                  R
                  R                  S5      5      n	Ubd  US   S:  a[  [        UR                  S5      UR                  U R                     U	U R                  R                  S 5      n
U
R                  S5      n
OUbc  [        R                  R!                  XR"                  UR$                  S   -
  S45      nUR                  U R                     R'                  U5        [)        XU R                  R                  S S9n
Xz-  nU R+                  UR                  SS5      R-                  5       5      nU$ )Nr5   r   r   rl   r   r4   )
activation)r,  r9  rr   chunkr8  r*   r#  sizer   squeezer   r   r   r   r   r  padr5  rD   copy_r   r  r	  )r,   rv   r  r   r   BCxBCBxconv_weightsconv_outr   ys                r0   cuda_kernels_forward"Lfm2ShortConv.cuda_kernels_forward  s{    );ll1o''B/))A2)&aUyy'',,TYY-=-=-B-B1-EtyyGWGWG\G\]^G_`%.*;a*?+

2))$..9		H  ))"-H)]]..rLL288B<4OQR3ST
))$..9??
K'$))..UYZHLMM!++b"-88:;r2   c                 |   UR                   S   n[        X5      nU R                  U5      R                  SS5      nUR	                  SSS9u  pxnXq-  n	UGb2  US   S:  Ga(  UR
                  U R                     n
UR                  SU R                  S-
  5      nU
R                  SSS9n
U	R                  U
R                  U
R                  S9U
S S 2S S 2U4'   UR
                  U R                     R                  U
5        [        R                  " U
R                  U	R                  5      U R                   R"                  S S 2SS S 24   -  SS9nU R$                  (       a  XR                   R$                  -  nUR'                  S5      nO~Ubc  [(        R*                  R-                  XR                  U	R                   S   -
  S45      n
UR
                  U R                     R                  U
5        U R!                  U	5      S	S U24   nX-  nUR                  SS5      R/                  5       nU R1                  U5      nU$ )
Nr   r5   r   r   rl   r   )shiftsdims)re   r7   .)rD   r,  r9  rr   r=  r   r   clampr5  rollr8   re   r7   rA  r(   sumr8  r*   r   r   r   r  r@  r	  r  )r,   rv   r  r   r   seqlenrB  rC  rD  rE  r   rG  rH  s                r0   slow_forwardLfm2ShortConv.slow_forward  s    (;ll1o''B/))A2)&aU%.*;a*?'224>>BJ+11!T\\A5EFN#<J/1uuJ<M<MU_UeUeu/fJq!^+,%%dnn5;;JGyyryy!9DII<L<LQPQSTW<U!U[]^HyyIINN*))"-H)]]..rLL288B<4OQR3ST
))$..9??
Kyy}S'6'\2HLKKB**,MM!r2   r=   c                     [         (       aO  SUR                  R                  ;   a5  [        R                  R                  5       (       d  U R                  XX45      $ U R                  XX45      $ )Ncuda)is_fast_path_availablere   rV   r(   r   is_compilingrI  rR  )r,   r=   r  r   r   s        r0   r@   Lfm2ShortConv.forward   sX     "!f0D0D0I0I&IRWR_R_RlRlRnRn,,]Nkk  __r2   )r5  r   rR   r8  r9  r   r  )NNN)rH   rI   rJ   rK   r   r   r&   r(   r   r   r   r   rI  rR  r@   rL   rM   rN   s   @r0   r.  r.    s&   ZZ Z2 9=5915 <<  !!45  !!1!12	 
 !. J 9=5915$<<$ !!45$ !!1!12	$
 !.$R 9=5915	`||	` !!45	` !!1!12		`
 !.	` 	`r2   r.  c                   .  ^  \ rS rSrS\S\4U 4S jjr    SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\R                     S	\
\	\R                        S
\
\R                     S\R                  4S jjrSrU =r$ )Lfm2DecoderLayeri  rR   r   c                 `  > [         TU ]  5         UR                  U   S:H  U l        U R                  (       a  [	        X5      U l        O[        X5      U l        [        U5      U l	        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        g )Nr   r  )r%   r&   r   is_attention_layerr  	self_attnr.  r8  r   feed_forwardr"   r-   r  operator_normffn_normr  s      r0   r&   Lfm2DecoderLayer.__init__  s    "("4"4Y"?CS"S""*6=DN%f8DI#FO(););Q#F$6$6FOOLr2   r=   r  r   rw   r  r   r   c           
         UnU R                   (       a+  U R                  " SU R                  U5      UUUUUS.UD6u  pO!U R                  U R                  U5      UUUS9nX-   nXR	                  U R                  U5      5      -   nU$ )N)r=   r  r   rw   r  r   )r=   r  r   r   r   )r\  r]  r_  r8  r^  r`  )
r,   r=   r  r   rw   r  r   r  residualr   s
             r0   r@   Lfm2DecoderLayer.forward  s     !""#~~  "00?$7-)--   M1 !II"00?---	 & M &0%(9(9$--:V(WWr2   )r8  r^  r`  r\  r_  r]  )NNNN)rH   rI   rJ   rK   r   r   r&   r(   r   rC   r   r   r@   rL   rM   rN   s   @r0   rZ  rZ    s    
Mz 
Mc 
M  26378<59|| #5<<#=> !.	
 u//0 !u||!45 !!1!12 
 r2   rZ  c                   R    \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\\S.rS	rg
)Lfm2PreTrainedModeli;  rR   modelTrZ  r   F)r=   
attentionsr   N)rH   rI   rJ   rK   r   __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendrZ  r  _can_record_outputsrL   r   r2   r0   rf  rf  ;  sQ    &*#+,#4"5N""&)#r2   rf  c                     ^  \ rS rSrS\4U 4S jjr\\       SS\\	R                     S\\	R                     S\\	R                     S\\   S\\	R                     S	\\   S
\\	R                     S\\   S\4S jj5       5       rSrU =r$ )	Lfm2ModeliM  rR   c           	      &  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        [        US9U l        SU l        [        U5      U l        [%        UR                  UR&                  S9U l        U R+                  5         g s  snf )N)rR   Fr  )r%   r&   pad_token_idpadding_idx
vocab_sizer   	Embeddingr-   embed_tokens
ModuleListr   r   rZ  layersrP   
rotary_embgradient_checkpointingpos_embr"   r  embedding_norm	post_initr  s      r0   r&   Lfm2Model.__init__O  s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabBaYf0Bab
 .V<&+#*62)&*<*<&//R 	 cs   D	input_idsr   rw   r   inputs_embeds	use_cacher   r  r   c           
         US L US L-  (       a  [        S5      eUc  U R                  U5      nU(       a;  Uc8  UR                  S   n	[        U R                  XR
                  U R                  S9nUcD  Ub  UR                  5       OSn
[        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U R                  UUUUUS9nUnU R                  X5      nU R                  S U R                  R                    H  nU" U4UUUUUS.UD6nM     U R                  U5      n[!        UUS9$ )	Nz:You must specify exactly one of input_ids or inputs_embedsr   )rR   r   r7   re   r   )re   )rR   input_embedsr   r   r   rw   )r   rw   r  r   r  )last_hidden_stater   )
ValueErrorr{  rD   r   rR   r7   re   r   r(   aranger   r   r  r}  r   r  r   )r,   r  r   rw   r   r  r  r   r  
batch_sizer   r  r=   r  decoder_layers                  r0   r@   Lfm2Model.forward`  sz    -t";<YZZ  --i8M0&,,Q/J1{{:ZZX\XcXcO !CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"ll=G "[[)H4;;+H+HIM)*).-$7 M J ++M:&++
 	
r2   )r{  r  r  r}  rx  r  r~  ry  )NNNNNNN)rH   rI   rJ   rK   r   r&   r   r   r   r(   r   r   r   r   boolr   r   r   r@   rL   rM   rN   s   @r0   ru  ru  M  s    z "  1515379=59$(59=
E,,-=
 !.=
 u//0	=

 ""56=
   1 12=
 D>=
 !!1!12=
 +,=
 
!=
  =
r2   ru  c                   ~  ^  \ rS rSrS/rSS0rSS/S/40rU 4S jrS rS	 r	\
\         SS
\\R                     S\\R                     S\\R                     S\\   S\\R"                     S\\R                     S\\   S\\R                     S\\\R                  4   S\\   S\4S jj5       5       rSrU =r$ )Lfm2ForCausalLMi  zlm_head.weightlm_headcolwise_repr=   logitsc                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r%   r&   ru  rg  ry  r   r   r-   r  r  )r,   rR   r/   s     r0   r&   Lfm2ForCausalLM.__init__  sU     v&
 ++yy!3!3V5F5FUS 	r2   c                     Xl         g r}   rg  )r,   decoders     r0   set_decoderLfm2ForCausalLM.set_decoder  s    
r2   c                     U R                   $ r}   r  rE   s    r0   get_decoderLfm2ForCausalLM.get_decoder  s    zzr2   r  r   rw   r   r  labelsr  r   logits_to_keepr  r   c
                 ~   U R                   " SUUUUUUUS.U
D6nUR                  n[        U	[        5      (       a  [	        U	* S5      OU	nU R                  USS2USS24   5      nSnUb)  U R                  " SXU R                  R                  S.U
D6n[        UUUR                  UR                  UR                  S9$ )ai  
Example:

```python
>>> from transformers import AutoTokenizer, Lfm2ForCausalLM

>>> model = Lfm2ForCausalLM.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")
>>> tokenizer = AutoTokenizer.from_pretrained("meta-lfm2/Lfm2-2-7b-hf")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```)r  r   rw   r   r  r  r   N)r  r  ry  )lossr  r   r=   rh  r   )rg  r  r[   r   slicer  loss_functionrR   ry  r   r   r=   rh  )r,   r  r   rw   r   r  r  r  r   r  r  outputsr=   slice_indicesr  r  s                   r0   r@   Lfm2ForCausalLM.forward  s    @ ,0:: 	,
)%+')	,
 	,
  118B>SV8W8W~ot4]kmA}a,?@A%%pVt{{OeOepiopD%#33!//))
 	
r2   )r  rg  ry  )	NNNNNNNNr   )rH   rI   rJ   rK   _tied_weights_keys_tp_plan_pp_planr&   r  r  r   r   r   r(   r   r   r	   r   r  r   r   r   r   r   r@   rL   rM   rN   s   @r0   r  r    s:   *+=)H_-z:;H  151537+/59-1$(59348
E,,-8
 !.8
 u//0	8

 "%8
   1 128
 ))*8
 D>8
 !!1!128
 c5<</08
 +,8
 
 8
  8
r2   r  )r  ru  rf  )Nr   )r"  )Dtypingr   r   r   r   r(   torch.nn.functionalr   r  r   cache_utilsr	   r
   
generationr   integrationsr   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   utils.import_utilsr   configuration_lfm2r   causal_conv1dr   r   Moduler"   rP   r   r   r   r   r   r   r   rn   r  r  r,  kernel_modulesallrV  r.  rZ  rf  ru  r  __all__r   r2   r0   <module>r     s  ( 2 1     . ) 7 / 9 O K F & I I / < * DD-7** Y'J")) J (J(<")) <D8bii 8(Q/, Q/h(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%49$BII 9$x #$89^, h`BII h`V,1 ,^ /  " Q
# Q
 Q
h N
)? N
 N
b Br2   