
    <h                     d   S r SSKrSSKJr  SSKJr  SSKJrJr  SSK	r	SSK
r	SSK	Jr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJr  SSKJr  \R6                  " \5      rSqS r " S S\	R@                  RB                  5      r"S(S jr#S(S jr$ " S S\RJ                  5      r& " S S\RJ                  5      r' " S S\5      r(\ " S S\5      5       r)\\" SS9 " S S\5      5       5       r*\\" SS9 " S  S!\5      5       5       r+\ " S" S#\)5      5       r,\" S$S9 " S% S&\)\5      5       r-/ S'Qr.g))zPyTorch RWKV model.    N)	dataclass)Path)OptionalUnion)nn   )GenerationMixin)GradientCheckpointingLayer)PreTrainedModel)ModelOutputauto_docstringis_bitsandbytes_availableis_ninja_availableis_torch_cuda_availablelogging   )
RwkvConfigc                    SSK Jn  [        [        5      R	                  5       R
                  R
                  R
                  S-  S-  nS Vs/ sH  o2U-  PM	     nn[        b  [        R                  U :X  a  g [        R                  SU  S35        SS	S
SSSSU  3/nU" SU  3U[        R                  " 5       [        R                  :H  US9qU [        l        g s  snf )Nr   )loadkernelsrwkv)z
wkv_op.cppzwkv_cuda.cuzwkv_cuda_bf16.cuz2Loading CUDA kernel for RWKV at context length of .z
-res-usagez--maxrregcount 60z--use_fast_mathz-O3z-Xptxas -O3z--extra-device-vectorizationz-DTmax=wkv_)namesourcesverboseextra_cuda_cflags)torch.utils.cpp_extensionr   r   __file__resolveparentrwkv_cuda_kernelmax_seq_lengthloggerinfor   get_verbosityDEBUG)context_lengthload_kernelkernel_folderfcuda_kernel_filesflagss         ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/rwkv/modeling_rwkv.pyload_wkv_cuda_kernelr/   /   s    = N**,33::AAIMPVVM4ef4eq*4ef #(8(G(G>(Y
KKD^DTTUVW 	&
.!"E #N#$!&&(GMM9	 '5#/ gs   Cc                   <    \ rS rSr\SS j5       r\SS j5       rSrg)RwkvLinearAttentionO   Nc                    UR                  5       u  pxn	U[        R                  :  a   [        SU S[        R                   S35      eXy-  [	        U	S5      -  S:w  a  [        SU SU	 S[	        U	S5       S	35      eUR
                  U l        UR                  R                  S
:w  dN  UR                  R                  S
:w  d4  UR                  R                  S
:w  d  UR                  R                  S
:w  a  [        S5      e[        R                  " UR                  5       R                  5       5      * nUR
                  [        R                  :X  a0  UR                  5       nUR                  5       nUR                  5       nUR                  5       nUR                  5       nUR                  5       n[        R                  " U[        R                  S9n
U(       d  Ub  UcT  [        R                   " UU	S[        R"                  UR                  [        R                  S9nUS S 2S S 2S4==   S-  ss'   OA[        R$                  " U Vs/ sH  oR'                  S5      PM     snSS9R                  5       nUR
                  [        R(                  :X  a  [        R*                  nO[        R,                  nU" XX4X5        OHUR
                  [        R(                  :X  a  [        R.                  O[        R0                  nU" XX4U
5        U R3                  XX4U
5        Ub3  [        R4                  " USSS9 Vs/ sH  oR7                  S5      PM     nnU
R9                  U R                  5      U4$ s  snf s  snf )NzCannot process a batch with z+ tokens at the same time, use a maximum of z with this model.    r   zThe product of batch size (z) and hidden size (z") needs to be a round multiple of r   cudazUCalling the CUDA kernel for wkv attention requires all tensors to be on CUDA devices.memory_formatr   )dtypedevicer7      籡*G)dim)sizer"   r#   
ValueErrorminr8   input_dtyper9   typetorchexpfloat
contiguousfloat16
empty_likecontiguous_formatzerosfloat32cat	unsqueezebfloat16forward_with_state_bf16forward_with_stateforward_bf16forwardsave_for_backwardchunksqueezeto)ctx
time_decay
time_firstkeyvaluestatereturn_state
batch_sizeseq_lenhidden_sizeoutputsforward_funcs                r.   rQ   RwkvLinearAttention.forwardP   s   +.88:(
[%444.wi7b#2233DF  #c+r&::a?-j\9L[M Z";34A7 
 )) ""f,  %%/zz&(||  F*tuuii
 0 0 2 = = ?@@
99%#))+J))+CKKME**,
nn  "!!#U5L5LM5,}--::"'"9"9 aAg$&		5"A5a;;q>5"AqITTVyyENN*/GG/BBVK<?II<W+88]m]u]uLVDjc&I+0;;uaQ+GH+GaYYq\+GEHyy)500 #B Is   	M.6M3c                 .   U R                   nU R                  u  pEpgn[        R                  " U[        R                  U[        R
                  :X  a  [        R
                  O[        R                  S9n	[        R                  " U[        R                  S9n
[        R                  " U[        R                  S9n[        R                  " U[        R                  S9nU[        R                  :X  a  UR                  5       nU[        R
                  :X  a  [        R                  O[        R                  nU" UUUUUUR                  5       U	U
UU5
        U	R                  U5      U
R                  U5      UR                  U5      UR                  U5      S S 4$ )N)r7   r8   r6   )r@   saved_tensorsrB   rG   rH   rM   rJ   rF   rD   r"   backward_bf16backwardrE   rU   )rV   g_outputg_stater@   rW   rX   rY   rZ   r`   g_time_decayg_time_firstg_keyg_valuebackward_funcs                 r.   rg   RwkvLinearAttention.backward   sF    oo585F5F2
F''11$/5>>$A%..u}}

 ''
%BYBYZ  E4K4KL""58O8OP%--'~~'H:E:W(66]m]v]v!	
 OOK(OOK(HH[!JJ{#
 	
     NFN)__name__
__module____qualname____firstlineno__staticmethodrQ   rg   __static_attributes__rq   rp   r.   r1   r1   O   s)    <1 <1| %
 %
rp   r1   c                    UR                  5       u  pgn[        R                  " U5      nUc  [        R                  " US S 2S4   [        R                  S9n	[        R                  " US S 2S4   [        R                  S9n
[        R                  " US S 2S4   [        R                  S9S-
  nOUu  pn[        R                  " U 5      * n [        U5       GH
  nUS S 2U4   R                  5       nUS S 2U4   n[        R                  " XU-   5      n[        R                  " X-
  5      n[        R                  " X-   U-
  5      nUU	-  UU-  -   nUU
-  U-   nUU-  R                  UR                  5      US S 2U4'   [        R                  " X-   U5      n[        R                  " X-   U-
  5      n[        R                  " UU-
  5      nUU	-  UU-  -   n	UU
-  U-   n
UnGM     U(       d  Ub  XU/nX4$ )Nr   )r8   r;   )
r=   rB   
zeros_likerJ   rC   rangerD   maximumrU   r8   )rW   rX   rY   rZ   r[   r\   _
seq_lengthr`   	num_state	den_state	max_statecurrent_indexcurrent_keycurrent_valuemax_for_outpute1e2	numeratordenominatormax_for_states                        r.   rwkv_linear_attention_cpur      s    xxzA1c"F}$$SAYemmD	$$SAYemmD	$$SAYemmDtK	*/'	i
 ))J''Jz*!]*+113a./ y
2JKYYy12YY{/.@ANR-%77	9nr)$-$;#?#?#Mq-  i&<kJYYy-=>YY{]23NR-%77	NR'	!	% +( u(y1=rp   c           	          [        S XX#4 5       5      nUR                  S5      S:H  n[        b  U(       d  U(       a  [        XX#XES9$ [        R                  XX#XE5      $ )Nc              3   P   #    U H  oR                   R                  S :g  v   M     g7f)r5   N)r9   rA   ).0ts     r.   	<genexpr>(rwkv_linear_attention.<locals>.<genexpr>   s     X3Wa((--6)3Ws   $&r   r[   r\   )anyr=   r"   r   r1   apply)rW   rX   rY   rZ   r[   r\   no_cuda	one_tokens           r.   rwkv_linear_attentionr      sZ    XJC3WXXG q I7i(SXtt"((Uaarp   c                   @   ^  \ rS rSrSU 4S jjrSS jrSS jrSrU =r$ )	RwkvSelfAttention   c                   > [         TU ]  5         Xl        [        S L=(       a    [        R                  UR
                  :H  n[        5       (       a,  [        5       (       a  U(       d   [        UR
                  5        X l        UR                  nUR                  b  UR                  OUnXPl        [        R                   " ["        R$                  " U5      5      U l        [        R                   " ["        R$                  " U5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R                   " ["        R$                  " SSU5      5      U l        [        R0                  " S5      U l        [        R4                  " XESS9U l        [        R4                  " XESS9U l        [        R4                  " XESS9U l        [        R4                  " XTSS9U l        g ! [         a    [        R                  S5         GNf = f)Nz9Could not load the custom CUDA kernel for RWKV attention.r   r   r   r   Fbias)super__init__configr"   r#   r(   r   r   r/   	Exceptionr$   r%   layer_idr_   attention_hidden_sizer   	ParameterrB   emptyrW   rX   time_mix_keytime_mix_valuetime_mix_receptance	ZeroPad2d
time_shiftLinearrY   rZ   
receptancer`   )selfr   r   kernel_loadedr_   r   	__class__s         r.   r   RwkvSelfAttention.__init__   s   (4q9I9X9X\b\q\q9q$;$=$=mY$V%:%:; !((,2,H,H,TF((Ze 	 &;",,u{{3H'IJ,,u{{3H'IJLLQ;)GH ll5;;q![+IJ#%<<Aq+0N#O ,,}599[eLYY{N
))KUSii 5O)  YWXYs   (H% %IIc                 p   UR                  S5      S:X  a  Ub  US   S S 2S S 2U R                  4   nO4U R                  U5      nUb   US   S S 2S S 2U R                  4   US S 2S4'   XR                  -  USU R                  -
  -  -   nXR                  -  USU R                  -
  -  -   nXR
                  -  USU R
                  -
  -  -   nU R                  U5      nU R                  U5      n[        R                  " U R                  U5      5      nUb   US S 2S4   US   S S 2S S 2U R                  4'   XdXR4$ Nr   r   r   )r=   r   r   r   r   r   rY   rZ   rB   sigmoidr   )r   hiddenr[   shiftedrY   rZ   r   s          r.   extract_key_value#RwkvSelfAttention.extract_key_value  s2   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1(((7a$:K:K6K+LL,,,w!d>Q>Q:Q/RR666AH`H`D`9aa
hhsm

5!]]4??:#>?
,21b5ME!HQ4==(),,rp   c           	        ^  T R                  XS9u  pEpbUb  [        U 4S jUSS   5       5      OS n[        T R                  T R                  UUUUS9u  pUbT  US   US   S S 2S S 2T R
                  4'   US   US   S S 2S S 2T R
                  4'   US   US   S S 2S S 2T R
                  4'   T R                  XH-  5      U4$ )	Nr[   c              3   L   >#    U H  oS S 2S S 2TR                   4   v   M     g 7frs   r   )r   ra   r   s     r.   r   ,RwkvSelfAttention.forward.<locals>.<genexpr>$  s     FIqaDMM12Is   !$r:   r   r   r   r      )r   tupler   rW   rX   r   r`   )	r   r   r[   	use_cacher   rY   rZ   layer_stater   s	   `        r.   rQ   RwkvSelfAttention.forward"  s    (,(>(>v(>(S%
JOJ[eFE!"IFFae1OOOO"
 ",7NE!HQ4==(),7NE!HQ4==(),7NE!HQ4==(){{:,-u44rp   )r   r   rY   r   r`   r   rW   rX   r   r   r   r   rZ   r   rs   rr   )	rt   ru   rv   rw   r   r   rQ   ry   __classcell__r   s   @r.   r   r      s    P<-&5 5rp   r   c                   6   ^  \ rS rSrSU 4S jjrSS jrSrU =r$ )RwkvFeedForwardi6  c                 8  > [         TU ]  5         Xl        X l        UR                  nUR
                  b  UR
                  OSUR                  -  n[        R                  " S5      U l        [        R                  " [        R                  " SSU5      5      U l        [        R                  " [        R                  " SSU5      5      U l        [        R                  " X4SS9U l        [        R                  " X3SS9U l        [        R                  " XCSS9U l        g )Nr   r   r   Fr   )r   r   r   r   r_   intermediate_sizer   r   r   r   rB   r   r   r   r   rY   r   rZ   )r   r   r   r_   r   r   s        r.   r   RwkvFeedForward.__init__7  s     (((.(@(@(LF$$RSV\VhVhRh 	 ,,}5LLQ;)GH#%<<Aq+0N#O 99[%H))K5IYY0EJ
rp   c                    UR                  S5      S:X  a  Ub  US   S S 2S S 2U R                  4   nO4U R                  U5      nUb   US   S S 2S S 2U R                  4   US S 2S4'   XR                  -  USU R                  -
  -  -   nXR                  -  USU R                  -
  -  -   n[
        R                  " [
        R                  " U R                  U5      5      5      nU R                  U5      n[
        R                  " U R                  U5      5      nUb   US S 2S4   US   S S 2S S 2U R                  4'   XV-  U4$ r   )r=   r   r   r   r   rB   squarerelurY   rZ   r   r   )r   r   r[   r   rY   r   rZ   s          r.   rQ   RwkvFeedForward.forwardH  s#   ;;q>Q5#4Ahq!T]]23Goof-G  %aAt}})< =1(((7a$:K:K6K+LL666AH`H`D`9aa
ll5::dhhsm45

3]]4??:#>?
,21b5ME!HQ4==()!5((rp   )r   rY   r   r   r   r   r   rZ   r   rs   rt   ru   rv   rw   r   rQ   ry   r   r   s   @r.   r   r   6  s    K") )rp   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )	RwkvBlocki\  c                   > [         TU ]  5         Xl        X l        US:X  a.  [        R
                  " UR                  UR                  S9U l        [        R
                  " UR                  UR                  S9U l	        [        R
                  " UR                  UR                  S9U l
        [        X5      U l        [        X5      U l        g )Nr   )eps)r   r   r   r   r   	LayerNormr_   layer_norm_epsilonpre_lnln1ln2r   	attentionr   feed_forward)r   r   r   r   s      r.   r   RwkvBlock.__init__]  s     q=,,v'9'9v?X?XYDK<< 2 28Q8QR<< 2 28Q8QR*6<+F=rp   c                    U R                   S:X  a  U R                  U5      nU R                  U R                  U5      X#S9u  pRX-   nU R	                  U R                  U5      US9u  pbX-   nX4nU(       a  Xu4-  nU$ US-  nU$ )Nr   )r[   r   r   rs   )r   r   r   r   r   r   )r   r   r[   r   output_attentionsr   r   outputss           r.   rQ   RwkvBlock.forwardk  s    ==A[[(F>>$((6*:%>]	#"//0@/N&/|#G  wGrp   )r   r   r   r   r   r   r   )NFFr   r   s   @r.   r   r   \  s    > rp   r   c                   \    \ rS rSr% \\S'   SrS/rSS/rSr	Sr
S\R                  4S	 jrS
rg)RwkvPreTrainedModeli~  r   r   r   rW   rX   Tmodulec           	      
   [        U[        5      (       GaU  UR                  nUR                  R                  nUR                  R
                  nUR                  nX#S-
  -  nSX#-  -
  n[        R                  " [        U5       Vs/ sH  oU-  PM	     snUR                  R                  UR                  R                  S9n	U	SSSS24   n	[        U5       V
s/ sH  n
SSXS-
  -  SSU-  -   -  -  -   PM     nn
[        R                  " XR                  R                  UR                  R                  S9n[        R                  " [        U5       Vs/ sH  oS-   S	-  S-
  PM     snUR                  R                  UR                  R                  S9S
-  nXR                  l        [        R                   " UR                  ["        R$                  " S5      -  U-   5      UR                  l        [        R&                  " X5      UR                  l        [        R&                  " X5      SU-  -   UR(                  l        [        R&                  " U	S
U-  5      UR*                  l        g[        U[,        5      (       a  UR                  nUR                  R                  nUR                  R
                  nSX#-  -
  n[        R                  " [        U5       Vs/ sH  oU-  PM	     snUR                  R                  UR                  R                  S9n	U	SSSS24   n	[        R&                  " X5      UR                  l        [        R&                  " X5      UR*                  l        g[        U[.        R0                  5      (       a  UR2                  R                  R4                  nSnSnUR6                  b$  UR6                  R                  R9                  5         US   US   :  a  ["        R:                  " US   US   -  5      nUS   U R                  R<                  :X  a  US   U R                  R
                  :X  a  S
nX-  n[.        R>                  RA                  UR2                  US9  g[        U[.        RB                  5      (       ar  UR2                  R                  R4                  nS["        R:                  " [E        US   US   5      5      -  n[.        R>                  RA                  UR2                  US9  g[        U[.        RF                  5      (       aJ  UR2                  R                  RI                  S5        UR6                  R                  R9                  5         ggs  snf s  sn
f s  snf s  snf )zInitialize the weights.r   g      ?r8   r9   N   gffffff?g?r   g      ?g333333?r   )gaing-C6?)%
isinstancer   r   r   num_hidden_layersr_   r   rB   tensorr|   r   r8   r9   rW   rX   data	ones_likemathlogpowr   r   r   r   r   weightshaper   zero_sqrt
vocab_sizeinitorthogonal_	Embeddingmaxr   fill_)r   r   r   r   r_   r   ratio_0_to_1ratio_1_to_almost0itime_weighthdecay_speedzigzagr   r   scales                   r.   _init_weights!RwkvPreTrainedModel._init_weights  su   f/00H & ? ? --33K$*$@$@!#1'<=L!$(D!E,,*/*<=*<Q[*<=))//**11K
 &dD!m4K 455A Q!q89sS<EW?WXXX5    ,,{:K:K:Q:QZ`ZkZkZrZrsK.34I.JK.J!eq[1_.JK ++11!,,33
   &1"%*__V5F5FRU5VY_5_%`F"',yy'QF$).;)SVY\hVh)hF!!&.3iiSK]E].^F&&+00H & ? ? --33K!$(D!E,,*/*<=*<Q[*<=))//**11K
 &dD!m4K',yy'QF$.3ii.XF&&+		**MM&&,,EDE{{&  &&(Qx%("yyqE!H!45Qx4;;111eAh$++BYBY6YMDGGD9--MM&&,,E$))Ca%($;<<DGGD9--MM$$S)KK""$ .w > L* >s   U1U6U;V rq   N)rt   ru   rv   rw   r   __annotations__base_model_prefix_no_split_modules_keep_in_fp32_modulessupports_gradient_checkpointing_is_statefulr   Moduler  ry   rq   rp   r.   r   r   ~  s>    $)<8&*#LI%BII I%rp   r   z+
    Class for the RWKV model outputs.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\\R                        \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S'   S	rg)

RwkvOutputi  z
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.
Nlast_hidden_stater[   .hidden_states
attentionsrq   )rt   ru   rv   rw   __doc__r  r   rB   FloatTensorr  r[   listr  r   r  ry   rq   rp   r.   r  r    sw     6:x 1 129/3E8D**+,3=AM8E%"3"3S"89:A:>Ju00#567>rp   r  zK
    Base class for causal language model (or autoregressive) outputs.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                  S4      \	S'   Sr\\\R                  S4      \	S	'   S
rg)RwkvCausalLMOutputi  aP  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
state (list of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`):
    The state of the model at the last time step. Can be used in a forward method with the next `input_ids` to
    avoid providing the old `input_ids`.
Nlosslogitsr[   .r  r  rq   )rt   ru   rv   rw   r  r  r   rB   r  r  r  r[   r  r  r   r  ry   rq   rp   r.   r  r    s     )-D(5$$
%,*.FHU&&'./3E8D**+,3=AM8E%"3"3S"89:A:>Ju00#567>rp   r  c                     ^  \ rS rSrU 4S jrS rS r\        SS\\	R                     S\\	R                     S\\	R                     S\\\	R                        S	\\   S
\\   S\\   S\\   S\\\4   4S jj5       rS rS rSrU =r$ )	RwkvModeli  c           
        > [         TU ]  U5        [        R                  " UR                  UR
                  5      U l        [        R                  " [        UR                  5       Vs/ sH  n[        XS9PM     sn5      U l        [        R                  " UR
                  5      U l        SU l        SU l        U R!                  5         g s  snf )Nr   F)r   r   r   r   r   r_   
embeddings
ModuleListr|   r   r   blocksr   ln_outlayers_are_rescaledgradient_checkpointing	post_init)r   r   idxr   s      r.   r   RwkvModel.__init__   s     ,,v'8'8&:L:LMmmPUV\VnVnPo$pPoYv%DPo$pqll6#5#56#( &+# 	 %qs   (C
c                     U R                   $ rs   r  r   s    r.   get_input_embeddingsRwkvModel.get_input_embeddings  s    rp   c                     Xl         g rs   r'  r   new_embeddingss     r.   set_input_embeddingsRwkvModel.set_input_embeddings  s    (rp   	input_idsattention_maskinputs_embedsr[   r   r   output_hidden_statesreturn_dictreturnc	           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UO(U R                  (       d  U R                   R                  OSnUb  UOU R                   R
                  nUb  [        R                  S5        U R                  U R                  :X  a  U R                  5         Ub  Ub  [        S5      eUc  Uc  [        S5      eUc  U R                  U5      nU(       a  Uc  UR                  S5      U R                   R                  U R                   R                  4n	[        S5       V
s/ sHC  n
[         R"                  " XS::  a  UR$                  O[         R&                  UR(                  S	.6PME     nn
US
==   S-  ss'   U R*                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUnU(       a  SOSnU(       a  SOSn[-        U R.                  5       Hz  u  pU" XXVS9u  pnU R                  (       a?  U R                   R0                  S:  a%  US-   U R                   R0                  -  S:X  a  US-  nU(       a  X4-   nU(       d  Mt  UU4-   nM|     U R3                  U5      nU(       a  X4-   nU(       d  [5        S XX4 5       5      $ [7        UUUUS9$ s  sn
f )a(  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
use_cache (`bool`, *optional*):
    If set to `True`, the last state is returned and can be used to quickly generate the next logits.
NFz<`attention_mask` was passed, but it is unused in this model.zDYou cannot specify both input_ids and inputs_embeds at the same timez5You have to specify either input_ids or inputs_embedsr      r   r   r   gꌠ9Y>)FzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...rq   )r[   r   r   r:   c              3   ,   #    U H  oc  M  Uv   M     g 7frs   rq   )r   xs     r.   r   $RwkvModel.forward.<locals>.<genexpr>u  s     t$bq$bs   	)r  r[   r  r  )r   r   r3  trainingr   use_return_dictr$   warning_oncer!  _rescale_layersr>   r  r=   r_   r   r|   rB   rI   r8   rJ   r9   r"  	enumerater  rescale_everyr   r   r  )r   r0  r1  r2  r[   r   r   r3  r4  r   r   r  all_self_attentionsall_hidden_statesr$  blockr  s                    r.   rQ   RwkvModel.forward  s   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IZ^ZgZgT[[=R=Rmr	%0%<k$++B]B]% ^_==D444  " ]%>cdd=#8TUU  OOI6M"''*DKK,C,CT[[EbEbcE
 q	 "A a-"5"5U]][h[o[o "	   !HH&&4==##p "	%$5b4"6BD#DKK0JC/4i0,M*
 ((KK--11W 9 99Q> - 1#$58H$H!  &9ZM&I#! 1$ M2 14D Dt];L$bttt++*	
 	
[s   A	Kc           	         U R                   U R                  (       + :X  a  g U R                  R                  S:  Ga  [        R
                  " 5          [        U R                  5       GH  u  pU R                  (       a  UR                  R                  R                  R                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R                  S[        XR                  R                  -  5      -  5        M  [        UR                  R                  R                  S5      (       a  UR                  R                  R                  R                   R#                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R                   R#                  S[        XR                  R                  -  5      -  5        GM  [        UR                  R                  R                  S5      (       aO  U R%                  UR                  R                  U5        U R%                  UR                  R                  U5        GM!  UR                  R                  R                  R#                  S[        XR                  R                  -  5      -  5        UR                  R                  R                  R#                  S[        XR                  R                  -  5      -  5        GM     S S S 5        U R                  (       + U l         g ! , (       d  f       N%= f)Nr   r:   SCBquant_state)r!  r;  r   r@  rB   no_gradr?  r  r   r`   r   mul_intr   rZ   hasattrrF  div_ _bnb_4bit_dequantize_and_rescale)r   block_idrC  s      r.   r>  RwkvModel._rescale_layers~  s?   ##DMM(9:;;$$q('0'=OH}}..55::1HP[P[PiPiDi@j;jk**0077<<Q#hR]R]RkRkFkBl=lm #5??#9#9#@#@%HH!OO2299==BB1HXcXcXqXqLqHrCrs!..44;;??DDQ#hZeZeZsZsNsJtEtu$U__%;%;%B%BMRR AA%//BXBXZbc AA%BTBTBZBZ\de!OO2299>>qCT_T_TmTmHmDn?no!..44;;@@c(VaVaVoVoJoFpApq (> !" (,}}#4 # !s   KM
Mc                    [        5       (       d  [        S5      eSSKnUR                  R	                  UR
                  R                  UR
                  R                  5      nUR                  S[        X R                  R                  -  5      -  5        UR                  R                  UR                  S5      SS9R                  UR                  5      n[!        USU5        g)	z
Perform the dequantization and rescaling of the weights of a given layer. After that operation the layer will
be quantized again.
z/Please install bitsandbytes to use this method.r   Nr:   cpuF)requires_gradr   )r   ImportErrorbitsandbytes
functionaldequantize_4bitr   r   rG  rL  rJ  r   r@  r   
Params4bitrU   r9   setattr)r   target_layerrN  bnbdequant_weightsquant_weights         r.   rM  *RwkvModel._bnb_4bit_dequantize_and_rescale  s    
 )**OPP"..889L9L9Q9QS_SfSfSrSrsQ#h++2K2K&K"LLM vv((););E)BRW(X[[\k\r\rsh5rp   )r  r  r"  r!  r   )NNNNNNNN)rt   ru   rv   rw   r   r)  r.  r   r   rB   
LongTensorr  r  boolr   r   r  rQ   r>  rM  ry   r   r   s   @r.   r  r    s    )  15595937$(,0/3&*g
E,,-g
 !!1!12g
   1 12	g

 U../0g
 D>g
 $D>g
 'tng
 d^g
 
uj 	!g
 g
R506 6rp   r  z
    The RWKV Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                   @  ^  \ rS rSrS/rU 4S jrS rS rSS jr\	         SS\
\R                     S\
\R                     S	\
\R                     S
\
\\R                        S\
\R                     S\
\   S\
\   S\
\   S\
\   S\\\4   4S jj5       rSrU =r$ )RwkvForCausalLMi  zhead.weightc                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r   r   r  r   r   r   r_   r   headr#  )r   r   r   s     r.   r   RwkvForCausalLM.__init__  sH     f%	IIf00&2C2C%P	 	rp   c                     U R                   $ rs   rc  r(  s    r.   get_output_embeddings%RwkvForCausalLM.get_output_embeddings  s    yyrp   c                     Xl         g rs   rf  r,  s     r.   set_output_embeddings%RwkvForCausalLM.set_output_embeddings  s    "	rp   c                 j    Ub  US S 2S4   R                  S5      nUb  Uc  SU0nOSU0nX&S'   XFS'   U$ )Nr   r2  r0  r[   r   )rL   )r   r0  r[   r2  r   kwargsmodel_inputss          r.   prepare_inputs_for_generation-RwkvForCausalLM.prepare_inputs_for_generation  sY     !!R%(2226I $+];L'3L %W$-[!rp   r0  r1  r2  r[   labelsr   r   r3  r4  r5  c
           
      x   U	b  U	OU R                   R                  n	U R                  UUUUUUU	S9nUS   nU R                  U5      nSnUb*  U R                  " UU4SU R                   R
                  0U
D6nU	(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  S9$ )a  
input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
    `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
    sequence tokens in the vocabulary.

    If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
    `input_ids`.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
state (tuple of five `torch.FloatTensor` of shape `(batch_size, hidden_size, num_hidden_layers)`, *optional*):
    If passed along, the model uses the previous state in all the blocks (which will give the output for the
    `input_ids` provided as if the model add `state_input_ids + input_ids` as context).
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
    `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
    are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
use_cache (`bool`, *optional*):
    If set to `True`, the last state is returned and can be used to quickly generate the next logits.
N)r2  r[   r   r   r3  r4  r   r   r   )r  r  r[   r  r  )
r   r<  r   rc  loss_functionr   r  r[   r  r  )r   r0  r1  r2  r[   rq  r   r   r3  r4  rm  rwkv_outputsr  r  r  r`   s                   r.   rQ   RwkvForCausalLM.forward  s    J &1%<k$++B]B]yy'/!5# ! 
 %Q=)%%  ;;11 	D Yab!11F)-)9TGf$EvE!$$&44#..
 	
rp   )rc  r   )NNN)	NNNNNNNNN)rt   ru   rv   rw   _tied_weights_keysr   rg  rj  ro  r   r   rB   r^  r  r  r_  r   r   r  rQ   ry   r   r   s   @r.   ra  ra    s    (#"  15595937-1$(,0/3&*F
E,,-F
 !!1!12F
   1 12	F

 U../0F
 ))*F
 D>F
 $D>F
 'tnF
 d^F
 
u((	)F
 F
rp   ra  )ra  r  r   rr   )/r  r   dataclassesr   pathlibr   typingr   r   rB   torch.utils.checkpointr   
generationr	   modeling_layersr
   modeling_utilsr   utilsr   r   r   r   r   r   configuration_rwkvr   
get_loggerrt   r$   r"   r/   autogradFunctionr1   r   r   r  r   r   r   r   r  r  r  ra  __all__rq   rp   r.   <module>r     s{      !  "    ) 9 -  + 
		H	%  5@g
%..11 g
T)XbC5		 C5L#)bii #)L* D Q%/ Q% Q%h 

? 
? 
? 
? ? ?$ j6# j6 j6Z i
)? i
i
X Brp   