
    <h                     j   S SK r S SKJrJrJr  S SKrS SKJs  Jr	  S SKJr  SSK
Jr  SSKJrJr  SSKJr  SSKJr  SS	KJr  SS
KJrJr  SSKJrJr  SSKJrJr  SSKJrJ r   SSK!J"r"J#r#  SSK$J%r%  SSK&J'r'J(r(J)r)J*r*  SSK+J,r,J-r-  SSK.J/r/  \*" 5       (       a  S SK0J1r1  \" S5       " S S\Rd                  5      5       r3 " S S\Rd                  5      r4S r5SDS jr6S\Rn                  S\8S\Rn                  4S jr9 SES \Rd                  S!\Rn                  S"\Rn                  S#\Rn                  S$\\Rn                     S%\:S&\:S'\%\'   4S( jjr;   SFS \Rd                  S!\Rn                  S"\Rn                  S#\Rn                  S$\\Rn                  S)4   S%\\:   S*\\:   S+\\Rn                     S\<\Rn                  \Rn                  4   4S, jjr=\"" 5       r>\=\>S-'    " S. S/\Rd                  5      r? " S0 S1\Rd                  5      r@ " S2 S3\Rd                  5      rA " S4 S5\5      rB\( " S6 S7\#5      5       rC\( " S8 S9\C5      5       rD    SGS:\\Rn                  \<\Rn                     S4   S;\\8   S<\\8   S=\8S$\\Rn                     S\\Rn                  \84   4S> jjrE\( " S? S@\C\5      5       rF " SA SB\\C5      rG/ SCQrHg)H    N)CallableOptionalUnion)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)compile_friendly_flex_attention)create_causal_mask!create_sliding_window_causal_mask) GenericForSequenceClassificationGradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)ROPE_INIT_FUNCTIONSdynamic_rope_update)AttentionInterfacePreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_available)OutputRecordercheck_model_inputs   )
DogeConfig)	BlockMaskRMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )DogeRMSNorm3   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z*
DogeRMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/doge/modeling_doge.pyr(   DogeRMSNorm.__init__5   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor*   float32powmeanrsqrtr-   r,   )r.   hidden_statesinput_dtypevariances       r2   forwardDogeRMSNorm.forward=   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r4   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler,   shaper-   r.   s    r2   
extra_reprDogeRMSNorm.extra_reprD   s*    ))*+6$2G2G1HIIr4   )r-   r,   )gư>)	__name__
__module____qualname____firstlineno__r(   rB   rH   __static_attributes____classcell__r1   s   @r2   r$   r$   3   s    $;J Jr4   r$   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )DogeRotaryEmbeddingH   configc                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r'   r(   hasattr
isinstancerV   dictgetrW   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrT   r   rope_init_fnattention_scalingregister_bufferrZ   original_inv_freq)r.   rT   devicerZ   r1   s       r2   r(   DogeRotaryEmbedding.__init__I   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r4   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r7   r   mpscpuF)device_typeenabledr6   dim)r9   )rZ   floatexpandrF   r:   rg   r]   rX   strr*   autocast	transposecatcosrd   sinr9   )
r.   xposition_idsinv_freq_expandedposition_ids_expandedrl   freqsembrv   rw   s
             r2   rB   DogeRotaryEmbedding.forwardZ   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)rd   rT   ra   rf   rb   rc   rW   N)rJ   rK   rL   rM   r    r(   r*   no_gradr   rB   rN   rO   rP   s   @r2   rR   rR   H   s6    /z / /" ]]_<  <r4   rR   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr7   r6   rn   )rF   r*   ru   )rx   x1x2s      r2   rotate_halfr   j   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r4   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkrv   rw   ry   unsqueeze_dimq_embedk_embeds           r2   apply_rotary_pos_embr   q   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr4   r?   n_repreturnc                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r   N)rF   rq   reshape)r?   r   batchnum_key_value_headsslenhead_dims         r2   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr4   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr6   r   r7   )ro   r9   ptrainingr   )r   num_key_value_groupsr*   matmulrt   rF   r   
functionalsoftmaxr;   r:   r9   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r2   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r4   r!   softcap	head_maskc                 6  ^^^ S n	S m[        U[        5      (       a  Un	OUmTb  TS S 2S S 2S S 2S UR                  S   24   mUUU4S jn
[        UUUU
U	SUSS9u  pUR	                  UR
                  5      nUR                  SS5      R                  5       nX4$ )Nr   c                    > Tb  T[         R                  " U T-  5      -  n Tb  U TU   U   U   U   -   n Tb  U TU   U   S   S   -   n U $ )Nr   )r*   tanh)score	batch_idxhead_idxq_idxkv_idxr   r   r   s        r2   	score_mod)flex_attention_forward.<locals>.score_mod   sm    ejj99E"K	28<UCFKKE Ii0:1=a@@Er4   T)r   
block_mask
enable_gqascale
return_lser   r6   )r]   r!   rF   r   r:   r9   rt   r   )r   r   r   r   r   r   r   r   r   r   r   r   attention_weightsr   s         ``     @r2   flex_attention_forwardr      s     JK.),,#
$!!Q?SYYr]?":; &E &"K *,,U[[9''1-88:K))r4   doge_flex_attentionc                     ^  \ rS rSrSS\S\\   4U 4S jjjr   SS\R                  S\
\R                  \R                  4   S\\R                     S\\   S	\\R                     S
\
\R                  \\R                     \\
\R                        4   4S jjr  SS\R                  S\R                  S\S\\R                     4S jjrSrU =r$ )DogeAttention   rT   	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        UR                  U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R&                  " [(        R*                  " UR                  5      5      U l        [        R                  " UR                  U R                  -  UR                  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        [3        U R                  UR4                  S9U l        [3        U R                  UR4                  S9U l        g )Nr   g      ࿩biasr0   )r'   r(   rT   r   getattrr/   num_attention_headsr   r   r   r   attention_dropoutkeep_window_sizer   Linearattention_biasq_projk_projv_projr)   r*   zerosAdt_projo_projr$   rms_norm_epsq_normk_normr.   rT   r   r1   s      r2   r(   DogeAttention.__init__   s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9 & 7 7ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ekk&*D*DEFyy&&68R8RY_YnYn
 ii&&68J8JQWQfQf
 "$--V5H5HI!$--V5H5HIr4   r?   position_embeddingsr   past_key_valuecache_positionr   c                    UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R	                  U5      5      R                  SS5      n	U R                  U R                  U5      R	                  U5      5      R                  SS5      n
U R                  U5      R	                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  pU R                  UR                  SS5      R                  UR                   S   UR                   S   S5      5      n[        R                  " U R                   ["        R$                  " U5      -  5      R                  SS5      nU R'                  UUU R(                  US9n[+        UU R,                  5      n[.        nU R0                  R2                  S:w  a  [4        U R0                  R2                     nU" U U	U
U4UU R6                  (       d  S	OU R8                  U R:                  S
.UD6u  nnUR                  " / UQSP76 R=                  5       nU R?                  U5      nUU4$ )Nr7   r   r6   )rw   rv   r   r   r   )r?   	dt_statesr   r   eager        )r   r   r   ) rF   r   r   r   viewrt   r   r   r   r   updater   r   r   r*   expr   Fsoftplusprepare_dynamic_maskr   r   r   r   rT   _attn_implementationALL_ATTENTION_FUNCTIONSr   r   r   r   r   )r.   r?   r   r   r   r   r   input_shapehidden_shapequery_statesr   r   rv   rw   cache_kwargsr   	attn_maskattention_interfacer   r   s                       r2   rB   DogeAttention.forward  s^    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J LL""1a(001C1CA1FHZHZ[]H^`bc
	 IIdffqzz)'<<=GGBO	--'!22)	 . 
	 i)B)BC	(?;;++w6"9$++:Z:Z"[$7		%

 %#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r4   r   r   c           	         [         R                  " UR                  5      R                  nUR                  nUSS2SS2SSS24   R	                  SSUR
                  S   S5      nUb  [        U[        5      (       d  UR                  [         R                  :X  aB  UR                  n[         R                  " U[         R                  " SUR                  US9U5      nUR                  USS2SS2SS2SUR
                  S   24   S:g  U5      nUR
                  S   U:  ah  [         R                  " XvUR                  S9n[         R                  " XsSSS	S
9R                  n	UR!                  SU	S5      nUR                  US:H  U5      nU$ )a  
The core idea of DMA is to calculate the dynamic attention mask to mask the tokens that should be masked, so as to form sparse attention.

Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

Args:
    hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
    dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
    keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
    attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
Nr7   r   r   )rg   r9   r   r9   rg   TF)ro   largestsorted      ?)r*   finfor9   minrq   rF   r]   r!   boolwheretensorrg   masked_fill
zeros_liketopkindicesscatter)
r.   r?   r   r   r   	min_dtyper9   r   active_masktopk_indicess
             r2   r   "DogeAttention.prepare_dynamic_mask>  se   $ KK 3 3488	##aD!m,33M''*B
	 %j.S.S##uzz1%++!&"ELL^=R=RZ_$`bk" "--nQ1F[	XZH[F[=[.\`a.aclmI??2!11**9)JZJZ[K ::irSW`efnnL%--b,DK!--kS.@)LIr4   )r   r   rT   r   r   r   r   r   r   r   r   r   r   r   r   r   NNN)i   N)rJ   rK   rL   rM   r    r   intr(   r*   TensorrE   r	   
LongTensorrB   r   rN   rO   rP   s   @r2   r   r      s   Jz Jhsm J JD 26*.596)||6) #5<<#=>6) !.	6)
 !6) !!1!126) 
u||Xell3XeELL>Q5RR	S6)x !%15#||# <<# 	#
 !.# #r4   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )DogeMLPid  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nr   )r'   r(   rT   r/   intermediate_sizer   r   mlp_bias	gate_projup_proj	down_projr   
hidden_actact_fnr.   rT   r1   s     r2   r(   DogeMLP.__init__e  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r4   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )r  r  r  r  )r.   rx   r  s      r2   rB   DogeMLP.forwardo  s6    NN4;;t~~a/@#ADLLQRO#ST	r4   )r  rT   r  r  r/   r  r  )rJ   rK   rL   rM   r(   rB   rN   rO   rP   s   @r2   r	  r	  d  s    0 r4   r	  c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	DogeCDMoEit  rT   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        UR
                     U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        UR                  U l        UR                  U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  S-  SS9U l        [        R,                  " U R                  U R                  5      U l        [        R,                  " U R                  U R                  5      U l        g )Nr   r6   F)r'   r(   r/   r  r   r  r  num_expertsmathfloorsqrtnum_keysnum_experts_per_toktop_knorm_topk_probr   r   r  r  r  r  router_gate	Embedding
down_embedup_embedr  s     r2   r(   DogeCDMoE.__init__u  s_   !--!'!9!9V../!--

499T-=-=#>?//
$33 4#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRab 99T%5%5t}}q7HuU ,,t'7'79I9IJT%5%5t7G7GHr4   r?   r   c                    UR                   u  p4nU R                  U5      R                  SX4-  S5      nUR                  U R                  SS9u  u  pxu  pUR                  S5      UR                  S5      -   nU	R                  S5      U R                  -  U
R                  S5      -   nUR                  " / UR                   S S QSP76 nUR                  " / UR                   S S QSP76 nUR                  U R                  SS9u  pUR                  SU5      n[        R                  " USS9nU R                  (       a  UUR                  SSS9-  nU R                  U5      nU R                  U5      n[        R                  " UUR                  X4-  SS5      5      R                  X4-  S5      nU R!                  U5      U-  n[        R                  " UR                  X4-  SS5      U5      R                  X4S5      nU R#                  U R!                  U R%                  U5      5      U R'                  U5      -  5      nUU-   nX4$ )Nr6   r7   rn   r   T)ro   r8   r   )rF   r!  r   r   r  r   r  gatherr   r   r   sumr#  r$  r*   r   r  r  r  r  )r.   r?   r   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr#  r$  experts_weightsexperts_statess                        r2   rB   DogeCDMoE.forward  s+   
 (--a ((7<<QrR 8E7I7I$--]_7I7`44y''+h.@.@.DD
))"-=	@S@STV@WW__@j&6&6s&;@R@
!&&C(9(9#2(>CC#-??4::2?#F $$R)9:))F322r42HHO __W-
==),,z=3E3EcmUWYZ3[\aabeboqst++o6Ho&:&:3=!R&PRZ[``adoqrt{{4>>-3P'QTXT`T`anTo'op%6++r4   )r  r#  r  r  r/   r  r   r  r  r!  r  r$  r  )rJ   rK   rL   rM   r    r(   r*   r  rB   rN   rO   rP   s   @r2   r  r  t  s5    Iz I.,||, 
	, ,r4   r  c                     ^  \ rS rSrSS\S\\   4U 4S jjjr     SS\R                  S\
\R                  \R                  4   S\\R                     S\\R                     S	\\
\R                        S
\\   S\\R                     S\\   S\
\R                  \\
\R                  \R                  4      4   4S jjrSrU =r$ )DogeDecoderLayeri  rT   r   c                 (  > [         TU ]  5         UR                  U l        [        UR                  UR
                  S9U l        [        XS9U l        [        R                  " [        R                  " UR                  5      5      U l        [        UR                  UR
                  S9U l        UR                  (       d  [!        U5      O
[#        U5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   )rT   r   )r'   r(   hidden_dropoutr$   r/   r   input_layernormr   	self_attnr   r)   r*   r+   input_residualpost_attention_layernormis_moer	  r  mlppost_attention_residualr   s      r2   r(   DogeDecoderLayer.__init__  s    $33*6+=+=6CVCVW&fJ ll5::f6H6H+IJ(3F4F4FFL_L_(`%*0--76?Yv=N')||EJJv?Q?Q4R'S$r4   r?   r   r   ry   r   	use_cacher   r   r   c                    Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  p[        R                  " XR                  U R
                  S9nU R                  U	-  U-   nUn	U R                  U5      nU R                  U5      n[        R                  " XR                  U R
                  S9nU R                  U	-  U-   nU$ )N)r?   r   r   ry   r   rE  r   r    )
r=  r>  r   r   r<  r   r?  r@  rB  rC  )r.   r?   r   r   ry   r   rE  r   r   residualself_attn_weightss              r2   rB   DogeDecoderLayer.forward  s     !,,];+/>> 	,
' 3)%))	,
 	,
( 		-3F3FQUQ^Q^_++h6F !55mD/		-3F3FQUQ^Q^_44x?-Or4   )r<  r=  r?  rB  r@  rC  r>  r   )NNNFN)rJ   rK   rL   rM   r    r   r  r(   r*   r  rE   r  r   r   r   FloatTensorrB   rN   rO   rP   s   @r2   r:  r:    s   
Tz 
Thsm 
T 
T  26378<$)59"||" #5<<#=>" !.	"
 u//0" !u||!45" D>" !!1!12" +," 
u  (51B1BEDUDU1U+V"WW	X" "r4   r:  c                   r   ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSrSr\" \SS	9\\S
.rU 4S jrSrU =r$ )DogePreTrainedModeli  rT   modelTr:  past_key_valuesFr   )index)r,  r?   
attentionsc                   > [         TU ]  U5        [        U[        5      (       a7  [	        US5      (       a%  UR
                  R                  R                  5         gg[        U[        5      (       an  [	        US5      (       a%  UR                  R                  R                  S5        [	        US5      (       a&  UR                  R                  R                  S5        ggg)zInitialize the weightsr   r?  r   rC  N)r'   _init_weightsr]   r   r\   r   datazero_r:  r?  fill_rC  )r.   r   r1   s     r2   rS  !DogePreTrainedModel._init_weights  s    f%fm,,vs####% $ 011v/00%%**005v899..3399#> : 2r4   rG  )rJ   rK   rL   rM   r    __annotations__base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr   r  r:  r   _can_record_outputsrS  rN   rO   rP   s   @r2   rM  rM    sf    &*#+,#4"5 N""&'	;)#
? 
?r4   rM  c                     ^  \ rS rSrS\4U 4S jjr\\       SS\\	R                     S\\	R                     S\\	R                     S\\   S\\	R                     S	\\   S
\\	R                     S\\   S\4S jj5       5       rSrU =r$ )	DogeModeli  rT   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [#        US9U l        SU l        U R)                  5         g s  snf )Nr   )rT   F)r'   r(   pad_token_idpadding_idx
vocab_sizer   r"  r/   embed_tokens
ModuleListrangenum_hidden_layersr:  layersr$   r   normrR   
rotary_embgradient_checkpointing	post_initr   s      r2   r(   DogeModel.__init__   s     !.. ++LL):):F<N<NPTP`P`ammBGH`H`BabBaYf0Bab
   2 28K8KL	-V<&+# 	 cs   C>	input_idsr   ry   rO  inputs_embedsrE  r   r   r   c                 z   US L US L-  (       a  [        S5      eU(       a  Uc
  [        5       nUc  U R                  U5      nUcD  Ub  UR                  5       OSn	[        R
                  " XUR                  S   -   UR                  S9nUc  UR                  S5      nU R                  R                  c  [        O[        n
U
" U R                  UUUUUS9nUnU R                  X5      nU R                  S U R                  R                    H  nU" U4UUUUUUS.UD6nM     U R!                  U5      n[#        UUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr   r   )rg   )rT   input_embedsr   r   rO  ry   )r   r   ry   r   rE  r   )last_hidden_staterO  )
ValueErrorr
   ri  get_seq_lengthr*   arangerF   rg   r   rT   sliding_windowr   r   ro  rm  rl  rn  r   )r.   rs  r   ry   rO  rt  rE  r   r   past_seen_tokensmask_functionr   r?   r   decoder_layers                  r2   rB   DogeModel.forward  sh    -t";<YZZ0*nO  --i8M!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L.2kk.H.H.P*Vw#;;&))+%
 & #oomJ![[)H4;;+H+HIM)	$7*).#-	 	M J 		-0%++
 	
r4   )ri  rp  rm  rn  rg  ro  rh  )NNNNNNN)rJ   rK   rL   rM   r    r(   r   r   r   r*   r  r  r	   rK  r   r   r   r   rB   rN   rO   rP   s   @r2   rd  rd    s    z    151537+/59$(59<
E,,-<
 !.<
 u//0	<

 "%<
   1 12<
 D><
 !!1!12<
 +,<
 
 <
  <
r4   rd  gate_logitsr  r  r  c                    U b  [        U [        5      (       d  gU S   R                  nU S   R                  n/ n/ nU  GH  n	U	R	                  U5      n	U	R                  USS9u  u  pu  pU
R                  S5      UR                  S5      -   nUR                  S5      U-  UR                  S5      -   nUR                  " / UR                  SS QSP76 nUR                  " / UR                  SS QSP76 nUR                  USS9u  nnUR                  SU5      n[        R                  " USS9nUR                  U5        UR                  U5        GM     [        R                  " USS9n[        R                  " USS9nUcu  UR                  S5      n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      UR                  S   -  n[        R$                  " USS9nGO;UR                  u  nn['        U 5      nUSSS2SS2S4   R)                  UUUU45      R+                  S5      R	                  U5      nUR                  S5      UR-                  5          n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      [        R.                  " U5      -  nUSSS2SS2S4   R)                  UUUU45      R+                  SU5      R	                  U5      n[        R.                  " UU-  SS9[        R.                  " USS9-  n[        R.                  " UU-  5      nUU-  $ )a  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [2, batch_size * sequence_length, num_keys].
    num_experts:
        Number of experts
    num_keys:
        Number of keys
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   r7   rn   r   r   )r]   rE   r9   rg   r:   r   r   r   rF   r'  r   r   appendr*   ru   r   	ones_likescatter_add_r=   lenrq   r   r   r(  )r  r  r  r  r   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr-  r.  r/  r0  r1  r2  r+  r4  expert_indicesr5  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthrl  expert_attention_mask router_per_expert_attention_maskoverall_losss                                r2   load_balancing_loss_funcr  Q  si   @ *[%"@"@N((M ^**N(-00@7H7M7Mh\^7M7_44y''+h.@.@.DD
))"-89;N;Nr;RR__@j&6&6s&;@R@
!&&C(9(9#2(>CC(ooeo<$++B0@A))JB7!!.1""?3! )" #51=))$7Q?/44R8!KKQ_`oo0n]-::1>PRUVYkYqYqrsYtt "',?Q!G&4&:&:#
O, 4At+,V&
OUKLWR[R	 	 044R89N9S9S9UV "KKQ_`oo0n]-::1>PRUVY^YbYb!Z
 
 4At+,V&
O[QRWR%R	 	) "'+>Aa+agh!ilqlulu,!m
 "
 99.1GGHL+%%r4   c                     ^  \ rS rSrS/rSS0rSS/S/40rU 4S jrS rS	 r	\
\          SS
\\R                     S\\R                     S\\R                     S\\\R"                        S\\R"                     S\\R                     S\\   S\\R                     S\\\R                  4   S\\   S\\   S\4S jj5       5       rSrU =r$ )DogeForCausalLMi  zlm_head.weightlm_headcolwise_repr?   logitsc                 J  > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  UR                  SS9U l        UR                  U l	        UR                  U l
        UR                  U l        U R                  5         g )NFr   )r'   r(   rd  rN  rh  r   r   r/   r  router_aux_loss_coefr  r  rq  r  s     r2   r(   DogeForCausalLM.__init__  s     v&
 ++yy!3!3V5F5FUS$*$?$?!!--#)#=#=  	r4   c                     Xl         g r   rN  )r.   decoders     r2   set_decoderDogeForCausalLM.set_decoder  s    
r4   c                     U R                   $ r   r  rG   s    r2   get_decoderDogeForCausalLM.get_decoder  s    zzr4   rs  r   ry   rO  rt  labelsrE  r   logits_to_keepoutput_router_logitsr   r   c                    U
b  U
OU R                   R                  n
U R                  " SUUUUUUUS.UD6nUR                  n[	        U	[
        5      (       a  [        U	* S5      OU	nU R                  USS2USS24   5      nSnUb  U R                  " XU R                  40 UD6nSnU
(       a  [        UR                  U R                  [        R                  " [        R                  " U R                  5      5      U R                   U5      nUb+  UU R"                  UR%                  UR&                  5      -  -  n[)        UUUUR*                  UR,                  UR.                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, DogeForCausalLM

>>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
>>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)rs  r   ry   rO  rt  rE  r   )lossaux_lossr  rO  r?   rQ  r,  rG  )rT   r  rN  rw  r]   r  slicer  loss_functionrh  r  r,  r  r  r  r  r  r  r:   rg   r   rO  r?   rQ  )r.   rs  r   ry   rO  rt  r  rE  r   r  r  r   outputsr?   slice_indicesr  r  r  s                     r2   rB   DogeForCausalLM.forward  sm   N %9$D $++JjJj 	
 +/** 	+
)%+')	+
 	+
  118B>SV8W8W~ot4]kmA}a,?@A%%fdooPPD/%%  

499T%5%567((H !11HKK4LLL(#33!//))!//
 	
r4   )r  rN  r  r  r  rh  )
NNNNNNNNr   N)rJ   rK   rL   rM   _tied_weights_keys_tp_plan_pp_planr(   r  r  r   r   r   r*   r  r  listrK  r   r   r  r   r   r   rB   rN   rO   rP   s   @r2   r  r    se   *+=)H_-z:;H
  151537=A59-1$(5934/3Q
E,,-Q
 !.Q
 u//0	Q

 "$u'8'8"9:Q
   1 12Q
 ))*Q
 D>Q
 !!1!12Q
 c5<</0Q
 'tnQ
 +,Q
 
#Q
  Q
r4   r  c                       \ rS rSrSrg)DogeForSequenceClassificationi)  rG  N)rJ   rK   rL   rM   rN   rG  r4   r2   r  r  )  s    r4   r  )r  rd  rM  r  )Nr   )r   r  )NNr6   N)Ir  typingr   r   r   r*   torch.nn.functionalr   r   r   activationsr   cache_utilsr	   r
   
generationr   integrationsr   integrations.flex_attentionr   masking_utilsr   r   modeling_layersr   r   modeling_outputsr   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   utils.genericr   r   configuration_doger    !torch.nn.attention.flex_attentionr!   Moduler$   rR   r   r   r  r  r   rp   r   rE   r   r   r   r	  r  r:  rM  rd  r  r  r  __all__rG  r4   r2   <module>r     s6  0  , ,     ! . ) 7 J R [ Q K A & g g ? *  !!; Y'J")) J (J(<")) <D(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%@  $#(,.*II.*<<.* 
.* <<	.*
 %,,34.* e_.* e_.* %.* 5<<%&.*b -. 1G - .zBII zzbii  6,		 6,r/1 /d ?/ ? ?< O
# O
 O
h "&"-1g&u||U5<<%8$>?g&#g& smg& 	g&
 U\\*g& 5<<g&T j
)? j
 j
Z	$DFY 	 cr4   