
    <h                        S r SSKrSSKJrJrJr  SSKrSSKJs  J	r
  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJr  SSKJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&J'r'J(r(J)r)J*r*J+r+  SSK,J-r-J.r.  \ " 5       (       a  SSK/J0r0   " S S\5      r1 " S S\'5      r2 " S S\(5      r3   S=S\Rh                  S\Rj                  S\Rj                  S\Rj                  S\\Rj                  S 4   S!\\6   S"\\6   S#\\Rj                     S$\7\Rj                  \Rj                  4   4S% jjr8\" 5       r9\8\9S&'    " S' S(\Rh                  5      r: " S) S*\%5      r; " S+ S,\Rh                  5      r< " S- S.\5      r= " S/ S0\&5      r> " S1 S2\.5      r?    S>S3\\Rj                  \7\Rj                     S4   S4\\@   S5\\@   S6\@S\\Rj                     S$\\Rj                  \@4   4S7 jjrA " S8 S9\-5      rB " S: S;\$5      rC/ S<QrDg)?zPyTorch Doge model.    N)CallableOptionalUnion)nn   )ACT2FN)Cache)PretrainedConfig)compile_friendly_flex_attention)GradientCheckpointingLayer)MoeCausalLMOutputWithPastMoeModelOutputWithPast)rope_config_validation)AttentionInterface)Unpack)TransformersKwargsis_torch_flex_attn_available)OutputRecorder   )LlamaForSequenceClassificationLlamaMLPLlamaPreTrainedModelLlamaRMSNormLlamaRotaryEmbeddingapply_rotary_pos_embeager_attention_forward	repeat_kv)MixtralForCausalLMMixtralModel)	BlockMaskc                      ^  \ rS rSrSrSrS/r0 SS_SS_SS_S	S
_SS
_SS_SS_SS_SS_SS_SS_SS_SS
_SS_SS_SS_rS/S/4SS/S/4S/S/4S.r                          S!U 4S jjr	S r
U =r$ )"
DogeConfig6   a0  
This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate a Doge
model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 32768):
        Vocabulary size of the Doge2 model. Defines the number of different tokens that can be represented by the `input_ids` passed when calling [`DogeModel`]
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer decoder.
    hidden_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for each sequence transformation and state transformation module.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether the model's input and output word embeddings should be tied.
    max_position_embeddings (`int`, *optional*, defaults to 2048):
        The maximum sequence length that this model might ever be used with.
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`Dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings.
        NOTE: if you apply a new rope type and expect the model to work on a longer `max_position_embeddings`, we recommend updating this value accordingly.
        Doge family of small models use `{ 'rope_type': 'dynamic', 'factor': 4.0, 'original_max_position_embeddings': 2048 }` as the default value.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings.
                In most scaling types, a `factor` of x will enable the model to handle sequences of length x * original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'.
                The original max position embeddings used during pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation.
                If unspecified, it defaults to value recommended by the implementation, using the `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<`original_max_position_embeddings`).
                Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>`original_max_position_embeddings`).
                Must be a list of numbers with the same length as the hidden size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*):
        This is the number of key_value heads that should be used to implement Grouped Query Attention.
        If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used.
        When converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed by meanpooling all the original heads within that group.
        For more details checkout [this paper](https://arxiv.org/pdf/2305.13245.pdf).
        If it is not specified, will default to `num_attention_heads`.
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
    sliding_window (`int`, *optional*):
        Sliding window attention window size. If not specified, will default to `None`.
    keep_window_size (`int`, *optional*, defaults to 2048):
        The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
    is_moe (`bool`, *optional*, defaults to `False`):
        Whether to use the Cross Domain Mixture of Experts; if `True`, the MoE will inherit the MLP to initialize.
    num_experts (`int`, *optional*, defaults to 16384):
        Number of routed experts in the model. This is only used when `is_moe=True`.
    num_experts_per_tok (`int`, *optional*, defaults to 64):
        Number of selected experts to route per-token.
    norm_topk_prob (`bool`, *optional*, defaults to `False`):
        Whether to normalize the topk probabilities.
    output_router_logits (`bool`, *optional*, defaults to `False`):
        Whether or not the router logits should be returned by the model. Enabling this will also
        allow the model to output the auxiliary loss, including load balancing loss and router z-loss.
    router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
        The aux loss factor for the total loss.

```python
>>> from transformers import DogeConfig, DogeModel

>>> # Initializing a Doge-320M style configuration
>>> configuration = DogeConfig()

>>> # Initializing a model from the Doge-320M style configuration
>>> model = DogeModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```dogepast_key_valueszlayers.*.self_attn.q_projcolwisezlayers.*.self_attn.k_projzlayers.*.self_attn.v_projzlayers.*.self_attn.dt_projrowwisezlayers.*.self_attn.o_projzlayers.*.input_layernorm.weightsequence_parallelzlayers.*.input_residual.weightz(layers.*.post_attention_layernorm.weightz'layers.*.post_attention_residual.weightznorm.weightzlayers.*.mlp.gate_projzlayers.*.mlp.up_projzlayers.*.mlp.down_projzlayers.*.mlp.router_gatecolwise_repzlayers.*.mlp.down_embedrowwise_repzlayers.*.mlp.up_embed	input_idsinputs_embedshidden_statesattention_mask)embed_tokenslayersnormc                   > Xl         X l        X0l        X@l        XPl        X`l        Xpl        Xl        Xl        Xl	        Xl
        Xl        Xl        Xl        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        U R                  b,  SU R                  ;   a  U R                  S   U R                  S'   [3        U 5        Uc  Xl        [4        TU ]l  " SSU
0UD6  g )Ntype	rope_typetie_word_embeddings )
vocab_sizehidden_sizeintermediate_sizenum_hidden_layershidden_dropout
hidden_actinitializer_rangerms_norm_eps	use_cachemax_position_embeddings
rope_thetarope_scalingnum_attention_headsnum_key_value_headsattention_biasattention_dropoutmlp_biassliding_windowkeep_window_sizeis_moenum_expertsnum_experts_per_toknorm_topk_proboutput_router_logitsrouter_aux_loss_coefr   super__init__)selfr7   r8   r9   r:   r;   r<   r=   r>   r?   r5   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   rN   rO   kwargs	__class__s                               ]/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/doge/modular_doge.pyrQ   DogeConfig.__init__   s   < %&!2!2,$!2("'>$$(#6 #6 ,!2 , 0&#6 ,$8!$8! (Vt7H7H-H-1->->v-FDk*t$ &':$ 	
 3	
	
    )rE   rF   r<   r;   r8   r=   r9   rJ   rI   r@   rG   rM   rC   rK   rL   r:   rD   rN   r>   rB   rA   rO   rH   r?   r7   )i   i                  silug{Gz?gư>TFrX   g     @N   NFrZ   FNrX   Fi @  @   FFgMbP?)__name__
__module____qualname____firstlineno____doc__
model_typekeys_to_ignore_at_inferencebase_model_tp_planbase_model_pp_planrQ   __static_attributes____classcell__rT   s   @rU   r"   r"   6   su   n` J#4"5#Y#Y 	$Y 	%i	
 	$Y 	*+> 	)*= 	34G 	23F 	* 	!) 		 	!) 	#M 	"=  	 !& &(9:#%568IJ!"_$56 ! $ ""7G
 G
rW   r"   c                       \ rS rSrSrg)DogeRMSNormi  r6   Nr^   r_   r`   ra   rg   r6   rW   rU   rk   rk         rW   rk   c                       \ rS rSrSrg)DogeRotaryEmbeddingi  r6   Nrl   r6   rW   rU   ro   ro     rm   rW   ro   modulequerykeyvaluer.   r    scalingsoftcap	head_maskreturnc                 6  ^^^ S n	S m[        U[        5      (       a  Un	OUmTb  TS S 2S S 2S S 2S UR                  S   24   mUUU4S jn
[        UUUU
U	SUSS9u  pUR	                  UR
                  5      nUR                  SS5      R                  5       nX4$ )Nc                    > Tb  T[         R                  " U T-  5      -  n Tb  U TU   U   U   U   -   n Tb  U TU   U   S   S   -   n U $ )Nr   )torchtanh)score	batch_idxhead_idxq_idxkv_idxcausal_maskrv   ru   s        rU   	score_mod)flex_attention_forward.<locals>.score_mod)  sm    ejj99E"K	28<UCFKKE Ii0:1=a@@ErW   T)r   
block_mask
enable_gqascale
return_lse   r   )
isinstancer    shaper   todtype	transpose
contiguous)rp   rq   rr   rs   r.   rt   ru   rv   rS   r   r   attn_outputattention_weightsr   s         ``     @rU   flex_attention_forwardr     s     JK.),,#
$!!Q?SYYr]?":; &E &"K *,,U[[9''1-88:K))rW   doge_flex_attentionc                     ^  \ rS rSrSS\S\\   4U 4S jjjr   SS\R                  S\
\R                  \R                  4   S\\R                     S\\   S	\\R                     S
\
\R                  \\R                     \\
\R                        4   4S jjr  SS\R                  S\R                  S\S\\R                     4S jjrSrU =r$ )DogeAttentioniI  config	layer_idxc                   > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        UR                  U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R&                  " [(        R*                  " UR                  5      5      U l        [        R                  " UR                  U R                  -  UR                  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        [3        U R                  UR4                  S9U l        [3        U R                  UR4                  S9U l        g )Nhead_dimg      ࿩biaseps)rP   rQ   r   r   getattrr8   rC   r   rD   num_key_value_groupsrt   rF   rI   r   LinearrE   q_projk_projv_proj	Parameterr{   zerosAdt_projo_projrk   r>   q_normk_normrR   r   r   rT   s      rU   rQ   DogeAttention.__init__J  s   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9 & 7 7ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ekk&*D*DEFyy&&68R8RY_YnYn
 ii&&68J8JQWQfQf
 "$--V5H5HI!$--V5H5HIrW   r-   position_embeddingsr.   past_key_valuecache_positionrw   c                    UR                   S S n/ UQSPU R                  P7nU R                  U R                  U5      R	                  U5      5      R                  SS5      n	U R                  U R                  U5      R	                  U5      5      R                  SS5      n
U R                  U5      R	                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  pU R                  UR                  SS5      R                  UR                   S   UR                   S   S5      5      n[        R                  " U R                   ["        R$                  " U5      -  5      R                  SS5      nU R'                  UUU R(                  US9n[+        UU R,                  5      n[.        nU R0                  R2                  S:w  a  [4        U R0                  R2                     nU" U U	U
U4UU R6                  (       d  S	OU R8                  U R:                  S
.UD6u  nnUR                  " / UQSP76 R=                  5       nU R?                  U5      nUU4$ )Nr   r   )sincosr   r   ry   )r-   	dt_statesrI   r.   eagerrZ   )r.   dropoutrt   ) r   r   r   r   viewr   r   r   r   r   updater   r   reshaper{   expr   Fsoftplusprepare_dynamic_maskrI   r   r   r   r   _attn_implementationALL_ATTENTION_FUNCTIONStrainingrF   rt   r   r   )rR   r-   r   r.   r   r   rS   input_shapehidden_shapequery_states
key_statesvalue_statesr   r   cache_kwargsr   	attn_maskattention_interfacer   attn_weightss                       rU   forwardDogeAttention.forwardh  s^    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J LL""1a(001C1CA1FHZHZ[]H^`bc
	 IIdffqzz)'<<=GGBO	--'!22)	 . 
	 i)B)BC	(?;;++w6"9$++:Z:Z"[$7		%

 %#}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((rW   r   rI   c           	         [         R                  " UR                  5      R                  nUR                  nUSS2SS2SSS24   R	                  SSUR
                  S   S5      nUb  [        U[        5      (       d  UR                  [         R                  :X  aB  UR                  n[         R                  " U[         R                  " SUR                  US9U5      nUR                  USS2SS2SS2SUR
                  S   24   S:g  U5      nUR
                  S   U:  ah  [         R                  " XvUR                  S9n[         R                  " XsSSS	S
9R                  n	UR!                  SU	S5      nUR                  US:H  U5      nU$ )a  
The core idea of DMA is to compute a dynamic attention mask that determines which tokens to mask, so as to form sparse attention.

Combine `dt_states` with `attention_mask` to generate the final `attn_mask`.

Args:
    hidden_states (`torch.Tensor`): The input hidden_states, used to determine the minimum value of the current input precision.
    dt_states (`torch.Tensor`): dt_states of shape `(batch_size, num_heads, key_sequence_length)`.
    keep_window_size (`int`): The window size of tokens that are not dynamically masked, and dynamic masking is only performed when the sequence length exceeds this value.
    attention_mask (`torch.Tensor`, *optional*): attention mask of shape `(batch_size, 1, query_sequence_length, key_sequence_length)`.
Nr   r   rZ   )devicer   r   r   r   TF)dimlargestsorted      ?)r{   finfor   minexpandr   r   r    boolwheretensorr   masked_fill
zeros_liketopkindicesscatter)
rR   r-   r   rI   r.   	min_dtyper   r   active_masktopk_indicess
             rU   r   "DogeAttention.prepare_dynamic_mask  se   $ KK 3 3488	##aD!m,33M''*B
	 %j.S.S##uzz1%++!&"ELL^=R=RZ_$`bk" "--nQ1F[	XZH[F[=[.\`a.aclmI??2!11**9)JZJZ[K ::irSW`efnnL%--b,DK!--kS.@)LIrW   )r   rF   r   r   r   r   r   rI   r   r   r   r   r   rt   r   NNNN)rX   N)r^   r_   r`   ra   r"   r   intrQ   r{   Tensortupler	   
LongTensorr   r   rg   rh   ri   s   @rU   r   r   I  s   Jz Jhsm J JD 26*.596)||6) #5<<#=>6) !.	6)
 !6) !!1!126) 
u||Xell3XeELL>Q5RR	S6)x !%15#||# <<# 	#
 !.# #rW   r   c                       \ rS rSrSrg)DogeMLPi  r6   Nrl   r6   rW   rU   r   r     rm   rW   r   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )	DogeCDMoEi  r   c                   > [         TU ]  5         UR                  U l        UR                  U l        [        UR
                     U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        UR                  U l        UR                  U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  UR"                  S9U l        [        R                   " U R                  U R                  S-  SS9U l        [        R,                  " U R                  U R                  5      U l        [        R,                  " U R                  U R                  5      U l        g )Nr   r   F)rP   rQ   r8   r9   r   r<   act_fnrK   mathfloorsqrtnum_keysrL   top_krM   r   r   rG   	gate_projup_proj	down_projrouter_gate	Embedding
down_embedup_embedrR   r   rT   s     rU   rQ   DogeCDMoE.__init__  s_   !--!'!9!9V../!--

499T-=-=#>?//
$33 4#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRab 99T%5%5t}}q7HuU ,,t'7'79I9IJT%5%5t7G7GHrW   r-   rw   c                    UR                   u  p4nU R                  U5      R                  SX4-  S5      nUR                  U R                  SS9u  u  pxu  pUR                  S5      UR                  S5      -   nU	R                  S5      U R                  -  U
R                  S5      -   nUR                  " / UR                   S S QSP76 nUR                  " / UR                   S S QSP76 nUR                  U R                  SS9u  pUR                  SU5      n[        R                  " USS9nU R                  (       a  UUR                  SSS9-  nU R                  U5      nU R                  U5      n[        R                  " UUR                  X4-  SS5      5      R                  X4-  S5      nU R!                  U5      U-  n[        R                  " UR                  X4-  SS5      U5      R                  X4S5      nU R#                  U R!                  U R%                  U5      5      U R'                  U5      -  5      nUU-   nX4$ )Nr   r   r   ry   T)r   keepdimr   )r   r   r   r   r   	unsqueezer   gatherr   softmaxrM   sumr   r   r{   matmulr   r   r   r   )rR   r-   rS   bszseq_len_router_logitsscores_xscores_y	indices_x	indices_y
all_scoresall_indicesscoresposition_indicesr   routing_weightsr   r   experts_weightsexperts_statess                        rU   r   DogeCDMoE.forward  s+   
 (--a ((7<<QrR 8E7I7I$--]_7I7`44y''+h.@.@.DD
))"-=	@S@STV@WW__@j&6&6s&;@R@
!&&C(9(9#2(>CC#-??4::2?#F $$R)9:))F322r42HHO __W-
==),,z=3E3EcmUWYZ3[\aabeboqst++o6Ho&:&:3=!R&PRZ[``adoqrt{{4>>-3P'QTXT`T`anTo'op%6++rW   )r   r   r   r   r8   r9   rM   rK   r   r   r   r   r   )r^   r_   r`   ra   r"   rQ   r{   r   r   rg   rh   ri   s   @rU   r   r     s5    Iz I.,||, 
	, ,rW   r   c                     ^  \ rS rSrSS\S\\   4U 4S jjjr     SS\R                  S\
\R                  \R                  4   S\\R                     S\\R                     S	\\
\R                        S
\\   S\\R                     S\\   S\
\R                  \\
\R                  \R                  4      4   4S jjrSrU =r$ )DogeDecoderLayeri  r   r   c                 (  > [         TU ]  5         UR                  U l        [        UR                  UR
                  S9U l        [        XS9U l        [        R                  " [        R                  " UR                  5      5      U l        [        UR                  UR
                  S9U l        UR                  (       d  [!        U5      O
[#        U5      U l        [        R                  " [        R                  " UR                  5      5      U l        g )Nr   )r   r   )rP   rQ   r;   rk   r8   r>   input_layernormr   	self_attnr   r   r{   onesinput_residualpost_attention_layernormrJ   r   r   mlppost_attention_residualr   s      rU   rQ   DogeDecoderLayer.__init__  s    $33*6+=+=6CVCVW&fJ ll5::f6H6H+IJ(3F4F4FFL_L_(`%*0--76?Yv=N')||EJJv?Q?Q4R'S$rW   r-   r   r.   position_idsr   r?   r   rS   rw   c                    Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  p[        R                  " XR                  U R
                  S9nU R                  U	-  U-   nUn	U R                  U5      nU R                  U5      n[        R                  " XR                  U R
                  S9nU R                  U	-  U-   nU$ )N)r-   r   r.   r  r   r?   r   )pr   r6   )
r  r  r   r   r;   r   r  r  r  r  )rR   r-   r   r.   r  r   r?   r   rS   residualself_attn_weightss              rU   r   DogeDecoderLayer.forward  s     !,,];+/>> 	,
' 3)%))	,
 	,
( 		-3F3FQUQ^Q^_++h6F !55mD/		-3F3FQUQ^Q^_44x?-OrW   )r;   r  r  r  r  r  r  r   )NNNFN)r^   r_   r`   ra   r"   r   r   rQ   r{   r   r   r   r   r   r   FloatTensorr   rg   rh   ri   s   @rU   r  r    s   
Tz 
Thsm 
T 
T  26378<$)59"||" #5<<#=>" !.	"
 u//0" !u||!45" D>" !!1!12" +," 
u  (51B1BEDUDU1U+V"WW	X" "rW   r  c                   8    \ rS rSrSrSr\" \SS9\\	S.r
S rSrg)	DogePreTrainedModeli5  Fr   )index)r  r-   
attentionsc                    [         R                  " U5        [        U[        5      (       a7  [	        US5      (       a%  UR
                  R                  R                  5         gg[        U[        5      (       an  [	        US5      (       a%  UR                  R                  R                  S5        [	        US5      (       a&  UR                  R                  R                  S5        ggg)zInitialize the weightsr   r  r   r  N)r   _init_weightsr   r   hasattrr   datazero_r  r  fill_r  )rR   rp   s     rU   r+  !DogePreTrainedModel._init_weights>  s    **62fm,,vs####% $ 011v/00%%**005v899..3399#> : 2rW   r6   N)r^   r_   r`   ra   _supports_flash_attn_can_compile_fullgraphr   r   r  r   _can_record_outputsr+  rg   r6   rW   rU   r'  r'  5  s+     "'	;)#
?rW   r'  c                       \ rS rSrSrg)	DogeModeliK  r6   Nrl   r6   rW   rU   r5  r5  K  rm   rW   r5  gate_logitsrK   r   r   c                    U b  [        U [        5      (       d  gU S   R                  nU S   R                  n/ n/ nU  GH  n	U	R	                  U5      n	U	R                  USS9u  u  pu  pU
R                  S5      UR                  S5      -   nUR                  S5      U-  UR                  S5      -   nUR                  " / UR                  SS QSP76 nUR                  " / UR                  SS QSP76 nUR                  USS9u  nnUR                  SU5      n[        R                  " USS9nUR                  U5        UR                  U5        GM     [        R                  " USS9n[        R                  " USS9nUcu  UR                  S5      n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      UR                  S   -  n[        R$                  " USS9nGO;UR                  u  nn['        U 5      nUSSS2SS2S4   R)                  UUUU45      R+                  S5      R	                  U5      nUR                  S5      UR-                  5          n[        R                  " XUS9n[        R                   " XuUS9nUR#                  SUU5      [        R.                  " U5      -  nUSSS2SS2S4   R)                  UUUU45      R+                  SU5      R	                  U5      n[        R.                  " UU-  SS9[        R.                  " USS9-  n[        R.                  " UU-  5      nUU-  $ )a  
Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.

See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
experts is too unbalanced.

Args:
    gate_logits:
        Logits from the `router_gate`, should be a tuple of model.config.num_hidden_layers tensors of
        shape [2, batch_size * sequence_length, num_keys].
    num_experts:
        Number of experts
    num_keys:
        Number of keys
    top_k:
        The number of experts to route per-token, can be also interpreted as the `top-k` routing
        parameter.
    attention_mask (`torch.Tensor`, *optional*):
        The attention_mask used in forward function
        shape [batch_size X sequence_length] if not None.

Returns:
    The auxiliary loss.
Nr   r   r   ry   r   )r   r   r   r   r   r   r   r   r   r   r   r  appendr{   catr   	ones_likescatter_add_meanlenr   r   r   r  )r6  rK   r   r   r.   compute_dtypecompute_deviceall_expert_indicesall_routing_weightslayer_gate_logitsr  r	  r
  r  r  r  r  r  expert_indicesr  tokens_per_expertpadrouter_prob_per_expert
batch_sizesequence_lengthr:   expert_attention_mask router_per_expert_attention_maskoverall_losss                                rU   load_balancing_loss_funcrL  O  si   @ *[%"@"@N((M ^**N(-00@7H7M7Mh\^7M7_44y''+h.@.@.DD
))"-89;N;Nr;RR__@j&6&6s&;@R@
!&&C(9(9#2(>CC(ooeo<$++B0@A))JB7!!.1""?3! )" #51=))$7Q?/44R8!KKQ_`oo0n]-::1>PRUVYkYqYqrsYtt "',?Q!G&4&:&:#
O, 4At+,V&
OUKLWR[R	 	 044R89N9S9S9UV "KKQ_`oo0n]-::1>PRUVY^YbYb!Z
 
 4At+,V&
O[QRWR%R	 	) "'+>Aa+agh!ilqlulu,!m
 "
 99.1GGHL+%%rW   c                   f  ^  \ rS rSrU 4S jr          SS\\R                     S\\R                     S\\R                     S\\	\R                        S\\R                     S\\R                     S	\\   S
\\R                     S\\\R                  4   S\\   S\\   S\4S jjrSrU =r$ )DogeForCausalLMi  c                 f   > [         TU ]  U5        [        U5      U l        UR                  U l        g r   )rP   rQ   r5  modelrK   r   s     rU   rQ   DogeForCausalLM.__init__  s*     v&
!--rW   r+   r.   r  r%   r,   labelsr?   r   logits_to_keeprN   rS   rw   c                    U
b  U
OU R                   R                  n
U R                  " SUUUUUUUS.UD6nUR                  n[	        U	[
        5      (       a  [        U	* S5      OU	nU R                  USS2USS24   5      nSnUb  U R                  " XU R                  40 UD6nSnU
(       a  [        UR                  U R                  [        R                  " [        R                  " U R                  5      5      U R                   U5      nUb+  UU R"                  UR%                  UR&                  5      -  -  n[)        UUUUR*                  UR,                  UR.                  UR                  S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, DogeForCausalLM

>>> model = DogeForCausalLM.from_pretrained("SmallDoge/Doge-320M")
>>> tokenizer = AutoTokenizer.from_pretrained("SmallDoge/Doge-320M")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```N)r+   r.   r  r%   r,   r?   r   )lossaux_losslogitsr%   r-   r)  r  r6   )r   rN   rP  last_hidden_stater   r   slicelm_headloss_functionr7   rL  r  rK   r   r   r   rL   rO   r   r   r   r%   r-   r)  )rR   r+   r.   r  r%   r,   rR  r?   r   rS  rN   rS   outputsr-   slice_indicesrW  rU  rV  s                     rU   r   DogeForCausalLM.forward  sm   J %9$D $++JjJj 	
 +/** 	+
)%+')	+
 	+
  118B>SV8W8W~ot4]kmA}a,?@A%%fdooPPD/%%  

499T%5%567((H !11HKK4LLL(#33!//))!//
 	
rW   )rP  rK   )
NNNNNNNNr   N)r^   r_   r`   ra   rQ   r   r{   r   r   listr%  r   r   r   r   r   r   r   rg   rh   ri   s   @rU   rN  rN    s"   . 151537=A59-1$(5934/3Q
E,,-Q
 !.Q
 u//0	Q

 "$u'8'8"9:Q
   1 12Q
 ))*Q
 D>Q
 !!1!12Q
 c5<</0Q
 'tnQ
 +,Q
 
#Q
 Q
rW   rN  c                       \ rS rSrSrg)DogeForSequenceClassificationi  r6   Nrl   r6   rW   rU   ra  ra    rm   rW   ra  )r"   rN  r5  r'  ra  r   )NNr   N)Erb   r   typingr   r   r   r{   torch.nn.functionalr   
functionalr   activationsr   cache_utilsr	   configuration_utilsr
   integrations.flex_attentionr   modeling_layersr   modeling_outputsr   r   modeling_rope_utilsr   modeling_utilsr   processing_utilsr   utilsr   r   utils.genericr   llama.modeling_llamar   r   r   r   r   r   r   r   mixtral.modeling_mixtralr   r   !torch.nn.attention.flex_attentionr    r"   rk   ro   Moduler   floatr   r   r   r   r   r   r  r'  r5  r   rL  rN  ra  __all__r6   rW   rU   <module>rv     sQ  "   , ,     !   3 J 9 Q 9 0 & E +	 	 	 H  !!;S
! S
l	, 		. 	  $#(,.*II.*<<.* 
.* <<	.*
 %,,34.* e_.* e_.* %.* 5<<%&.*b -. 1G - .zBII zz	h 	6,		 6,r/1 /d?. ?,	 	 "&"-1g&u||U5<<%8$>?g&#g& smg& 	g&
 U\\*g& 5<<g&TW
( W
t	$B 	rW   