
"""PyTorch MiniMax model."""

from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache, DynamicCache
from ...configuration_utils import layer_type_validation
from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import MoeModelOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ...utils.generic import OutputRecorder
from ..mixtral.configuration_mixtral import MixtralConfig
from ..mixtral.modeling_mixtral import (
    MixtralAttention,
    MixtralDecoderLayer,
    MixtralForCausalLM,
    MixtralForQuestionAnswering,
    MixtralForSequenceClassification,
    MixtralForTokenClassification,
    MixtralModel,
    MixtralPreTrainedModel,
    MixtralRMSNorm,
    MixtralSparseMoeBlock,
)


logger = logging.get_logger(__name__)


class MiniMaxConfig(MixtralConfig):
    r"""
This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate a
MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the MiniMax.

[MiniMaxAI/MiniMax-Text-01-hf](https://huggingface.co/MiniMaxAI/MiniMax-Text-01-hf)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.


Args:
    vocab_size (`int`, *optional*, defaults to 32000):
        Vocabulary size of the MiniMax model. Defines the number of different tokens that can be represented by the
        `input_ids` passed when calling [`MiniMaxModel`].
    hidden_size (`int`, *optional*, defaults to 4096):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 14336):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 32):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_key_value_heads (`int`, *optional*, defaults to 8):
        This is the number of key_value heads that should be used to implement Grouped Query Attention. If
        `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
        `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
        converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
        by meanpooling all the original heads within that group. For more details, check out [this
        paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
    head_dim (`int`, *optional*, defaults to `hidden_size // num_attention_heads`):
        The attention head dimension.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder.
    max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
        The maximum sequence length that this model might ever be used with. MiniMax's sliding window attention
        allows sequences of up to 4096*32 tokens.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models). Only
        relevant if `config.is_decoder=True`.
    pad_token_id (`int`, *optional*):
        The id of the padding token.
    bos_token_id (`int`, *optional*, defaults to 1):
        The id of the "beginning-of-sequence" token.
    eos_token_id (`int`, *optional*, defaults to 2):
        The id of the "end-of-sequence" token.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether the model's input and output word embeddings should be tied.
    rope_theta (`float`, *optional*, defaults to 1000000.0):
        The base period of the RoPE embeddings.
    sliding_window (`int`, *optional*):
        Sliding window attention window size. If not specified, will default to `4096`.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    num_experts_per_tok (`int`, *optional*, defaults to 2):
        The number of experts to route per token; this can also be interpreted as the `top-k` routing
        parameter.
    num_local_experts (`int`, *optional*, defaults to 8):
        Number of experts per Sparse MLP layer.
    output_router_logits (`bool`, *optional*, defaults to `False`):
        Whether or not the router logits should be returned by the model. Enabling this will also
        allow the model to output the auxiliary loss.
    router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
        The aux loss factor for the total loss.
    router_jitter_noise (`float`, *optional*, defaults to 0.0):
        Amount of noise to add to the router.
    layer_types (`list`, *optional*):
        Attention pattern for each layer.
    block_size (`int`, *optional*, defaults to 256):
        The length of each attention block, determining how queries, keys, and values
        are grouped and processed for intra- and inter-block attention.
    full_attn_alpha_factor (`float`, *optional*, defaults to 1):
        Weight for residual value in residual connection after normal attention.
    full_attn_beta_factor (`float`, *optional*, defaults to 1):
        Weight for hidden state value in residual connection after normal attention.
    linear_attn_alpha_factor (`float`, *optional*, defaults to 1):
        Weight for residual value in residual connection after lightning attention.
    linear_attn_beta_factor (`float`, *optional*, defaults to 1):
        Weight for hidden state value in residual connection after lightning attention.
    mlp_alpha_factor (`float`, *optional*, defaults to 1):
        Weight for residual value in residual connection after MLP.
    mlp_beta_factor (`float`, *optional*, defaults to 1):
        Weight for hidden state value in residual connection after MLP.

```python
>>> from transformers import MiniMaxModel, MiniMaxConfig

>>> # Initializing a MiniMax style configuration
>>> configuration = MiniMaxConfig()

>>> # Initializing a model from the MiniMax style configuration
>>> model = MiniMaxModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```"""

    def __init__(
        self,
        layer_types=None,
        block_size=256,
        full_attn_alpha_factor=1,
        full_attn_beta_factor=1,
        linear_attn_alpha_factor=1,
        linear_attn_beta_factor=1,
        mlp_alpha_factor=1,
        mlp_beta_factor=1,
        **super_kwargs,
    ):
        super().__init__(**super_kwargs)
        self.layer_types = layer_types
        self.block_size = block_size
        self.full_attn_alpha_factor = full_attn_alpha_factor
        self.full_attn_beta_factor = full_attn_beta_factor
        self.linear_attn_alpha_factor = linear_attn_alpha_factor
        self.linear_attn_beta_factor = linear_attn_beta_factor
        self.mlp_alpha_factor = mlp_alpha_factor
        self.mlp_beta_factor = mlp_beta_factor

        if self.layer_types is None:
            self.layer_types = [
                "full_attention" if bool((i + 1) % 2) else "linear_attention" for i in range(self.num_hidden_layers)
            ]
        layer_type_validation(self.layer_types)


class MiniMaxRMSNorm(MixtralRMSNorm):
    pass


class MiniMaxCache(DynamicCache):
    def __init__(self):
        super().__init__()
        self.linear_cache: list[torch.Tensor] = []

    def set_linear_cache(self, layer_idx, linear_cache):
        # there may be skipped layers, fill them with empty lists
        for _ in range(len(self.linear_cache), layer_idx + 1):
            self.linear_cache.append([])
        self.linear_cache[layer_idx] = linear_cache

    def get_linear_cache(self, layer_idx: int):
        if layer_idx < len(self):
            return self.linear_cache[layer_idx]
        return None

    def __len__(self):
        return max(super().__len__(), len(self.linear_cache))

    def __getitem__(self, layer_idx: int):
        if layer_idx < len(self.linear_cache) and self.linear_cache[layer_idx] != []:
            return (self.linear_cache[layer_idx],)
        return super().__getitem__(layer_idx)

    def __iter__(self):
        for layer_idx in range(len(self)):
            yield self[layer_idx]

    def batch_repeat_interleave(self, repeats: int):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx].repeat_interleave(repeats, dim=0)
            else:
                self.layers[layer_idx].batch_repeat_interleave(repeats)

    def batch_select_indices(self, indices: torch.Tensor):
        for layer_idx in range(len(self)):
            if self.linear_cache[layer_idx] != []:
                self.linear_cache[layer_idx] = self.linear_cache[layer_idx][indices, ...]
            else:
                self.layers[layer_idx].batch_select_indices(indices)

    def crop(self, max_length: int):
        raise RuntimeError("MiniMaxCache does not support the `crop` method")


class MiniMaxLightningAttention(nn.Module):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__()
        self.layer_idx = layer_idx
        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
        self.num_attention_heads = config.num_attention_heads
        self.num_hidden_layers = config.num_hidden_layers
        self.block_size = config.block_size

        self.act_fn = ACT2FN[config.hidden_act]
        self.norm = MiniMaxRMSNorm(self.head_dim * self.num_attention_heads)
        self.qkv_proj = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim * 3, bias=False)
        self.out_proj = nn.Linear(self.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
        self.output_gate = nn.Linear(config.hidden_size, self.num_attention_heads * self.head_dim, bias=False)

        slope_rate = self.get_slope_rate()
        query_decay, key_decay, diagonal_decay = self.decay_factors(slope_rate)

        self.register_buffer("slope_rate", slope_rate)
        self.register_buffer("query_decay", query_decay)
        self.register_buffer("key_decay", key_decay)
        self.register_buffer("diagonal_decay", diagonal_decay)

    def get_slope_rate(self):
        base = 1 / (2 ** (8 / self.num_attention_heads))
        exponent = torch.arange(self.num_attention_heads) + 1
        factor = 1 - self.layer_idx / (self.num_hidden_layers - 1 + 1e-5) + 1e-5

        rate = base**exponent
        rate = rate * factor
        rate = rate[:, None, None]

        return rate

    def decay_factors(self, slope_rate):
        block_size_range = torch.arange(self.block_size) + 1

        query_decay = torch.exp(-slope_rate * block_size_range[:, None])
        key_decay = torch.exp(-slope_rate * (self.block_size - block_size_range[:, None]))

        diagonal_decay = block_size_range[:, None] - block_size_range[None, :]
        diagonal_decay = diagonal_decay[None, None, :, :]
        diagonal_decay = diagonal_decay * slope_rate
        diagonal_decay = torch.where(diagonal_decay >= 0, -diagonal_decay, float("-inf"))
        diagonal_decay = torch.exp(diagonal_decay)

        return query_decay, key_decay, diagonal_decay

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        batch_size, seq_len, hidden_size = hidden_states.shape
        num_blocks = (seq_len + self.block_size - 1) // self.block_size

        qkv_states = self.act_fn(self.qkv_proj(hidden_states))
        qkv_states = qkv_states.reshape(batch_size, seq_len, self.num_attention_heads, 3 * self.head_dim)

        query_states, key_states, value_states = torch.split(qkv_states, self.head_dim, dim=3)

        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)

        attn_weights_inter = None
        if past_key_value is not None:
            attn_weights_inter = past_key_value.get_linear_cache(self.layer_idx)

        if attn_weights_inter is None:
            attn_weights_inter = torch.zeros(batch_size, self.num_attention_heads, self.head_dim, self.head_dim).to(
                value_states
            )

            # apply the attention mask by zeroing out padded value positions
            if attention_mask is not None:
                attention_mask = attention_mask.to(dtype=torch.bool)  # ensure it is a boolean tensor
                value_states = value_states.masked_fill(~attention_mask.unsqueeze(1).unsqueeze(-1), 0)

            attn_output = []
            for i in range(num_blocks):
                start_idx = i * self.block_size
                end_idx = min(start_idx + self.block_size, seq_len)
                current_block_size = end_idx - start_idx

                current_query_states = query_states[:, :, start_idx:end_idx]
                current_key_states = key_states[:, :, start_idx:end_idx]
                current_value_states = value_states[:, :, start_idx:end_idx]

                current_query_decay = self.query_decay[:, :current_block_size]
                current_key_decay = self.key_decay[:, -current_block_size:]
                current_diagonal_decay = self.diagonal_decay[:, :, :current_block_size, :current_block_size]
                block_decay = torch.exp(-self.slope_rate * current_block_size)

                # intra-block attention: positions inside the current block attend to each other
                attn_weights_intra = torch.matmul(current_query_states, current_key_states.transpose(-1, -2))
                attn_output_intra = torch.matmul(attn_weights_intra * current_diagonal_decay, current_value_states)

                # inter-block attention: current queries read the decayed summary of previous blocks
                attn_output_inter = torch.matmul(current_query_states * current_query_decay, attn_weights_inter)

                # final attention output for this block
                current_attn_output = attn_output_inter + attn_output_intra
                attn_output.append(current_attn_output)

                # update attn_weights_inter for the next block (or the cache)
                next_attn_weights_inter = torch.matmul(
                    (current_key_states * current_key_decay).transpose(-1, -2), current_value_states
                )
                attn_weights_inter = attn_weights_inter * block_decay + next_attn_weights_inter

        else:
            # decoding: process tokens one by one with the recurrent form
            ratio = torch.exp(-self.slope_rate)

            attn_output = []
            for i in range(seq_len):
                current_query_states = query_states[:, :, i : i + 1]
                current_key_states = key_states[:, :, i : i + 1]
                current_value_states = value_states[:, :, i : i + 1]

                current_attn_weights_inter = torch.matmul(current_key_states.transpose(-1, -2), current_value_states)
                attn_weights_inter = ratio * attn_weights_inter + current_attn_weights_inter
                current_attn_output = torch.matmul(current_query_states, attn_weights_inter)

                attn_output.append(current_attn_output)

        # concatenate attention outputs over all blocks
        attn_output = torch.cat(attn_output, dim=-2)

        # final output projection
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(batch_size, seq_len, self.num_attention_heads * self.head_dim)
        attn_output = self.norm(attn_output)
        attn_output = F.sigmoid(self.output_gate(hidden_states)) * attn_output
        attn_output = self.out_proj(attn_output)

        # update the cache with the running inter-block state
        if past_key_value is not None:
            past_key_value.set_linear_cache(self.layer_idx, attn_weights_inter)

        return attn_output, attn_weights_inter


class MiniMaxAttention(MixtralAttention):
    pass


class MiniMaxSparseMoeBlock(MixtralSparseMoeBlock):
    pass


class MiniMaxDecoderLayer(MixtralDecoderLayer, GradientCheckpointingLayer):
    def __init__(self, config: MiniMaxConfig, layer_idx: int):
        super().__init__(config, layer_idx)

        self.layer_idx = layer_idx
        self.layer_type = config.layer_types[layer_idx]
        self.mlp_alpha_factor = config.mlp_alpha_factor
        self.mlp_beta_factor = config.mlp_beta_factor

        if self.layer_type == "linear_attention":
            self.self_attn = MiniMaxLightningAttention(config, layer_idx)
            self.attn_alpha_factor = config.linear_attn_alpha_factor
            self.attn_beta_factor = config.linear_attn_beta_factor
        else:
            self.self_attn = MiniMaxAttention(config, layer_idx)
            self.attn_alpha_factor = config.full_attn_alpha_factor
            self.attn_beta_factor = config.full_attn_beta_factor

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        output_router_logits: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> torch.Tensor:
        r"""
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            attention_mask (`torch.Tensor`, *optional*): attention mask of size
                `(batch, sequence_length)` where padding elements are indicated by 0.
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss,
                and should not be returned during inference.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs to be ignored, used for FSDP and other methods that inject code into the model.
        """
        hidden_states = self.input_layernorm(hidden_states)
        residual = hidden_states
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual * self.attn_alpha_factor + hidden_states * self.attn_beta_factor

        hidden_states = self.post_attention_layernorm(hidden_states)
        residual = hidden_states
        hidden_states, _ = self.block_sparse_moe(hidden_states)
        hidden_states = residual * self.mlp_alpha_factor + hidden_states * self.mlp_beta_factor

        return hidden_states


class MiniMaxPreTrainedModel(MixtralPreTrainedModel):
    _can_compile_fullgraph = False
    _can_record_outputs = {
        "router_logits": OutputRecorder(MiniMaxSparseMoeBlock, index=1),
        "hidden_states": MiniMaxDecoderLayer,
        "attentions": [MiniMaxAttention, MiniMaxLightningAttention],
    }


class MiniMaxModel(MixtralModel):
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[MiniMaxCache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> MoeModelOutputWithPast:
        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = MiniMaxCache()
        elif use_cache and not isinstance(past_key_values, MiniMaxCache):
            raise ValueError(
                f"MiniMax uses cache of its own and is not compatible with `past_key_values` of type "
                f"{type(past_key_values)}."
            )

        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            cache_position = torch.arange(
                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
        causal_mask = mask_function(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )

        hidden_states = inputs_embeds
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers:
            if decoder_layer.layer_type == "full_attention":
                input_attention_mask = causal_mask
            else:
                # lightning attention uses the original attention_mask, not the causal mask
                input_attention_mask = attention_mask

            hidden_states = decoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=input_attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return MoeModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


class MiniMaxForCausalLM(MixtralForCausalLM):
    def forward(self, **super_kwargs):
        r"""
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from transformers import AutoTokenizer, MiniMaxForCausalLM

>>> model = MiniMaxForCausalLM.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")
>>> tokenizer = AutoTokenizer.from_pretrained("MiniMaxAI/MiniMax-Text-01-hf")

>>> prompt = "Hey, are you conscious? Can you talk to me?"
>>> inputs = tokenizer(prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(inputs.input_ids, max_length=30)
>>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
```"""
        return super().forward(**super_kwargs)


class MiniMaxForSequenceClassification(MixtralForSequenceClassification):
    pass


class MiniMaxForTokenClassification(MixtralForTokenClassification):
    pass


class MiniMaxForQuestionAnswering(MixtralForQuestionAnswering):
    pass


__all__ = [
    "MiniMaxConfig",
    "MiniMaxPreTrainedModel",
    "MiniMaxModel",
    "MiniMaxForCausalLM",
    "MiniMaxForSequenceClassification",
    "MiniMaxForTokenClassification",
    "MiniMaxForQuestionAnswering",
]