
    <h@                        S SK JrJr  S SKrS SKJr  SSKJr  SSKJrJ	r	  SSK
Jr  SSKJrJrJr  S	S
KJr  S	SKJrJrJr  S	SKJrJrJrJrJrJrJr  SSKJr  \R@                  " \!5      r" " S S\5      r# " S S\5      r$ " S S\5      r% " S S\5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\5      r*/ SQr+g)     )OptionalUnionN)nn   )Cache)BaseModelOutputWithPastMoeModelOutputWithPast)Unpack)auto_docstringcan_return_tuplelogging   )BambaConfig)
BambaMixerBambaRMSNormGated HybridMambaAttentionDynamicCache)GraniteFlashAttentionKwargsGraniteMoeSharedAttentionGraniteMoeSharedDecoderLayerGraniteMoeSharedForCausalLMGraniteMoeSharedMLPGraniteMoeSharedModelGraniteMoeSharedPreTrainedModel   )GraniteMoeHybridConfigc                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )GraniteMoeHybridAttention*   config	layer_idxc                 $   > [         TU ]  X5        g Nsuper__init__selfr   r    	__class__s      u/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/granitemoehybrid/modular_granitemoehybrid.pyr%   "GraniteMoeHybridAttention.__init__+   s    +     	__name__
__module____qualname____firstlineno__r   intr%   __static_attributes____classcell__r(   s   @r)   r   r   *   s    ,5 ,# , ,r+   r   c                   4   ^  \ rS rSrS\S\4U 4S jjrSrU =r$ )GraniteMoeHybridMambaLayer/   r   r    c                 8   > [         TU ]  [        U5      U5        g r"   )r$   r%   r   r&   s      r)   r%   #GraniteMoeHybridMambaLayer.__init__0   s    V,i8r+   r,   r-   r5   s   @r)   r7   r7   /   s    95 9# 9 9r+   r7   c                   ,   ^  \ rS rSrSU 4S jjrSrU =r$ )GraniteMoeHybridRMSNormGated4   c                 $   > [         TU ]  X5        g r"   r#   )r'   hidden_sizeepsr(   s      r)   r%   %GraniteMoeHybridRMSNormGated.__init__5   s    *r+   r,   )gư>)r.   r/   r0   r1   r%   r3   r4   r5   s   @r)   r<   r<   4   s    + +r+   r<   c                   0   ^  \ rS rSrS\4U 4S jjrSrU =r$ )GraniteMoeHybridMLP9   r   c                 $   > [         TU ]  U5        g r"   r#   r'   r   r(   s     r)   r%   GraniteMoeHybridMLP.__init__:   s     r+   r,   )r.   r/   r0   r1   r   r%   r3   r4   r5   s   @r)   rC   rC   9   s    !5 ! !r+   rC   c                   l  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\
   S\	\   S	\	\   S
\	\R                     S\	\   S\	\\R                  \R                  4      S\\   S\\R                   \	\\R                   \R                   4      4   4S jjrSrU =r$ )GraniteMoeHybridDecoderLayer>   r   r    c                   > [         TU ]  X5        [        U5      U l        S U l        S U l        UR                  U   S:X  a  [        X5      U l        O[        X5      U l        UR                  U   U l	        [        USS5      S:  U l        g )Nmambanum_local_expertsr   )r$   r%   rC   
shared_mlp	self_attnrL   layers_block_typer7   r   
layer_typegetattrhas_expertsr&   s      r)   r%   %GraniteMoeHybridDecoderLayer.__init__?   s    +-f5
##I.'93FFDJ6vIDN 229= #6+>BQFr+   hidden_statesattention_maskpast_key_valueoutput_attentions	use_cachecache_positionoutput_router_logitsposition_embeddingskwargsreturnc	                    Un
U R                  U5      nU R                  b  U R                  " SUUUUS.U	D6nSnOU R                  " SUUUUUUUS.U	D6u  pXU R                  -  -   nUn
U R	                  U5      nU R
                  (       a'  U R                  U5      u  pXR                  U5      -   nOU R                  U5      nSnXU R                  -  -   nU4nU(       a  X4-  nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    output_router_logits (`bool`, *optional*):
        Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
        should not be returned during inference.
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
    kwargs (`dict`, *optional*):
        Arbitrary kwargs.Can be used to provide `GraniteFlashAttentionKwargs` for
        padding-free training and/or improve torch.compile performance.
N)rU   rZ   cache_paramsrV   )rU   rV   rW   rX   rY   rZ   r\   r,   )input_layernormrL   rO   residual_multiplierpost_attention_layernormrS   block_sparse_moerN   )r'   rU   rV   rW   rX   rY   rZ   r[   r\   r]   residualself_attn_weightsmoe_hidden_statesrouter_logitsoutputss                  r)   forward$GraniteMoeHybridDecoderLayer.forwardO   s3   J !,,];::! JJ +-+-	
 M !%/3~~ 	0+--"3#-$7	0 	0,M !43K3K#KK !55mD/3/D/D]/S,-0NNM OOM:M M 43K3K#KK "++G''Gr+   )rS   rQ   rL   rO   rN   )NNFFNFN)r.   r/   r0   r1   r   r2   r%   torchTensorr   r   bool
LongTensortupler
   r   FloatTensorrj   r3   r4   r5   s   @r)   rI   rI   >   s   G5 G# G& 26*.,1$)59/4KOU||U !.U !	U
 $D>U D>U !!1!12U 'tnU &eELL%,,,F&GHU 45U 
u  (51B1BEDUDU1U+V"WW	XU Ur+   rI   c                   >   ^  \ rS rSr% \\S'   S/rSrU 4S jrSr	U =r
$ )GraniteMoeHybridPreTrainedModel   r   rI   Tc                   > [         TU ]  U5        [        U[        5      (       a  UR                  R
                  R                  S5        [        R                  " [        R                  " SUR                  S-   5      5      UR                  l        UR                  R
                  R                  S5        g [        U[        5      (       a&  UR                  R
                  R                  S5        g g )Ng      ?r   )r$   _init_weights
isinstancer7   dt_biasdatafill_rl   logarange	num_headsA_logDr<   weight)r'   moduler(   s     r)   rv   -GraniteMoeHybridPreTrainedModel._init_weights   s    f%f899NN%%c* %		%,,q&:J:JQ:N*O PFLLHHMM$ <==MM$$S) >r+   r,   )r.   r/   r0   r1   r   __annotations___no_split_modules_is_statefulrv   r3   r4   r5   s   @r)   rs   rs      s!    ""78L* *r+   rs   c                   v  ^  \ rS rSrS\4U 4S jjr\\           SS\R                  S\
\R                     S\
\R                     S\
\\\\R                     4      S\
\R                     S	\
\   S
\
\   S\
\   S\
\   S\
\   S\
\R                     S\\   S\\\4   4S jj5       5       rS rSrU =r$ )GraniteMoeHybridModel   r   c           	         > [         TU ]  U5        [        R                  " [	        UR
                  5       Vs/ sH  n[        X5      PM     sn5      U l        g s  snf r"   )r$   r%   r   
ModuleListrangenum_hidden_layersrI   layersr&   s      r)   r%   GraniteMoeHybridModel.__init__   sI     mmNSTZTlTlNmnNm)&<Nmn
ns   A	input_idsrV   position_idspast_key_valuesinputs_embedsrY   rX   output_hidden_statesr[   return_dictrZ   r]   r^   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
US L US L-  (       a  [        S5      eU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nXPR                  -  nU(       a  Uc  [        R                  S5        UcD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                   S9nUc  UR#                  S5      nU R%                  X%XU5      nU R'                  X+5      nUnS nU R(                  b  U R)                  UU5      nU(       a  SOS nU(       a  SOS nU	(       a  SOS nU R*                   Hj  nUR,                  S	:X  a  UOUnU(       a  UU4-  nU" U4UUUUUU	US
.UD6nUS   nU(       a  US   b	  UUS   4-  nU	(       d  MY  US   c  Ma  UUS   4-  nMl     U R/                  U5      nU(       a  UU4-  nU(       a  UR0                  (       d  SUl        [3        UUUUUS9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.FzGraniteMoeHybrid requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. Because one was not provided, no cache will be returned.r   r   devicer,   rL   )rV   rW   rX   rY   rZ   r[   r\   T)last_hidden_stater   rU   
attentionsrh   )r   rX   r   rY   use_return_dict
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensembedding_multiplierget_seq_lengthrl   r|   shaper   	unsqueeze_update_causal_mask_update_mamba_mask
rotary_embr   rQ   normhas_previous_stater	   )r'   r   rV   r   r   r   rY   rX   r   r[   r   rZ   r]   past_seen_tokenscausal_mask
mamba_maskrU   r\   all_hidden_statesall_self_attnsall_router_logitsdecoder_layer
layer_masklayer_outputss                           r)   rj   GraniteMoeHybridModel.forward   s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==Yj I  --i8M%(A(AA 0K
 !CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L..>L]
 ,,^L
 &"??&"&//-"N #7BD0d"6BD![[M'4'?'?7'JP[J#!m%55!)
)."3#-%9$7
 
M *!,M  #/"}Q'7&99N## $0%-*;)==%; )> 		-0  -!11?#E#E15O.%+++%+
 	
r+   c                 b    UnUS   S:  d!  Ub   [         R                  " US:H  5      (       a  SnU$ )zV
No need for zeroing states when
    1. Cached forward
    2. Attending to all inputs
r   Nr   )rl   all)r'   rV   rZ   r   s       r)   r   (GraniteMoeHybridModel._update_mamba_mask4  s:     $
!q ^%?EIIn`aNaDbDbJr+   )r   )NNNNNNNNNNN)r.   r/   r0   r1   r   r%   r   r   rl   ro   r   rm   r   r   listrq   rn   r
   r   rp   r   rj   r   r3   r4   r5   s   @r)   r   r      sX   
5 
  '+1537KO59$(,0/3/3&*59s
##s
 !.s
 u//0	s

 "%tE4E4E/F(F"GHs
   1 12s
 D>s
 $D>s
 'tns
 'tns
 d^s
 !!1!12s
 45s
 
u--	.s
  s
j	 	r+   r   c                   L   ^  \ rS rSrS/rS\4U 4S jjr      SS jrSrU =r	$ )GraniteMoeHybridForCausalLMi@  zlm_head.weightr   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r"   )r$   r%   r   model	post_initrF   s     r)   r%   $GraniteMoeHybridForCausalLM.__init__C  s&     *62
r+   c                 |   US L n	U	(       d]  Uc  US   UR                   S   :  a  US S 2UR                   S   * S 24   nOhUR                   S   UR                   S   :w  a	  US S 2U4   nO>U(       a7  [        U R                  UR                   S   U R                  U R                  S9nUbZ  UcW  UR                  5       R                  S5      S-
  nUR                  US:H  S5        U	(       d  US S 2UR                   S   * S 24   nUb  U	(       a  SU0n
OSUR                  5       0n
U
R                  UUUUUS.5        U
$ )Nr   r   r   r   r   r   )r   r   rY   rV   rZ   )
r   r   r   dtyper   longcumsummasked_fill_
contiguousupdate)r'   r   r   rV   r   rZ   r   rY   r]   empty_past_kvmodel_inputss              r)   prepare_inputs_for_generation9GraniteMoeHybridForCausalLM.prepare_inputs_for_generationI  sX    (4/ )!"%);;%a.*>*>q*A)A)C&CD	#~';';A'>>%a&78	>Y__Q/DKKO %,*>)..077;a?L%%n&91= +A	0B/B/D,DE $+];L')=)=)?@L ,#2&"0"0	
 r+   )r   )NNNNNT)
r.   r/   r0   r1   _tied_weights_keysr   r%   r   r3   r4   r5   s   @r)   r   r   @  s7    *+5  7 7r+   r   )r   r   rs   ),typingr   r   rl   r   cache_utilsr   modeling_outputsr   r	   processing_utilsr
   utilsr   r   r   bamba.configuration_bambar   bamba.modeling_bambar   r   r   *granitemoeshared.modeling_granitemoesharedr   r   r   r   r   r   r   configuration_granitemoehybridr   
get_loggerr.   r   r   r7   r<   rC   rI   rs   r   r   __all__r,   r+   r)   <module>r      s     #     O & > > 3 b b   C 
		H	%, 9 ,
9 9
+#4 +
!- !
f#? fR*&E *G1 GT@"= @F fr+   