
from typing import Optional, TypedDict

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...processing_utils import Unpack
from ...utils import logging
from ..granitemoe.modeling_granitemoe import (
    GraniteMoeDecoderLayer,
    GraniteMoeForCausalLM,
    GraniteMoeModel,
    GraniteMoePreTrainedModel,
)
from .configuration_granitemoeshared import GraniteMoeSharedConfig


logger = logging.get_logger(__name__)


class GraniteFlashAttentionKwargs(TypedDict, total=False):
    """
    Keyword arguments for advanced Flash Attention, causal-conv1d, and mamba_ssm kernel usage.
    Use cases include padding-free training and fewer `torch.compile` graph breaks.

    Attributes:
        cu_seq_lens_q (`torch.LongTensor`):
            Cumulative sequence lengths for the query state.
        cu_seq_lens_k (`torch.LongTensor`):
            Cumulative sequence lengths for the key state.
        max_length_q (`int`):
            Maximum sequence length for the query state.
        max_length_k (`int`):
            Maximum sequence length for the key state.
        seq_idx (`torch.IntTensor`):
            Index of each packed sequence.
    """

    cu_seq_lens_q: torch.LongTensor
    cu_seq_lens_k: torch.LongTensor
    max_length_q: int
    max_length_k: int
    seq_idx: torch.IntTensor
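
# Illustrative sketch (not part of the original file): how a padding-free batch
# of two packed sequences with lengths 3 and 5 might populate these kwargs.
# The tensor values below are assumptions for illustration only.
#
#     lens = torch.tensor([3, 5])
#     cu_seq_lens = torch.nn.functional.pad(lens.cumsum(0), (1, 0))  # tensor([0, 3, 8])
#     kwargs = GraniteFlashAttentionKwargs(
#         cu_seq_lens_q=cu_seq_lens,
#         cu_seq_lens_k=cu_seq_lens,
#         max_length_q=5,
#         max_length_k=5,
#         seq_idx=torch.tensor([[0, 0, 0, 1, 1, 1, 1, 1]], dtype=torch.int32),
#     )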
U =r$ )	GraniteMoeSharedMLP>   zj
MLP layer for shared experts

Args:
    config:
        Configuration object with model hyperparameters.

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__()

        self.input_size = config.hidden_size
        self.hidden_size = config.shared_intermediate_size
        self.activation = ACT2FN[config.hidden_act]
        self.input_linear = nn.Linear(self.input_size, self.hidden_size * 2, bias=False)
        self.output_linear = nn.Linear(self.hidden_size, self.input_size, bias=False)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # Project up to twice the shared intermediate size, split into gate and
        # value halves, gate with the activation, then project back down.
        hidden_states = self.input_linear(hidden_states)
        chunked_hidden_states = hidden_states.chunk(2, dim=-1)
        hidden_states = self.activation(chunked_hidden_states[0]) * chunked_hidden_states[1]
        hidden_states = self.output_linear(hidden_states)
        return hidden_states
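
# Shape walk-through (illustrative, assuming hidden_size=H and
# shared_intermediate_size=S): input_linear maps (..., H) -> (..., 2*S),
# chunk splits that into a gate and a value of shape (..., S) each, and
# output_linear maps act(gate) * value back to (..., H). This is the familiar
# gated-MLP (SwiGLU-style) pattern used by many modern decoder blocks.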


class GraniteMoeSharedDecoderLayer(GraniteMoeDecoderLayer):
    def __init__(self, config: GraniteMoeSharedConfig, layer_idx: int):
        super().__init__(config, layer_idx)
        # The shared expert is optional: it is skipped entirely when the config
        # requests no shared intermediate size.
        self.shared_mlp = None if config.shared_intermediate_size == 0 else GraniteMoeSharedMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        output_router_logits: Optional[bool] = False,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        **kwargs: Unpack[GraniteFlashAttentionKwargs],
    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*):
                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
                query_sequence_length, key_sequence_length)` if default attention is used.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Cache`, *optional*): cached past key and value projection states.
            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
                Indices depicting the position of the input sequence tokens in the sequence.
            output_router_logits (`bool`, *optional*):
                Whether or not to return the logits of all the routers. They are useful for computing the router loss
                and should not be returned during inference.
            position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
                Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
                with `head_dim` being the embedding dimension of each attention head.
            kwargs (`dict`, *optional*):
                Arbitrary kwargs. Can be used to provide `GraniteFlashAttentionKwargs` for
                padding-free training and/or to improve `torch.compile` performance.
        """
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        hidden_states, self_attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )
        hidden_states = residual + hidden_states * self.residual_multiplier

        # Fully Connected: routed experts plus the optional shared expert
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        moe_hidden_states, router_logits = self.block_sparse_moe(hidden_states)

        if self.shared_mlp is None:
            hidden_states = moe_hidden_states
        else:
            hidden_states = moe_hidden_states + self.shared_mlp(hidden_states)
        del moe_hidden_states

        hidden_states = residual + hidden_states * self.residual_multiplier

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights,)

        if output_router_logits:
            outputs += (router_logits,)

        return outputs


class GraniteMoeSharedPreTrainedModel(GraniteMoePreTrainedModel):
    config: GraniteMoeSharedConfig
    _no_split_modules = ["GraniteMoeSharedDecoderLayer"]


class GraniteMoeSharedModel(GraniteMoeModel):
    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.layers = nn.ModuleList(
            [GraniteMoeSharedDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )


class GraniteMoeSharedForCausalLM(GraniteMoeForCausalLM):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: GraniteMoeSharedConfig):
        super().__init__(config)
        self.model = GraniteMoeSharedModel(config)
        # Initialize weights and apply final processing
        self.post_init()


__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"]