
    <h.                     :   S SK JrJr  S SKrS SKrS SKJr  SSKJrJr  SSK	J
r
  SSKJrJr  SSKJr  SS	KJrJr  S
SKJrJrJrJrJr  SSKJr  \R6                  " \5      r " S S\5      r " S S\5      r " S S\5      r  " S S\5      r! " S S\5      r"/ SQr#g)    )OptionalUnionN)nn   )CacheDynamicCache)create_causal_mask)BaseModelOutputWithPastCausalLMOutputWithPast)Unpack)TransformersKwargslogging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelLlamaPreTrainedModel   )GraniteConfigc                   B   ^  \ rS rSrSrSS\S\\   4U 4S jjjrSr	U =r
$ )GraniteAttention(   z=Multi-headed attention from 'Attention Is All You Need' paperconfig	layer_idxc                 F   > [         TU ]  X5        UR                  U l        g N)super__init__attention_multiplierscalingselfr   r   	__class__s      c/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/granite/modular_granite.pyr   GraniteAttention.__init__+   s    +22    )r!   r   )__name__
__module____qualname____firstlineno____doc__r   r   intr   __static_attributes____classcell__r$   s   @r%   r   r   (   s"    G3} 3# 3 3r'   r   c                   v  ^  \ rS rSrS\S\4U 4S jjr       SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\	\R                     S\	\\R                  \R                  4      S\\R                  \	\\R                  \R                  4      4   4S jjrSrU =r$ )GraniteDecoderLayer0   r   r   c                 b   > [         TU ]  X5        UR                  U l        [        XS9U l        g )N)r   r   )r   r   residual_multiplierr   	self_attnr"   s      r%   r   GraniteDecoderLayer.__init__1   s*    +#)#=#= )Mr'   hidden_statesattention_maskposition_idspast_key_valueoutput_attentions	use_cachecache_positionposition_embeddingsreturnc	                    Un
U R                  U5      nU R                  " SUUUUUUUUS.U	D6u  pXU R                  -  -   nUn
U R                  U5      nU R	                  U5      nXU R                  -  -   nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`, *optional*):
        attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
        query_sequence_length, key_sequence_length)` if default attention is used.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    use_cache (`bool`, *optional*):
        If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
        (see `past_key_values`).
    past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
    cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
        Indices depicting the position of the input sequence tokens in the sequence
    position_embeddings (`tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*):
        Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`,
        with `head_dim` being the embedding dimension of each attention head.
    kwargs (`dict`, *optional*):
        Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
        into the model
)r8   r9   r:   r;   r<   r=   r>   r?    )input_layernormr6   r5   post_attention_layernormmlp)r#   r8   r9   r:   r;   r<   r=   r>   r?   kwargsresidualself_attn_weightsoutputss                r%   forwardGraniteDecoderLayer.forward6   s    D !,,]; ,0>> 
,
')%)/) 3
,
 
,
( !43K3K#KK !55mD/ 43K3K#KK "++Gr'   )r5   r6   )NNNFFNN)r(   r)   r*   r+   r   r-   r   torchTensorr   
LongTensorr   booltupleFloatTensorrJ   r.   r/   r0   s   @r%   r2   r2   0   s   N} N N 2637*.,1$)59KO?||? !.? u//0	?
 !? $D>? D>? !!1!12? &eELL%,,,F&GH? 
u  (51B1BEDUDU1U+V"WW	X? ?r'   r2   c                       \ rS rSrSrg)GranitePreTrainedModelx   rB   N)r(   r)   r*   r+   r.   rB   r'   r%   rS   rS   x   s    r'   rS   c                     ^  \ rS rSrS\4U 4S jjr         SS\\R                     S\\R                     S\\R                     S\\
   S\\R                     S	\\   S
\\   S\\   S\\R                     S\\   S\4S jjrSrU =r$ )GraniteModel|   r   c           	         > [         TU ]  U5        UR                  U l        [        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        g s  snf r   )	r   r   embedding_multiplierr   
ModuleListrangenum_hidden_layersr2   layersr"   s      r%   r   GraniteModel.__init__}   sV     $*$?$?!mmEJ6KcKcEdeEd	 3Ede
es   	A*	input_idsr9   r:   past_key_valuesinputs_embedsr=   r<   output_hidden_statesr>   rF   r@   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [	        S5      eU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnUc  U R                  U5      nXPR                  -  nU(       a  Uc
  [        5       nU	cD  Ub  UR                  5       OSn[        R                  " XUR                  S   -   UR                   S9n	Uc  U	R#                  S5      n[%        U R                   UUU	UUS9nUnU R'                  X5      nU(       a  SOS nU(       a  SOS nU R(                  S U R                   R*                    H7  nU(       a  X4-  nU" U4UUUUUU	US	.U
D6nUS   nU(       d  M.  UUS   4-  nM9     U R-                  U5      nU(       a  X4-  n[/        UU(       a  UOS UUS
9$ )Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r   )device)r   input_embedsr9   r>   r`   r:   rB   )r9   r:   r;   r<   r=   r>   r?   )last_hidden_stater`   r8   
attentions)r   r<   rb   r=   
ValueErrorgradient_checkpointingtrainingloggerwarning_onceembed_tokensrY   r   get_seq_lengthrL   arangeshaperd   	unsqueezer	   
rotary_embr]   r\   normr
   )r#   r_   r9   r:   r`   ra   r=   r<   rb   r>   rF   past_seen_tokenscausal_maskr8   r?   all_hidden_statesall_self_attnsdecoder_layerlayer_outputss                      r%   rJ   GraniteModel.forward   s;    2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M%(A(AA0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 & #oomJ #7BD0d![[)H4;;+H+HIM#!%55!)
*)."3#-$7
 
M *!,M  =#3"55' J* 		-0  !11&+/8Od+%	
 	
r'   )rY   r]   )	NNNNNNNNN)r(   r)   r*   r+   r   r   r   rL   rN   rM   r   rQ   rO   r   r   r
   rJ   r.   r/   r0   s   @r%   rV   rV   |   s    
} 
 151537+/59$(,0/359_
E,,-_
 !._
 u//0	_

 "%_
   1 12_
 D>_
 $D>_
 'tn_
 !!1!12_
 +,_
 
!_
 _
r'   rV   c                   h   \ rS rSr           SS\\R                     S\\R                     S\\R                     S\\\	\
\R                     4      S\\R                     S\\R                     S	\\   S
\\   S\\   S\\R                     S\\\R                  4   S\\   S\4S jjrSrg)GraniteForCausalLM   Nr_   r9   r:   r`   ra   labelsr=   r<   rb   r>   logits_to_keeprF   r@   c                     Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U R                  " SUUUUUUUU	U
S.	UD6nUR                  n[        U[        5      (       a  [        U* S 5      OUnU R                  US S 2US S 24   5      nUU R                   R                  -  nS nUb)  U R                  " SUX`R                   R                  S.UD6n[        UUUR                  UR                  UR                  S9$ )N)	r_   r9   r:   r`   ra   r=   r<   rb   r>   )logitsr~   
vocab_size)lossr   r`   r8   rg   rB   )r   r<   rb   modelrf   
isinstancer-   slicelm_headlogits_scalingloss_functionr   r   r`   r8   rg   )r#   r_   r9   r:   r`   ra   r~   r=   r<   rb   r>   r   rF   rI   r8   slice_indicesr   r   s                     r%   rJ   GraniteForCausalLM.forward   s+    2C1N-TXT_T_TqTq$8$D $++JjJj 	
 ,0:: ,
)%+'/!5),
 ,
  118B>SV8W8W~ot4]kmA}a,?@A$++444%%pVF{{OeOepiopD%#33!//))
 	
r'   rB   )NNNNNNNNNNr   )r(   r)   r*   r+   r   rL   rN   rM   r   r   listrQ   rO   r-   r   r   r   rJ   r.   rB   r'   r%   r|   r|      s)    151537KO59-1$(,0/359342
E,,-2
 !.2
 u//0	2

 "%tE4E4E/F(F"GH2
   1 122
 ))*2
 D>2
 $D>2
 'tn2
 !!1!122
 c5<</02
 +,2
 
 2
 2
r'   r|   )r|   rV   rS   )$typingr   r   rL   torch.utils.checkpointr   cache_utilsr   r   masking_utilsr	   modeling_outputsr
   r   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   configuration_graniter   
get_loggerr(   rk   r   r2   rS   rV   r|   __all__rB   r'   r%   <module>r      s     #    . / O & 0  1 
		H	%3~ 3E+ EP	1 	g
: g
T3
) 3
l Kr'   