"""PyTorch Qwen3 model."""

from typing import Callable, Optional

import torch

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, logging
from ..gemma.modeling_gemma import GemmaMLP
from ..llama.modeling_llama import LlamaAttention
from ..qwen2.modeling_qwen2 import (
    Qwen2DecoderLayer,
    Qwen2ForCausalLM,
    Qwen2ForQuestionAnswering,
    Qwen2ForSequenceClassification,
    Qwen2ForTokenClassification,
    Qwen2Model,
    Qwen2PreTrainedModel,
    Qwen2RMSNorm,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from .configuration_qwen3 import Qwen3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"
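

# This is a "modular transformers" definition: Qwen3 is written as a small diff
# against Qwen2 (plus the Gemma MLP and Llama attention), and the flat
# modeling_qwen3.py is auto-generated from it by the modular converter. A
# typical invocation from a transformers source checkout (paths and flag as
# documented in the repository, shown here only for illustration):
#
#   python utils/modular_model_converter.py \
#       --files_to_parse src/transformers/models/qwen3/modular_qwen3.py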


class Qwen3RMSNorm(Qwen2RMSNorm):
    pass


class Qwen3MLP(GemmaMLP):
    pass


class Qwen3Attention(LlamaAttention):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # RMSNorm over the head dim only
        self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)  # so no reshape is needed after the norm
        self.sliding_window = config.sliding_window if config.layer_types[layer_idx] == "sliding_attention" else None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
U R                  U5      R	                  U5      R                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                   (       d  SOU R"                  U R$                  U R&                  S.UD6u  nnUR(                  " / UQSP76 R+                  5       nU R-                  U5      nUU4$ )Nr   r   )sincosrC   eagerg        )dropoutscalingr;   )shaper6   r8   q_projview	transposer9   k_projv_projr   updater0   r   r/   _attn_implementationr	   trainingattention_dropoutrL   r;   reshape
contiguouso_proj)r<   r?   r@   rA   rB   rC   rD   input_shapehidden_shapequery_states
key_statesvalue_statesrI   rH   cache_kwargsattention_interfaceattn_outputattn_weightss                     r'   forwardQwen3Attention.forwardB   s    $))#2.88b8$--8{{4;;}#=#B#B<#PQ[[\]_`a[[]!;!@!@!NOYYZ[]^_
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL..
%
 
%
!\ "));;;;FFHkk+.L((r&   )r9   r8   r;   )NN)r!   r"   r#   r$   r   intr5   torchTensortupler   r   
LongTensorr
   r   rc   r%   __classcell__r=   s   @r'   r-   r-   ;   s    v{ vs v +/59*)||*) #5<<#=>*) !.	*)
 !*) !!1!12*) -.*) 


class Qwen3DecoderLayer(Qwen2DecoderLayer):
    pass


class Qwen3PreTrainedModel(Qwen2PreTrainedModel):
    pass


class Qwen3Model(Qwen2Model):
    pass


class Qwen3ForCausalLM(Qwen2ForCausalLM):
    def forward(
        self,
        **super_kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class Qwen3ForSequenceClassification(Qwen2ForSequenceClassification):
    pass


class Qwen3ForTokenClassification(Qwen2ForTokenClassification):
    pass


class Qwen3ForQuestionAnswering(Qwen2ForQuestionAnswering):
    pass


__all__ = [
    "Qwen3ForCausalLM",
    "Qwen3ForQuestionAnswering",
    "Qwen3PreTrainedModel",
    "Qwen3Model",
    "Qwen3ForSequenceClassification",
    "Qwen3ForTokenClassification",
]