
# transformers/models/voxtral/modeling_voxtral.py
import math
from typing import Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
from ...utils.generic import check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM
from .configuration_voxtral import VoxtralConfig, VoxtralEncoderConfig


logger = logging.get_logger(__name__)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: Optional[float] = None,
    dropout: float = 0.0,
    head_mask: Optional[torch.Tensor] = None,
    **kwargs,
):
    if scaling is None:
        scaling = query.size(-1) ** -0.5

    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
    if attention_mask is not None and attention_mask.ndim == 4:
        attn_weights = attn_weights + attention_mask[:, :, :, : key.shape[-2]]

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)

    if head_mask is not None:
        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)

    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class VoxtralAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
        is_causal: bool = False,
        layer_idx: Optional[int] = None,
        config: Optional[VoxtralConfig] = None,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        self.config = config

        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
                f" and `num_heads`: {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder
        self.is_causal = is_causal

        if layer_idx is None and is_decoder:
            logger.warning_once(
                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
                "will lead to errors during the forward call if caching is used. Please make sure to provide a "
                "`layer_idx` when creating this class."
            )
        self.layer_idx = layer_idx

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""
        bsz, tgt_len, _ = hidden_states.size()

        # the query is scaled ahead of time, so the attention interface is called with scaling=1.0
        query_states = self._shape(self.q_proj(hidden_states) * self.scaling, tgt_len, bsz)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.dropout,
            scaling=1.0,
            output_attentions=output_attentions,
            head_mask=layer_head_mask,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class VoxtralEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.embed_dim = config.d_model
        self.self_attn = VoxtralAttention(
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )
        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
        self.dropout = config.dropout
        self.activation_fn = ACT2FN[config.activation_function]
        self.activation_dropout = config.activation_dropout
        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
        self.final_layer_norm = nn.LayerNorm(self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        layer_head_mask: torch.Tensor,
        output_attentions: bool = False,
    ) -> torch.Tensor:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.final_layer_norm(hidden_states)
        hidden_states = self.activation_fn(self.fc1(hidden_states))
        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.fc2(hidden_states)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
        hidden_states = residual + hidden_states

        if hidden_states.dtype == torch.float16:
            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)

        return hidden_states


@auto_docstring
class VoxtralPreTrainedModel(PreTrainedModel):
    config: VoxtralConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = None
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True

    def _init_weights(self, module):
        # the top-level config may not define `initializer_range`; fall back to the audio encoder config
        std = (
            self.config.initializer_range
            if hasattr(self.config, "initializer_range")
            else self.config.audio_config.initializer_range
        )

        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.weight.data.fill_(1.0)
            module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(VoxtralPreTrainedModel):
    """
    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
    [`VoxtralEncoderLayer`].

    Args:
        config: VoxtralEncoderConfig
    """

    config: VoxtralEncoderConfig
    main_input_name = "input_features"
    _no_split_modules = ["VoxtralEncoderLayer"]
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    def __init__(self, config: VoxtralEncoderConfig):
        super().__init__(config)
        self.dropout = config.dropout
        self.layerdrop = config.encoder_layerdrop

        embed_dim = config.d_model
        self.num_mel_bins = config.num_mel_bins
        self.padding_idx = config.pad_token_id
        self.max_source_positions = config.max_source_positions
        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0

        self.conv1 = nn.Conv1d(self.num_mel_bins, embed_dim, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, stride=2, padding=1)

        self.embed_positions = nn.Embedding(self.max_source_positions, embed_dim)
        self.embed_positions.requires_grad_(False)

        self.layers = nn.ModuleList([VoxtralEncoderLayer(config) for _ in range(config.encoder_layers)])
        self.layer_norm = nn.LayerNorm(config.d_model)
        self.avg_pooler = nn.AvgPool1d(2, stride=2)

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def get_input_embeddings(self) -> nn.Module:
        return self.conv1

    def set_input_embeddings(self, value: nn.Module):
        self.conv1 = value

    @check_model_inputs
    def forward(
        self,
        input_features,
        attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutput:
        r"""
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]
            attention_mask (`torch.Tensor`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram is ignored.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Qwen2Audio expects the mel input features to be of length {expected_seq_length}, but found"
                f" {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))

        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight

        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for encoder_layer in self.layers:
            hidden_states = encoder_layer(
                hidden_states,
                attention_mask=None,
                layer_head_mask=None,
            )

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutput(
            last_hidden_state=hidden_states,
        )

    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
        """
        Computes the output length of the convolutional layers and the output length of the audio encoder
        """
        input_lengths = (input_lengths - 1) // 2 + 1
        output_lengths = (input_lengths - 1) // 2 + 1
        return input_lengths, output_lengths
class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of Whisper encoder, a multi-modal projector and a LLama language model.
    """
)
class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}

    def __init__(self, config: VoxtralConfig):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def get_audio_embeds(self, input_features: torch.FloatTensor) -> torch.FloatTensor:
        """
        This method is used to get the audio embeddings from input features (a log mel spectrogram), meaning inferring the audio encoder and the multi-modal projector.

        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`]

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        """
        audio_outputs = self.audio_tower(input_features)
        audio_hidden_states = audio_outputs.last_hidden_state
        # group consecutive encoder frames: the hidden dim goes from d_model up to
        # audio_config.intermediate_size before being projected to the text hidden size
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        return audio_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
Example:

```python
>>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
>>> import torch

>>> device = "cuda" if torch.cuda.is_available() else "cpu"
>>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

>>> processor = AutoProcessor.from_pretrained(repo_id)
>>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

>>> conversation = [
    {
        "role": "user",
        "content": [
            {
                "type": "audio",
                "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
            },
            {"type": "text", "text": "What can you tell me about this audio?"},
        ],
    }
]

>>> inputs = processor.apply_chat_template(conversation)
>>> inputs = inputs.to(device, dtype=torch.bfloat16)

>>> outputs = model.generate(**inputs, max_new_tokens=30)
>>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_embeds(input_features)

            # replace the audio placeholder tokens with the audio embeddings
            audio_token_mask = input_ids == self.config.audio_token_id
            inputs_embeds[audio_token_mask] = audio_embeds

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        input_features = kwargs.pop("input_features", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        if cache_position is not None and cache_position[0] == 0:
            # input_features should only be passed on the prefill step, when the audio is actually encoded
            model_inputs["input_features"] = input_features

        return model_inputs


__all__ = ["VoxtralForConditionalGeneration", "VoxtralPreTrainedModel", "VoxtralEncoder"]
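# Illustrative usage sketch (the checkpoint id and chat-template call mirror the example in
# `VoxtralForConditionalGeneration.forward`; everything else here is an assumption, not part of
# the upstream module):
#
#     from transformers import AutoProcessor, VoxtralForConditionalGeneration
#
#     repo_id = "mistralai/Voxtral-Mini-3B-2507"
#     processor = AutoProcessor.from_pretrained(repo_id)
#     model = VoxtralForConditionalGeneration.from_pretrained(repo_id)
#
#     inputs = processor.apply_chat_template(conversation)  # `conversation` as in the forward() docstring
#     audio_embeds = model.get_audio_embeds(inputs.input_features)
#     # -> shape (num_audio_tokens, config.text_config.hidden_size); during forward() these rows are
#     #    scattered into the positions marked by `config.audio_token_id` in the text sequence.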