
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM
from ..qwen2_audio.modeling_qwen2_audio import (
    Qwen2AudioAttention,
    Qwen2AudioEncoder,
    Qwen2AudioEncoderLayer,
    Qwen2AudioPreTrainedModel,
)
from .configuration_voxtral import VoxtralConfig


class VoxtralAttention(Qwen2AudioAttention):
    pass


class VoxtralEncoderLayer(Qwen2AudioEncoderLayer):
    pass


class VoxtralPreTrainedModel(Qwen2AudioPreTrainedModel):
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True
    _no_split_modules = None


@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(Qwen2AudioEncoder):
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    @check_model_inputs
    def forward(
        self,
        input_features,
        attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        r"""
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`].
            attention_mask (`torch.Tensor`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram is ignored.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[1]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Qwen2Audio expects the mel input features to be of length {expected_seq_length}, "
                f"but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight

        # the positional embeddings may be kept in fp32, so cast the sum back to the activations' dtype
        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=None,
                layer_head_mask=None,
            )
            hidden_states = layer_outputs[0]

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)


class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of a Whisper encoder, a multi-modal projector and a Llama language model.
    """
)
class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    _keep_in_fp32_modules_strict = ["embed_positions"]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def get_audio_embeds(self, input_features: torch.FloatTensor):
        """
        This method is used to get the audio embeddings from input features (a log mel spectrogram), i.e. it runs
        the audio encoder and the multi-modal projector.

        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`].

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        """
        audio_outputs = self.audio_tower(input_features)
        audio_hidden_states = audio_outputs.last_hidden_state
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        return audio_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "audio",
        ...                 "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
        ...             },
        ...             {"type": "text", "text": "What can you tell me about this audio?"},
        ...         ],
        ...     }
        ... ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_embeds(input_features)

            # replace the audio placeholder tokens with the projected audio embeddings
            audio_token_mask = input_ids == self.config.audio_token_id
            inputs_embeds[audio_token_mask] = audio_embeds

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        input_features = kwargs.pop("input_features", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        # input_features are only needed for the prefill pass: once the audio embeddings have been
        # merged into the prompt, subsequent decoding steps operate on text tokens only
        if cache_position is not None and cache_position[0] == 0:
            model_inputs["input_features"] = input_features

        return model_inputs
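

# Illustrative sketch, not part of the upstream module: how VoxtralForConditionalGeneration.forward
# above splices the projected audio embeddings into the text embedding sequence. Token id 7 stands
# in for config.audio_token_id and all sizes are toy values chosen for the example.
def _audio_token_splice_sketch():
    audio_token_id = 7
    input_ids = torch.tensor([[1, 7, 7, 7, 2]])  # a prompt with three audio placeholder tokens
    inputs_embeds = torch.zeros(1, 5, 4)  # stand-in for get_input_embeddings()(input_ids)
    audio_embeds = torch.ones(3, 4)  # stand-in for get_audio_embeds(input_features), one row per placeholder

    audio_token_mask = input_ids == audio_token_id  # boolean mask over (batch, seq_len)
    inputs_embeds[audio_token_mask] = audio_embeds  # in-place scatter, as in forward()
    return inputs_embeds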


__all__ = ["VoxtralPreTrainedModel", "VoxtralEncoder", "VoxtralForConditionalGeneration"]