
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPast, CausalLMOutputWithPast
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.generic import check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM
from ..qwen2_audio.modeling_qwen2_audio import (
    Qwen2AudioAttention,
    Qwen2AudioEncoder,
    Qwen2AudioEncoderLayer,
    Qwen2AudioPreTrainedModel,
)
from .configuration_voxtral import VoxtralConfig


class VoxtralAttention(Qwen2AudioAttention):
    pass


class VoxtralEncoderLayer(Qwen2AudioEncoderLayer):
    pass


class VoxtralPreTrainedModel(Qwen2AudioPreTrainedModel):
    _supports_flex_attn = True
    _supports_cache_class = True
    _supports_attention_backend = True
    _can_compile_fullgraph = True
    _no_split_modules = None


@auto_docstring(
    custom_intro="""
    The Voxtral encoder, which is a Whisper encoder.
    """
)
class VoxtralEncoder(Qwen2AudioEncoder):
    _can_record_outputs = {
        "attentions": VoxtralAttention,
        "hidden_states": VoxtralEncoderLayer,
    }

    @check_model_inputs
    def forward(
        self,
        input_features,
        attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        r"""
        Args:
            input_features (`torch.LongTensor` of shape `(batch_size, feature_size, sequence_length)`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`].
            attention_mask (`torch.Tensor`, *optional*):
                Voxtral does not support masking of the `input_features`, this argument is preserved for compatibility,
                but it is not used. By default the silence in the input log mel spectrogram is ignored.
        """
        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[1]
        if input_features.shape[-1] != expected_seq_length:
            raise ValueError(
                f"Qwen2Audio expects the mel input features to be of length {expected_seq_length}, "
                f"but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
            )

        input_features = input_features.to(dtype=self.conv1.weight.dtype, device=self.conv1.weight.device)
        inputs_embeds = nn.functional.gelu(self.conv1(input_features))
        inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
        inputs_embeds = inputs_embeds.permute(0, 2, 1)
        embed_pos = self.embed_positions.weight

        # the positional embeddings may be kept in fp32, so cast the sum back to the activations' dtype
        hidden_states = (inputs_embeds + embed_pos).to(inputs_embeds.dtype)
        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)

        for idx, encoder_layer in enumerate(self.layers):
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask=None,
                layer_head_mask=None,
            )
            hidden_states = layer_outputs[0]

        hidden_states = self.layer_norm(hidden_states)
        return BaseModelOutput(last_hidden_state=hidden_states)


class VoxtralMultiModalProjector(nn.Module):
    def __init__(self, config: VoxtralConfig):
        super().__init__()
        self.linear_1 = nn.Linear(config.audio_config.intermediate_size, config.text_config.hidden_size, bias=False)
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=False)

    def forward(self, audio_features):
        hidden_states = self.linear_1(audio_features)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring(
    custom_intro="""
    The Voxtral model, which consists of a Whisper encoder, a multi-modal projector and a Llama language model.
    """
)
class VoxtralForConditionalGeneration(VoxtralPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]
    _tp_plan = {"lm_head": "colwise_rep"}
    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
    _keep_in_fp32_modules_strict = ["embed_positions"]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.text_config.vocab_size
        self.audio_tower = AutoModel.from_config(config.audio_config)
        self.language_model = AutoModelForCausalLM.from_config(config.text_config)
        self.multi_modal_projector = VoxtralMultiModalProjector(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def get_output_embeddings(self):
        return self.language_model.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def set_decoder(self, decoder):
        self.language_model.set_decoder(decoder)

    def get_decoder(self):
        return self.language_model.get_decoder()

    def get_audio_embeds(self, input_features: torch.FloatTensor):
        """
        This method is used to get the audio embeddings from input features (a log mel spectrogram), i.e. it runs
        the audio encoder and the multi-modal projector.

        Args:
            input_features (`torch.FloatTensor`):
                Float values of mel features extracted from the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_features`, the [`AutoFeatureExtractor`] should be used for extracting the mel features, padding
                and conversion into a tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`].

        Returns:
            `torch.FloatTensor`:
                The audio embeddings.
        """
        audio_outputs = self.audio_tower(input_features)
        audio_hidden_states = audio_outputs.last_hidden_state
        audio_hidden_states = audio_hidden_states.reshape(-1, self.config.audio_config.intermediate_size)
        audio_embeds = self.multi_modal_projector(audio_hidden_states)
        return audio_embeds

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> CausalLMOutputWithPast:
        r"""
        Example:

        ```python
        >>> from transformers import VoxtralForConditionalGeneration, AutoProcessor
        >>> import torch

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> repo_id = "mistralai/Voxtral-Mini-3B-2507"

        >>> processor = AutoProcessor.from_pretrained(repo_id)
        >>> model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)

        >>> conversation = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "audio",
        ...                 "url": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
        ...             },
        ...             {"type": "text", "text": "What can you tell me about this audio?"},
        ...         ],
        ...     }
        ... ]

        >>> inputs = processor.apply_chat_template(conversation)
        >>> inputs = inputs.to(device, dtype=torch.bfloat16)

        >>> outputs = model.generate(**inputs, max_new_tokens=30)
        >>> processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
        ["This audio is a humorous conversation between two friends, likely in English, where one of them is trying to figure out what the other's tattoo says."]
        ```"""
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_embeds(input_features)

            # replace the audio placeholder tokens with the projected audio embeddings
            audio_token_mask = input_ids == self.config.audio_token_id
            inputs_embeds[audio_token_mask] = audio_embeds

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    def prepare_inputs_for_generation(self, *args, **kwargs):
        input_features = kwargs.pop("input_features", None)
        cache_position = kwargs.get("cache_position")

        model_inputs = super().prepare_inputs_for_generation(*args, **kwargs)

        # input_features are only needed for the prefill pass: once the audio embeddings have been
        # merged into the prompt, subsequent decoding steps operate on text tokens only
        if cache_position is not None and cache_position[0] == 0:
            model_inputs["input_features"] = input_features

        return model_inputs
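

# Illustrative sketch, not part of the upstream module: how VoxtralForConditionalGeneration.forward
# above splices the projected audio embeddings into the text embedding sequence. Token id 7 stands
# in for config.audio_token_id and all sizes are toy values chosen for the example.
def _audio_token_splice_sketch():
    audio_token_id = 7
    input_ids = torch.tensor([[1, 7, 7, 7, 2]])  # a prompt with three audio placeholder tokens
    inputs_embeds = torch.zeros(1, 5, 4)  # stand-in for get_input_embeddings()(input_ids)
    audio_embeds = torch.ones(3, 4)  # stand-in for get_audio_embeds(input_features), one row per placeholder

    audio_token_mask = input_ids == audio_token_id  # boolean mask over (batch, seq_len)
    inputs_embeds[audio_token_mask] = audio_embeds  # in-place scatter, as in forward()
    return inputs_embeds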


__all__ = ["VoxtralPreTrainedModel", "VoxtralEncoder", "VoxtralForConditionalGeneration"]