from ...configuration_utils import PretrainedConfig
from ..auto import CONFIG_MAPPING, AutoConfig


class VoxtralEncoderConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`VoxtralEncoder`]. It is used to instantiate a
Voxtral audio encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the audio encoder of the Voxtral
architecture.

e.g. [mistralai/Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 51866):
        Vocabulary size of the model.
    hidden_size (`int`, *optional*, defaults to 1280):
        Dimensionality of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 5120):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 20):
        Number of attention heads for each attention layer in the Transformer encoder.
    scale_embedding (`bool`, *optional*, defaults to `False`):
        Scale embeddings by multiplying them by `sqrt(hidden_size)` if `True`.
    activation_function (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    num_mel_bins (`int`, *optional*, defaults to 128):
        Number of mel features used per input feature. Should correspond to the value used in the
        `VoxtralProcessor` class.
    max_source_positions (`int`, *optional*, defaults to 1500):
        The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
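
For a Whisper-style audio frontend (an assumption here, based on the encoder's Whisper-like hyperparameters),
the default `max_source_positions` of 1500 corresponds to 30 seconds of audio: 100 log-mel frames per second,
halved by a stride-2 convolution:

```python
>>> frames = 30 * 100  # 30 s of audio at 100 log-mel frames per second
>>> frames // 2        # a stride-2 convolution halves the sequence length
1500
```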

```python
>>> from transformers import VoxtralEncoderConfig, VoxtralEncoder

>>> # Initializing a VoxtralEncoderConfig
>>> configuration = VoxtralEncoderConfig()

>>> # Initializing a VoxtralEncoder (with random weights)
>>> model = VoxtralEncoder(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
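
>>> # Defaults can be overridden at construction time; the sizes below are
>>> # illustrative only, not a released checkpoint:
>>> small_configuration = VoxtralEncoderConfig(hidden_size=640, num_hidden_layers=8, num_attention_heads=10)

>>> # attribute_map also exposes Whisper-style aliases for the same values:
>>> small_configuration.d_model
640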
```"""

    model_type = "voxtral_encoder"
    attribute_map = {
        "d_model": "hidden_size",
        "encoder_layers": "num_hidden_layers",
        "encoder_attention_heads": "num_attention_heads",
        "encoder_ffn_dim": "intermediate_size",
        "encoder_layerdrop": "layerdrop",
    }

    def __init__(
        self,
        vocab_size=51866,
        hidden_size=1280,
        intermediate_size=5120,
        num_hidden_layers=32,
        num_attention_heads=20,
        scale_embedding=False,
        activation_function="gelu",
        num_mel_bins=128,
        max_source_positions=1500,
        initializer_range=0.02,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.scale_embedding = scale_embedding
        self.activation_function = activation_function
        self.num_mel_bins = num_mel_bins
        self.max_source_positions = max_source_positions
        self.initializer_range = initializer_range

        # These dropout values are fixed rather than exposed as arguments;
        # only attention_dropout is configurable.
        self.dropout = 0.0
        self.layerdrop = 0.0
        self.activation_dropout = 0.0
        self.attention_dropout = attention_dropout


class VoxtralConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`VoxtralForConditionalGeneration`]. It is used to instantiate a
Voxtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Voxtral-Mini-3B.

e.g. [mistralai/Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    audio_config (`Union[AutoConfig, dict]`, *optional*):
        The config object or dictionary of the audio encoder.
    text_config (`Union[AutoConfig, dict]`, *optional*):
        The config object or dictionary of the text model.
    audio_token_id (`int`, *optional*):
        The audio token index used to encode the audio prompt.
    projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The activation function (function or string) in the multi-modal projector.

```python
>>> from transformers import VoxtralForConditionalGeneration, VoxtralConfig

>>> # Initializing a Voxtral configuration
>>> configuration = VoxtralConfig(audio_token_id=24, projector_hidden_act="gelu")

>>> # Initializing a 3B model with random weights
>>> model = VoxtralForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
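
>>> # Sub-configs may also be passed as plain dicts; unspecified text-model fields
>>> # fall back to the class defaults (the values below are illustrative):
>>> configuration = VoxtralConfig(
...     audio_config={"hidden_size": 1280, "num_hidden_layers": 32},
...     text_config={"model_type": "llama", "hidden_size": 3072},
...     audio_token_id=24,
... )
>>> configuration.text_config.model_type
'llama'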
```"""

    model_type = "voxtral"
    sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig}

    # Default text-model (Llama-style) hyperparameters, matching the text decoder
    # of Voxtral-Mini-3B-2507. They are merged with any user-provided text_config.
    _default_text_config_kwargs = {
        "hidden_size": 3072,
        "intermediate_size": 8192,
        "num_hidden_layers": 30,
        "num_attention_heads": 32,
        "num_key_value_heads": 8,
        "max_position_embeddings": 131072,
        "rms_norm_eps": 1e-05,
        "use_cache": True,
        "rope_theta": 100000000.0,
        "head_dim": 128,
    }

    def __init__(
        self,
        audio_config=None,
        text_config=None,
        audio_token_id=None,
        projector_hidden_act="gelu",
        **kwargs,
    ):
        if isinstance(audio_config, dict):
            audio_config["model_type"] = audio_config.get("model_type", "voxtral_encoder")
            audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config)
        elif audio_config is None:
            audio_config = CONFIG_MAPPING["voxtral_encoder"]()
        self.audio_config = audio_config

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "llama")
            text_config = CONFIG_MAPPING[text_config["model_type"]](
                **{**self._default_text_config_kwargs, **text_config}
            )
        elif text_config is None:
            text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs)
        self.text_config = text_config

        self.vocab_size = text_config.vocab_size
        self.hidden_size = text_config.hidden_size

        self.audio_token_id = audio_token_id
        self.projector_hidden_act = projector_hidden_act

        super().__init__(**kwargs)


__all__ = ["VoxtralEncoderConfig", "VoxtralConfig"]