from ...configuration_utils import PretrainedConfig
from ..auto import CONFIG_MAPPING, AutoConfig


class VoxtralEncoderConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`VoxtralEncoder`]. It is used to instantiate a
Voxtral audio encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the audio encoder of the Voxtral
architecture.

e.g. [mistralai/Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 51866):
        Vocabulary size of the model.
    hidden_size (`int`, *optional*, defaults to 1280):
        Dimensionality of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 5120):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 32):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 20):
        Number of attention heads for each attention layer in the Transformer encoder.
    scale_embedding (`bool`, *optional*, defaults to `False`):
        Scale embeddings by multiplying them by `sqrt(hidden_size)` if `True`.
    activation_function (`str`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"silu"` and `"gelu_new"` are supported.
    num_mel_bins (`int`, *optional*, defaults to 128):
        Number of mel features used per input feature. Should correspond to the value used in the
        `VoxtralProcessor` class.
    max_source_positions (`int`, *optional*, defaults to 1500):
        The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
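
For a Whisper-style audio frontend (an assumption here, based on the encoder's Whisper-like hyperparameters),
the default `max_source_positions` of 1500 corresponds to 30 seconds of audio: 100 log-mel frames per second,
halved by a stride-2 convolution:

```python
>>> frames = 30 * 100  # 30 s of audio at 100 log-mel frames per second
>>> frames // 2        # a stride-2 convolution halves the sequence length
1500
```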

```python
>>> from transformers import VoxtralEncoderConfig, VoxtralEncoder

>>> # Initializing a VoxtralEncoderConfig
>>> configuration = VoxtralEncoderConfig()

>>> # Initializing a VoxtralEncoder (with random weights)
>>> model = VoxtralEncoder(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
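
>>> # Defaults can be overridden at construction time; the sizes below are
>>> # illustrative only, not a released checkpoint:
>>> small_configuration = VoxtralEncoderConfig(hidden_size=640, num_hidden_layers=8, num_attention_heads=10)

>>> # attribute_map also exposes Whisper-style aliases for the same values:
>>> small_configuration.d_model
640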
```"""

    model_type = "voxtral_encoder"
    attribute_map = {
        "d_model": "hidden_size",
        "encoder_layers": "num_hidden_layers",
        "encoder_attention_heads": "num_attention_heads",
        "encoder_ffn_dim": "intermediate_size",
        "encoder_layerdrop": "layerdrop",
    }

    def __init__(
        self,
        vocab_size=51866,
        hidden_size=1280,
        intermediate_size=5120,
        num_hidden_layers=32,
        num_attention_heads=20,
        scale_embedding=False,
        activation_function="gelu",
        num_mel_bins=128,
        max_source_positions=1500,
        initializer_range=0.02,
        attention_dropout=0.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.scale_embedding = scale_embedding
        self.activation_function = activation_function
        self.num_mel_bins = num_mel_bins
        self.max_source_positions = max_source_positions
        self.initializer_range = initializer_range

        # These dropout values are fixed rather than exposed as arguments;
        # only attention_dropout is configurable.
        self.dropout = 0.0
        self.layerdrop = 0.0
        self.activation_dropout = 0.0
        self.attention_dropout = attention_dropout


class VoxtralConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`VoxtralForConditionalGeneration`]. It is used to instantiate a
Voxtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the Voxtral-Mini-3B.

e.g. [mistralai/Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    audio_config (`Union[AutoConfig, dict]`, *optional*):
        The config object or dictionary of the audio encoder.
    text_config (`Union[AutoConfig, dict]`, *optional*):
        The config object or dictionary of the text model.
    audio_token_id (`int`, *optional*):
        The audio token index used to encode the audio prompt.
    projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
        The activation function (function or string) in the multi-modal projector.

```python
>>> from transformers import VoxtralForConditionalGeneration, VoxtralConfig

>>> # Initializing a Voxtral configuration
>>> configuration = VoxtralConfig(audio_token_id=24, projector_hidden_act="gelu")

>>> # Initializing a 3B model with random weights
>>> model = VoxtralForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
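
>>> # Sub-configs may also be passed as plain dicts; unspecified text-model fields
>>> # fall back to the class defaults (the values below are illustrative):
>>> configuration = VoxtralConfig(
...     audio_config={"hidden_size": 1280, "num_hidden_layers": 32},
...     text_config={"model_type": "llama", "hidden_size": 3072},
...     audio_token_id=24,
... )
>>> configuration.text_config.model_type
'llama'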
```"""

    model_type = "voxtral"
    sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig}

    # Default text-model (Llama-style) hyperparameters, matching the text decoder
    # of Voxtral-Mini-3B-2507. They are merged with any user-provided text_config.
    _default_text_config_kwargs = {
        "hidden_size": 3072,
        "intermediate_size": 8192,
        "num_hidden_layers": 30,
        "num_attention_heads": 32,
        "num_key_value_heads": 8,
        "max_position_embeddings": 131072,
        "rms_norm_eps": 1e-05,
        "use_cache": True,
        "rope_theta": 100000000.0,
        "head_dim": 128,
    }

    def __init__(
        self,
        audio_config=None,
        text_config=None,
        audio_token_id=None,
        projector_hidden_act="gelu",
        **kwargs,
    ):
        if isinstance(audio_config, dict):
            audio_config["model_type"] = audio_config.get("model_type", "voxtral_encoder")
            audio_config = CONFIG_MAPPING[audio_config["model_type"]](**audio_config)
        elif audio_config is None:
            audio_config = CONFIG_MAPPING["voxtral_encoder"]()
        self.audio_config = audio_config

        if isinstance(text_config, dict):
            text_config["model_type"] = text_config.get("model_type", "llama")
            text_config = CONFIG_MAPPING[text_config["model_type"]](
                **{**self._default_text_config_kwargs, **text_config}
            )
        elif text_config is None:
            text_config = CONFIG_MAPPING["llama"](**self._default_text_config_kwargs)
        self.text_config = text_config

        self.vocab_size = text_config.vocab_size
        self.hidden_size = text_config.hidden_size

        self.audio_token_id = audio_token_id
        self.projector_hidden_act = projector_hidden_act

        super().__init__(**kwargs)


__all__ = ["VoxtralEncoderConfig", "VoxtralConfig"]