
    <hP                         S r SSKJr  SSKJr  SSKJr  SSKJr  \R                  " \
5      r " S S\5      r " S	 S
\5      r " S S\5      r/ SQrg)zDia model configuration    )Optional   )PretrainedConfig)rope_config_validation)loggingc                      ^  \ rS rSrSrSr             SS\S\S\S\S\S	\S
\S\S\S\S\S\	\
   S\4U 4S jjjrSrU =r$ )DiaEncoderConfig   a  
This is the configuration class to store the configuration of a [`DiaEncoder`]. It is used to instantiate a Dia
encoder according to the specified arguments, defining the encoder architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    max_position_embeddings (`int`, *optional*, defaults to 1024):
        The maximum sequence length that this model might ever be used with.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_key_value_heads (`int`, *optional*, defaults to 16):
        Number of key and value heads for each attention layer in the Transformer encoder.
    head_dim (`int`, *optional*, defaults to 128):
        Dimensionality of the attention head.
    intermediate_size (`int`, *optional*, defaults to 4096):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the normalization layers.
    vocab_size (`int`, *optional*, defaults to 256):
        Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the
        `input_ids` passed when calling [`DiaModel`].
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"swish"` and `"gelu_new"` are supported.
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
        and expect the model to work on a longer `max_position_embeddings`, we recommend updating this value
        accordingly. An illustrative `rope_scaling` dict is shown in the example at the end of this docstring.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to the value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
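
Example (a minimal usage sketch; the `rope_scaling` dict below is illustrative and only uses fields documented
above):

```python
>>> from transformers import DiaEncoderConfig

>>> # Initializing an encoder config with default values
>>> configuration = DiaEncoderConfig()
>>> configuration.hidden_size
1024

>>> # Illustrative: enabling linear RoPE scaling through the documented `rope_scaling` fields
>>> scaled_configuration = DiaEncoderConfig(rope_scaling={"rope_type": "linear", "factor": 2.0})
```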
    """

    model_type = "dia_encoder"

    def __init__(
        self,
        max_position_embeddings: int = 1024,
        num_hidden_layers: int = 12,
        hidden_size: int = 1024,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 128,
        intermediate_size: int = 4096,
        norm_eps: float = 1e-5,
        vocab_size: int = 256,
        hidden_act: str = "silu",
        rope_theta: float = 10000.0,
        rope_scaling: Optional[dict] = None,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.norm_eps = norm_eps
        self.vocab_size = vocab_size
        self.hidden_act = hidden_act
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # Validate the RoPE configuration, accepting the legacy `type` key as an alias for `rope_type`
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.initializer_range = initializer_range
        super().__init__(**kwargs)


class DiaDecoderConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`DiaDecoder`]. It is used to instantiate a Dia
decoder according to the specified arguments, defining the decoder architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    max_position_embeddings (`int`, *optional*, defaults to 3072):
        The maximum sequence length that this model might ever be used with.
    num_hidden_layers (`int`, *optional*, defaults to 18):
        Number of hidden layers in the Transformer decoder.
    hidden_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the decoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 8192):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*, defaults to 4):
        Number of key and value heads for each attention layer in the Transformer decoder.
    head_dim (`int`, *optional*, defaults to 128):
        Dimensionality of the attention head.
    cross_num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each cross-attention layer in the Transformer decoder.
    cross_head_dim (`int`, *optional*, defaults to 128):
        Dimensionality of the cross-attention head.
    cross_num_key_value_heads (`int`, *optional*, defaults to 16):
        Number of key and value heads for each cross-attention layer in the Transformer decoder.
    cross_hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the cross-attention layers.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the normalization layers.
    vocab_size (`int`, *optional*, defaults to 1028):
        Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the
        `input_ids` passed when calling [`DiaModel`].
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder. If string, `"gelu"`, `"relu"`,
        `"swish"` and `"gelu_new"` are supported.
    num_channels (`int`, *optional*, defaults to 9):
        Number of channels for the Dia decoder.
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
        and expect the model to work on a longer `max_position_embeddings`, we recommend updating this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to the value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
    is_encoder_decoder (`bool`, *optional*, defaults to `True`):
        Indicating that this model is part of an encoder-decoder architecture.
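
Example (a minimal usage sketch; the non-default values below are purely illustrative):

```python
>>> from transformers import DiaDecoderConfig

>>> # Initializing a decoder config with default values
>>> configuration = DiaDecoderConfig()
>>> configuration.num_channels
9

>>> # Illustrative: a smaller decoder; `num_channels` must stay consistent with `DiaConfig.delay_pattern`
>>> small_configuration = DiaDecoderConfig(num_hidden_layers=6, hidden_size=512, intermediate_size=2048)
```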
    """

    model_type = "dia_decoder"

    def __init__(
        self,
        max_position_embeddings: int = 3072,
        num_hidden_layers: int = 18,
        hidden_size: int = 2048,
        intermediate_size: int = 8192,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 4,
        head_dim: int = 128,
        cross_num_attention_heads: int = 16,
        cross_head_dim: int = 128,
        cross_num_key_value_heads: int = 16,
        cross_hidden_size: int = 1024,
        norm_eps: float = 1e-5,
        vocab_size: int = 1028,
        hidden_act: str = "silu",
        num_channels: int = 9,
        rope_theta: float = 10000.0,
        rope_scaling: Optional[dict] = None,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        is_encoder_decoder: bool = True,
        **kwargs,
    ):
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.cross_hidden_size = cross_hidden_size
        self.cross_num_attention_heads = cross_num_attention_heads
        self.cross_head_dim = cross_head_dim
        self.cross_num_key_value_heads = cross_num_key_value_heads
        self.norm_eps = norm_eps
        self.vocab_size = vocab_size
        self.hidden_act = hidden_act
        self.num_channels = num_channels
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # Validate the RoPE configuration, accepting the legacy `type` key as an alias for `rope_type`
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.initializer_range = initializer_range
        self.use_cache = use_cache
        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)


class DiaConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`DiaModel`]. It is used to instantiate a
Dia model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the
[nari-labs/Dia-1.6B](https://huggingface.co/nari-labs/Dia-1.6B) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    encoder_config (`DiaEncoderConfig`, *optional*):
        Configuration for the encoder part of the model. If not provided, a default `DiaEncoderConfig` will be used.
    decoder_config (`DiaDecoderConfig`, *optional*):
        Configuration for the decoder part of the model. If not provided, a default `DiaDecoderConfig` will be used.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the normalization layers.
    is_encoder_decoder (`bool`, *optional*, defaults to `True`):
        Indicating that this model uses an encoder-decoder architecture.
    pad_token_id (`int`, *optional*, defaults to 1025):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 1024):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 1026):
        Beginning of stream token id.
    delay_pattern (`list[int]`, *optional*, defaults to `[0, 8, 9, 10, 11, 12, 13, 14, 15]`):
        The delay pattern for the decoder. The length of this list must match `decoder_config.num_channels`.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).

Example:

```python
>>> from transformers import DiaConfig, DiaModel

>>> # Initializing a DiaConfig with default values
>>> configuration = DiaConfig()

>>> # Initializing a DiaModel (with random weights) from the configuration
>>> model = DiaModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
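
If you override the delay pattern, keep it in sync with the decoder's `num_channels`; the values below are a
purely illustrative sketch of that constraint:

```python
>>> from transformers import DiaConfig, DiaDecoderConfig

>>> # Illustrative: a 4-channel decoder paired with a 4-entry delay pattern
>>> decoder_config = DiaDecoderConfig(num_channels=4)
>>> configuration = DiaConfig(decoder_config=decoder_config, delay_pattern=[0, 8, 9, 10])
>>> len(configuration.delay_pattern) == configuration.decoder_config.num_channels
True
```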
    """

    model_type = "dia"
    keys_to_ignore_at_inference = ["past_key_values"]
    sub_configs = {"encoder_config": DiaEncoderConfig, "decoder_config": DiaDecoderConfig}

    def __init__(
        self,
        encoder_config: Optional[DiaEncoderConfig] = None,
        decoder_config: Optional[DiaDecoderConfig] = None,
        norm_eps: float = 1e-5,
        is_encoder_decoder: bool = True,
        pad_token_id: int = 1025,
        eos_token_id: int = 1024,
        bos_token_id: int = 1026,
        delay_pattern: Optional[list[int]] = None,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        **kwargs,
    ):
        # Sub-configs may arrive as plain dicts (e.g. when loading from JSON); convert them first
        if isinstance(encoder_config, dict):
            encoder_config = DiaEncoderConfig(**encoder_config)
        if isinstance(decoder_config, dict):
            decoder_config = DiaDecoderConfig(**decoder_config)
        self.encoder_config = encoder_config if encoder_config is not None else DiaEncoderConfig()
        self.decoder_config = decoder_config if decoder_config is not None else DiaDecoderConfig()
        self.norm_eps = norm_eps
        self.delay_pattern = delay_pattern if delay_pattern is not None else [0, 8, 9, 10, 11, 12, 13, 14, 15]
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        if self.decoder_config.num_channels != len(self.delay_pattern):
            raise ValueError("Number of channels must match delay pattern length.")

        super().__init__(
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )

    def get_text_config(self, decoder=False):
        """Defaulting to audio config as it's the decoder in this case which is usually the text backbone"""
        return self.decoder_config


__all__ = ["DiaConfig", "DiaEncoderConfig", "DiaDecoderConfig"]