
    <hP                         S r SSKJr  SSKJr  SSKJr  SSKJr  \R                  " \
5      r " S S\5      r " S	 S
\5      r " S S\5      r/ SQrg)zDia model configuration    )Optional   )PretrainedConfig)rope_config_validation)loggingc                      ^  \ rS rSrSrSr             SS\S\S\S\S\S	\S
\S\S\S\S\S\	\
   S\4U 4S jjjrSrU =r$ )DiaEncoderConfig   a  
This is the configuration class to store the configuration of a [`DiaEncoder`]. It is used to instantiate a Dia
encoder according to the specified arguments, defining the encoder architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    max_position_embeddings (`int`, *optional*, defaults to 1024):
        The maximum sequence length that this model might ever be used with.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_key_value_heads (`int`, *optional*, defaults to 16):
        Number of key and value heads for each attention layer in the Transformer encoder.
    head_dim (`int`, *optional*, defaults to 128):
        Dimensionality of the attention head.
    intermediate_size (`int`, *optional*, defaults to 4096):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the normalization layers.
    vocab_size (`int`, *optional*, defaults to 256):
        Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the
        `input_ids` passed when calling [`DiaModel`].
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"swish"` and `"gelu_new"` are supported.
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
        and expect the model to work on a longer `max_position_embeddings`, we recommend updating this value
        accordingly. An illustrative `rope_scaling` dict is shown in the example at the end of this docstring.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to the value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
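
Example (a minimal usage sketch; the `rope_scaling` dict below is illustrative and only uses fields documented
above):

```python
>>> from transformers import DiaEncoderConfig

>>> # Initializing an encoder config with default values
>>> configuration = DiaEncoderConfig()
>>> configuration.hidden_size
1024

>>> # Illustrative: enabling linear RoPE scaling through the documented `rope_scaling` fields
>>> scaled_configuration = DiaEncoderConfig(rope_scaling={"rope_type": "linear", "factor": 2.0})
```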
    """

    model_type = "dia_encoder"

    def __init__(
        self,
        max_position_embeddings: int = 1024,
        num_hidden_layers: int = 12,
        hidden_size: int = 1024,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 16,
        head_dim: int = 128,
        intermediate_size: int = 4096,
        norm_eps: float = 1e-5,
        vocab_size: int = 256,
        hidden_act: str = "silu",
        rope_theta: float = 10000.0,
        rope_scaling: Optional[dict] = None,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.intermediate_size = intermediate_size
        self.norm_eps = norm_eps
        self.vocab_size = vocab_size
        self.hidden_act = hidden_act
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # Validate the RoPE configuration, accepting the legacy `type` key as an alias for `rope_type`
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.initializer_range = initializer_range
        super().__init__(**kwargs)


class DiaDecoderConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`DiaDecoder`]. It is used to instantiate a Dia
decoder according to the specified arguments, defining the decoder architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    max_position_embeddings (`int`, *optional*, defaults to 3072):
        The maximum sequence length that this model might ever be used with.
    num_hidden_layers (`int`, *optional*, defaults to 18):
        Number of hidden layers in the Transformer decoder.
    hidden_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the decoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 8192):
        Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer decoder.
    num_key_value_heads (`int`, *optional*, defaults to 4):
        Number of key and value heads for each attention layer in the Transformer decoder.
    head_dim (`int`, *optional*, defaults to 128):
        Dimensionality of the attention head.
    cross_num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each cross-attention layer in the Transformer decoder.
    cross_head_dim (`int`, *optional*, defaults to 128):
        Dimensionality of the cross-attention head.
    cross_num_key_value_heads (`int`, *optional*, defaults to 16):
        Number of key and value heads for each cross-attention layer in the Transformer decoder.
    cross_hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the cross-attention layers.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the normalization layers.
    vocab_size (`int`, *optional*, defaults to 1028):
        Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the
        `input_ids` passed when calling [`DiaModel`].
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the decoder. If string, `"gelu"`, `"relu"`,
        `"swish"` and `"gelu_new"` are supported.
    num_channels (`int`, *optional*, defaults to 9):
        Number of channels for the Dia decoder.
    rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the RoPE embeddings.
    rope_scaling (`dict`, *optional*):
        Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply a new rope type
        and expect the model to work on a longer `max_position_embeddings`, we recommend updating this value
        accordingly.
        Expected contents:
            `rope_type` (`str`):
                The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
                'llama3'], with 'default' being the original RoPE implementation.
            `factor` (`float`, *optional*):
                Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
                most scaling types, a `factor` of x will enable the model to handle sequences of length x *
                original maximum pre-trained length.
            `original_max_position_embeddings` (`int`, *optional*):
                Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
                pretraining.
            `attention_factor` (`float`, *optional*):
                Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
                computation. If unspecified, it defaults to the value recommended by the implementation, using the
                `factor` field to infer the suggested value.
            `beta_fast` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
                ramp function. If unspecified, it defaults to 32.
            `beta_slow` (`float`, *optional*):
                Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
                ramp function. If unspecified, it defaults to 1.
            `short_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to short contexts (<
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `long_factor` (`List[float]`, *optional*):
                Only used with 'longrope'. The scaling factor to be applied to long contexts (>
                `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
                size divided by the number of attention heads divided by 2
            `low_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
            `high_freq_factor` (`float`, *optional*):
                Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).
    is_encoder_decoder (`bool`, *optional*, defaults to `True`):
        Indicating that this model is part of an encoder-decoder architecture.
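
Example (a minimal usage sketch; the non-default values below are purely illustrative):

```python
>>> from transformers import DiaDecoderConfig

>>> # Initializing a decoder config with default values
>>> configuration = DiaDecoderConfig()
>>> configuration.num_channels
9

>>> # Illustrative: a smaller decoder; `num_channels` must stay consistent with `DiaConfig.delay_pattern`
>>> small_configuration = DiaDecoderConfig(num_hidden_layers=6, hidden_size=512, intermediate_size=2048)
```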
    """

    model_type = "dia_decoder"

    def __init__(
        self,
        max_position_embeddings: int = 3072,
        num_hidden_layers: int = 18,
        hidden_size: int = 2048,
        intermediate_size: int = 8192,
        num_attention_heads: int = 16,
        num_key_value_heads: int = 4,
        head_dim: int = 128,
        cross_num_attention_heads: int = 16,
        cross_head_dim: int = 128,
        cross_num_key_value_heads: int = 16,
        cross_hidden_size: int = 1024,
        norm_eps: float = 1e-5,
        vocab_size: int = 1028,
        hidden_act: str = "silu",
        num_channels: int = 9,
        rope_theta: float = 10000.0,
        rope_scaling: Optional[dict] = None,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        is_encoder_decoder: bool = True,
        **kwargs,
    ):
        self.max_position_embeddings = max_position_embeddings
        self.num_hidden_layers = num_hidden_layers
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.head_dim = head_dim
        self.cross_hidden_size = cross_hidden_size
        self.cross_num_attention_heads = cross_num_attention_heads
        self.cross_head_dim = cross_head_dim
        self.cross_num_key_value_heads = cross_num_key_value_heads
        self.norm_eps = norm_eps
        self.vocab_size = vocab_size
        self.hidden_act = hidden_act
        self.num_channels = num_channels
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        # Validate the RoPE configuration, accepting the legacy `type` key as an alias for `rope_type`
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
        rope_config_validation(self)

        self.initializer_range = initializer_range
        self.use_cache = use_cache
        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)


class DiaConfig(PretrainedConfig):
    r"""
This is the configuration class to store the configuration of a [`DiaModel`]. It is used to instantiate a
Dia model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the
[nari-labs/Dia-1.6B](https://huggingface.co/nari-labs/Dia-1.6B) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    encoder_config (`DiaEncoderConfig`, *optional*):
        Configuration for the encoder part of the model. If not provided, a default `DiaEncoderConfig` will be used.
    decoder_config (`DiaDecoderConfig`, *optional*):
        Configuration for the decoder part of the model. If not provided, a default `DiaDecoderConfig` will be used.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the normalization layers.
    is_encoder_decoder (`bool`, *optional*, defaults to `True`):
        Indicating that this model uses an encoder-decoder architecture.
    pad_token_id (`int`, *optional*, defaults to 1025):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 1024):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 1026):
        Beginning of stream token id.
    delay_pattern (`list[int]`, *optional*, defaults to `[0, 8, 9, 10, 11, 12, 13, 14, 15]`):
        The delay pattern for the decoder. The length of this list must match `decoder_config.num_channels`.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should return the last key/values attentions (not used by all models).

Example:

```python
>>> from transformers import DiaConfig, DiaModel

>>> # Initializing a DiaConfig with default values
>>> configuration = DiaConfig()

>>> # Initializing a DiaModel (with random weights) from the configuration
>>> model = DiaModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
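
If you override the delay pattern, keep it in sync with the decoder's `num_channels`; the values below are a
purely illustrative sketch of that constraint:

```python
>>> from transformers import DiaConfig, DiaDecoderConfig

>>> # Illustrative: a 4-channel decoder paired with a 4-entry delay pattern
>>> decoder_config = DiaDecoderConfig(num_channels=4)
>>> configuration = DiaConfig(decoder_config=decoder_config, delay_pattern=[0, 8, 9, 10])
>>> len(configuration.delay_pattern) == configuration.decoder_config.num_channels
True
```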
    """

    model_type = "dia"
    keys_to_ignore_at_inference = ["past_key_values"]
    sub_configs = {"encoder_config": DiaEncoderConfig, "decoder_config": DiaDecoderConfig}

    def __init__(
        self,
        encoder_config: Optional[DiaEncoderConfig] = None,
        decoder_config: Optional[DiaDecoderConfig] = None,
        norm_eps: float = 1e-5,
        is_encoder_decoder: bool = True,
        pad_token_id: int = 1025,
        eos_token_id: int = 1024,
        bos_token_id: int = 1026,
        delay_pattern: Optional[list[int]] = None,
        initializer_range: float = 0.02,
        use_cache: bool = True,
        **kwargs,
    ):
        # Sub-configs may arrive as plain dicts (e.g. when loading from JSON); convert them first
        if isinstance(encoder_config, dict):
            encoder_config = DiaEncoderConfig(**encoder_config)
        if isinstance(decoder_config, dict):
            decoder_config = DiaDecoderConfig(**decoder_config)
        self.encoder_config = encoder_config if encoder_config is not None else DiaEncoderConfig()
        self.decoder_config = decoder_config if decoder_config is not None else DiaDecoderConfig()
        self.norm_eps = norm_eps
        self.delay_pattern = delay_pattern if delay_pattern is not None else [0, 8, 9, 10, 11, 12, 13, 14, 15]
        self.initializer_range = initializer_range
        self.use_cache = use_cache

        if self.decoder_config.num_channels != len(self.delay_pattern):
            raise ValueError("Number of channels must match delay pattern length.")

        super().__init__(
            pad_token_id=pad_token_id,
            eos_token_id=eos_token_id,
            bos_token_id=bos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            **kwargs,
        )

    def get_text_config(self, decoder=False):
        """Defaulting to audio config as it's the decoder in this case which is usually the text backbone"""
        return self.decoder_config


__all__ = ["DiaConfig", "DiaEncoderConfig", "DiaDecoderConfig"]