
    <h5                         S SK Jr  SSKJr  SSKJr  \R                  " \5      r " S S\5      r	 " S S\5      r
 " S	 S
\5      r/ SQrg)    )Optional   )PretrainedConfig)loggingc                      ^  \ rS rSrSrSrSr               SS\S\S\S\S	\S
\S\S\S\S\	S\	S\
S\S\	S\	4U 4S jjjrSrU =r$ )Aimv2VisionConfig   a  
This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2816):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 14):
        The size (resolution) of each patch.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the Linear layers or Not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the for initializing all weight matrices.
    use_head (`str`, *optional*, defaults to `True`):
        Whether to use Attention Pooling Head or Not.
    is_native (`str`, *optional*, defaults to `False`):
        Whether to use ckpt trained for image native resolution or not.
Example:

```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel

>>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = Aimv2VisionConfig()

>>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = Aimv2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```aimv2_vision_modelvision_confighidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_channels
image_size
patch_sizerms_norm_epsattention_dropoutqkv_biasmlp_bias
hidden_actinitializer_rangeuse_head	is_nativec                    > [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        Xpl        X`l        Xl	        Xl
        Xl        Xl        Xl        Xl        Xl        Xl        g )N )super__init__r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )selfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   kwargs	__class__s                    e/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/aimv2/configuration_aimv2.pyr   Aimv2VisionConfig.__init__[   sh    & 	"6"&!2!2#6 ($$!2$ !2  ("    )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )i   i         r         h㈵>        FFsilu{Gz?TF)__name__
__module____qualname____firstlineno____doc__
model_typebase_config_keyintfloatboolstrr   __static_attributes____classcell__r!   s   @r"   r   r      s    6p &J%O  !%!##$"#& #'!$#$# $# 	$#
 !$# $# $# $# $# !$# $# $# $# !$# $#  !$# $#r$   r   c                      ^  \ rS rSrSrSrSr               SS\S\S\S\S	\S
\S\S\	S\	S\
S\\   S\\   S\S\S\	4U 4S jjjrSrU =r$ )Aimv2TextConfig   a   
This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
        the `inputs_ids` passed when calling [`Aimv2Model`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer encoder.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the Linear layers or Not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    pad_token_id (`int`, *optional*, defaults to 1):
        The id of the padding token in the vocabulary.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the vocabulary.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the vocabulary.
    max_position_embeddings (`int`, *optional*, defaults to 77):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the for initializing all weight matrices.
aimv2_text_modeltext_config
vocab_sizer   r   r   r   r   r   r   r   r   pad_token_idbos_token_ideos_token_idmax_position_embeddingsr   c                    > [         TU ]  " SXUS.UD6  Xl        X l        X0l        X@l        XPl        Xl        Xl        Xpl	        Xl
        Xl        Xl        X`l        g )N)rA   rB   rC   r   )r   r   r@   r   r   r   r   rD   r   r   r   r   r   r   )r   r@   r   r   r   r   r   r   r   r   r   rA   rB   rC   rD   r   r    r!   s                    r"   r   Aimv2TextConfig.__init__   sb    & 	sl\hslrs$&!2!2#6 '>$$!2!2  (r$   )r   r   r   r   r   rD   r   r   r   r   r   r@   )i   i   i         r)   r*   FFr+   NNi  M   r,   )r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r   r   r8   r9   r:   s   @r"   r<   r<      s    +Z $J#O  !%!##$"#& &*&*!')"&!!)!) !) 	!)
 !) !!) !) !!) !) !) !) sm!) sm!) !) "%!)   !!) !)r$   r<   c                   @   ^  \ rS rSrSrSr\\S.r SU 4S jjr	Sr
U =r$ )Aimv2Config   a  
[`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter.
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import Aimv2Config, Aimv2Model

>>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2Config()

>>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
>>> model = Aimv2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
>>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

>>> # Initializing a AIMv2Text and AIMv2Vision configuration
>>> config_text = Aimv2TextConfig()
>>> config_vision = Aimv2VisionConfig()

>>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
```aimv2)r?   r   c                    > [         TU ]  " S0 UD6  Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        [	        S0 UD6U l        [        S0 UD6U l        X0l        X@l	        SU l
        g )NzP`text_config` is `None`. Initializing the `Aimv2TextConfig` with default values.zT`vision_config` is `None`. initializing the `Aimv2VisionConfig` with default values.g      Y@r   )r   r   loggerinfor<   r?   r   r   projection_dimlogit_scale_init_valuemax_logit_scale)r   r?   r   rQ   rR   r    r!   s         r"   r   Aimv2Config.__init__  sy     	"6"KKKjk MKKno*9[9.??,&<#$r$   )rR   rS   rQ   r?   r   )NNi   g/L
F@)r-   r.   r/   r0   r1   r2   r<   r   sub_configsr   r8   r9   r:   s   @r"   rK   rK      s,    +Z J"1DUVK `f% %r$   rK   )rK   r   r<   N)typingr   configuration_utilsr   utilsr   
get_loggerr-   rO   r   r<   rK   __all__r   r$   r"   <module>r[      sZ   ,  3  
		H	%`#( `#FR)& R)jB%" B%J Br$   