ó
    <±h¬5  ã                   ó–   • S SK Jr  SSKJr  SSKJr  \R                  " \5      r " S S\5      r	 " S S\5      r
 " S	 S
\5      r/ SQrg)é    )ÚOptionalé   )ÚPretrainedConfig)Úloggingc                   ó–   ^ • \ rS rSrSrSrSr               SS\S\S\S\S	\S
\S\S\S\S\	S\	S\
S\S\	S\	4U 4S jjjrSrU =r$ )ÚAimv2VisionConfigé   aÓ  
This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2816):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 14):
        The size (resolution) of each patch.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the Linear layers or Not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the for initializing all weight matrices.
    use_head (`str`, *optional*, defaults to `True`):
        Whether to use Attention Pooling Head or Not.
    is_native (`str`, *optional*, defaults to `False`):
        Whether to use ckpt trained for image native resolution or not.
Example:

```python
>>> from transformers import SiglipVisionConfig, SiglipVisionModel

>>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = Aimv2VisionConfig()

>>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = Aimv2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```Úaimv2_vision_modelÚvision_configÚhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚnum_attention_headsÚnum_channelsÚ
image_sizeÚ
patch_sizeÚrms_norm_epsÚattention_dropoutÚqkv_biasÚmlp_biasÚ
hidden_actÚinitializer_rangeÚuse_headÚ	is_nativec                 óÚ   >• [         TU ]  " S0 UD6  Xl        X l        X0l        X@l        XPl        Xpl        X`l        Xl	        XÀl
        Xàl        XÐl        X°l        X l        X€l        Xðl        g )N© )ÚsuperÚ__init__r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )Úselfr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   ÚkwargsÚ	__class__s                    €Úe/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/aimv2/configuration_aimv2.pyr   ÚAimv2VisionConfig.__init__[   sh   ø€ ô& 	‰ÒÑ"˜6Ò"à&ÔØ!2ÔØ!2ÔØ#6Ô Ø(ÔØ$ŒØ$ŒØ!2ÔØ$Œà ŒØ!2ÔØ ŒØ ŒØ(ÔØ"ó    )r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   )i   i   é   é   r   éà   é   çñhãˆµøä>ç        FFÚsiluç{®Gáz”?TF)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__Ú
model_typeÚbase_config_keyÚintÚfloatÚboolÚstrr   Ú__static_attributes__Ú__classcell__©r!   s   @r"   r   r      sâ   ø† ñ6ðp &€JØ%€Oð  Ø!%Ø!#Ø#$ØØØØ"Ø#&ØØØ Ø#'ØØñ!$#àð$#ð ð$#ð ð	$#ð
 !ð$#ð ð$#ð ð$#ð ð$#ð ð$#ð !ð$#ð ð$#ð ð$#ð ð$#ð !ð$#ð ð$#ð  ÷!$#ö $#r$   r   c                   ó¢   ^ • \ rS rSrSrSrSr               SS\S\S\S\S	\S
\S\S\	S\	S\
S\\   S\\   S\S\S\	4U 4S jjjrSrU =r$ )ÚAimv2TextConfigé‚   a   
This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
        the `inputs_ids` passed when calling [`Aimv2Model`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer encoder.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the Linear layers or Not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    pad_token_id (`int`, *optional*, defaults to 1):
        The id of the padding token in the vocabulary.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the vocabulary.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the vocabulary.
    max_position_embeddings (`int`, *optional*, defaults to 77):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the for initializing all weight matrices.
Úaimv2_text_modelÚtext_configÚ
vocab_sizer   r   r   r   r   r   r   r   r   Úpad_token_idÚbos_token_idÚeos_token_idÚmax_position_embeddingsr   c                 ó¼   >• [         TU ]  " SX¼US.UD6  Xl        X l        X0l        X@l        XPl        Xàl        X l        Xpl	        Xðl
        Xl        X€l        X`l        g )N)rA   rB   rC   r   )r   r   r@   r   r   r   r   rD   r   r   r   r   r   r   )r   r@   r   r   r   r   r   r   r   r   r   rA   rB   rC   rD   r   r    r!   s                    €r"   r   ÚAimv2TextConfig.__init__³   sb   ø€ ô& 	‰ÒÐs lÐ\hÑsÐlrÒsà$ŒØ&ÔØ!2ÔØ!2ÔØ#6Ô Ø'>Ô$Ø$ŒØ!2Ôà!2ÔØ ŒØ ŒØ(Õr$   )r   r   r   r   r   rD   r   r   r   r   r   r@   )i Á  i   i   é   é   r)   r*   FFr+   NNiÿÀ  éM   r,   )r-   r.   r/   r0   r1   r2   r3   r4   r5   r6   r7   r   r   r8   r9   r:   s   @r"   r<   r<   ‚   sê   ø† ñ+ðZ $€JØ#€Oð  ØØ!%Ø!#Ø#$Ø"Ø#&ØØØ Ø&*Ø&*Ø!Ø')Ø"&ñ!!)àð!)ð ð!)ð ð	!)ð
 ð!)ð !ð!)ð ð!)ð !ð!)ð ð!)ð ð!)ð ð!)ð ˜s‘mð!)ð ˜s‘mð!)ð ð!)ð "%ð!)ð   ÷!!)ö !)r$   r<   c                   ó@   ^ • \ rS rSrSrSr\\S.r SU 4S jjr	Sr
U =r$ )ÚAimv2Configé×   a¼  
[`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter.
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import Aimv2Config, Aimv2Model

>>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2Config()

>>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
>>> model = Aimv2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
>>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

>>> # Initializing a AIMv2Text and AIMv2Vision configuration
>>> config_text = Aimv2TextConfig()
>>> config_vision = Aimv2VisionConfig()

>>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
```Úaimv2)r?   r   c                 óô   >• [         TU ]  " S0 UD6  Uc  0 n[        R                  S5        Uc  0 n[        R                  S5        [	        S0 UD6U l        [        S0 UD6U l        X0l        X@l	        SU l
        g )NzP`text_config` is `None`. Initializing the `Aimv2TextConfig` with default values.zT`vision_config` is `None`. initializing the `Aimv2VisionConfig` with default values.g      Y@r   )r   r   ÚloggerÚinfor<   r?   r   r   Úprojection_dimÚlogit_scale_init_valueÚmax_logit_scale)r   r?   r   rQ   rR   r    r!   s         €r"   r   ÚAimv2Config.__init__  sy   ø€ ô 	‰ÒÑ"˜6Ò"àÑØˆKÜK‰KÐjÔkàÑ ØˆMÜK‰KÐnÔoä*Ñ9¨[Ñ9ˆÔÜ.Ñ?°Ñ?ˆÔØ,ÔØ&<Ô#Ø$ˆÕr$   )rR   rS   rQ   r?   r   )NNi   gƒ/L¦
F@)r-   r.   r/   r0   r1   r2   r<   r   Úsub_configsr   r8   r9   r:   s   @r"   rK   rK   ×   s,   ø† ñ+ðZ €JØ"1ÐDUÑV€Kð `f÷%õ %r$   rK   )rK   r   r<   N)Útypingr   Úconfiguration_utilsr   Úutilsr   Ú
get_loggerr-   rO   r   r<   rK   Ú__all__r   r$   r"   Ú<module>r[      sZ   ðõ, å 3Ý ð 
×	Ò	˜HÓ	%€ô`#Ð(ô `#ôFR)Ð&ô R)ôjB%Ð"ô B%òJ Br$   