
"""PyTorch implementation of AIMv2 Model"""

import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple
from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm
from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm
from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput


class Aimv2VisionConfig(SiglipVisionConfig):
    r"""
This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2816):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 14):
        The size (resolution) of each patch.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the linear layers or not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated normal initializer for initializing all weight matrices.
    use_head (`bool`, *optional*, defaults to `True`):
        Whether to use the attention pooling head on top of the vision encoder or not.
    is_native (`bool`, *optional*, defaults to `False`):
        Whether to use a checkpoint trained for native image resolution or not.
Example:

```python
>>> from transformers import Aimv2VisionConfig, Aimv2VisionModel

>>> # Initializing an Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = Aimv2VisionConfig()

>>> # Initializing an Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = Aimv2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
    """

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        initializer_range: float = 0.02,
        use_head: bool = True,
        is_native: bool = False,
        **kwargs,
    ):
        super().__init__(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_channels=num_channels,
            image_size=image_size,
            patch_size=patch_size,
            hidden_act=hidden_act,
            **kwargs,
        )

        self.use_head = use_head
        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.qkv_bias = qkv_bias
        self.mlp_bias = mlp_bias
        self.rms_norm_eps = rms_norm_eps
        self.is_native = is_native

        # AIMv2 uses RMSNorm, so the LayerNorm epsilon inherited from the Siglip config is dropped.
        del self.layer_norm_eps


class Aimv2TextConfig(SiglipTextConfig):
    r"""
This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
        the `input_ids` passed when calling [`Aimv2Model`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer encoder.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the linear layers or not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    pad_token_id (`int`, *optional*, defaults to 1):
        The id of the padding token in the vocabulary.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the vocabulary.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the vocabulary.
    max_position_embeddings (`int`, *optional*, defaults to 77):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated normal initializer for initializing all weight matrices.

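Example (a minimal usage sketch, mirroring the vision config example above):

```python
>>> from transformers import Aimv2TextConfig, Aimv2TextModel

>>> # Initializing an Aimv2TextConfig with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2TextConfig()

>>> # Initializing an Aimv2TextModel (with random weights) from that configuration
>>> model = Aimv2TextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```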
    """

    def __init__(
        self,
        vocab_size: int = 49408,
        hidden_size: int = 768,
        intermediate_size: int = 2048,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 6,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        pad_token_id: Optional[int] = None,
        bos_token_id: Optional[int] = None,
        eos_token_id: int = 49407,
        max_position_embeddings: int = 77,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.qkv_bias = qkv_bias
        self.mlp_bias = mlp_bias
        self.rms_norm_eps = rms_norm_eps

        # These inherited Siglip attributes are unused by the AIMv2 text model.
        del self.bos_token_id
        del self.pad_token_id
        del self.projection_size


class Aimv2Config(SiglipConfig):
    r"""
[`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter.
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import Aimv2Config, Aimv2Model

>>> # Initializing an Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2Config()

>>> # Initializing an Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
>>> model = Aimv2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize an Aimv2Config from an Aimv2TextConfig and an Aimv2VisionConfig
>>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

>>> # Initializing an AIMv2Text and AIMv2Vision configuration
>>> config_text = Aimv2TextConfig()
>>> config_vision = Aimv2VisionConfig()

>>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
```
    """

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=512,
        logit_scale_init_value=2.6592,
        **kwargs,
    ):
        super().__init__(text_config, vision_config, **kwargs)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.max_logit_scale = 100.0

        # `initializer_factor` is inherited from the Siglip config but unused by AIMv2.
        del self.initializer_factor


class Aimv2Output(SiglipOutput):
    pass


class Aimv2RMSNorm(LlamaRMSNorm):
    pass


class Aimv2MLP(LlamaMLP):
    pass


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            # Learned position embeddings are only used for fixed-resolution checkpoints.
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ) -> torch.Tensor:
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy")

        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / temperature**omega

        out_h = grid_h.flatten()[..., None] * omega[None, :]
        out_w = grid_w.flatten()[..., None] * omega[None, :]

        return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            # Native-resolution checkpoints use fixed 2D sin-cos position embeddings computed on the fly.
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states


class Aimv2TextEmbeddings(CLIPTextEmbeddings):
    pass


class Aimv2Attention(SiglipAttention):
    def __init__(self, config):
        super().__init__(config)
        # AIMv2 makes the bias on the q/k/v/output projections configurable (disabled by default).
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Pre-norm block: RMSNorm -> attention -> residual, then RMSNorm -> MLP -> residual.
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, attn_weights = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return (hidden_states, attn_weights) if output_attentions else (hidden_states, None)


class Aimv2Encoder(SiglipEncoder):
    pass


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        # A single learned cls token attends over the patch tokens to pool them into one vector.
        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool by taking the hidden state of the last EOS token in each sequence (CLIP-style pooling).
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
            .int()
            .argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class Aimv2Model(CLIPModel, nn.Module):
    def __init__(self, config: Aimv2Config):
        # Bypass `CLIPModel.__init__`, which expects CLIP-specific sub-configs.
        nn.Module.__init__(self)
        self.config = config

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(self.config.max_logit_scale)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # Normalize the features and apply the clamped, learned temperature.
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = [
    "Aimv2Config",
    "Aimv2VisionConfig",
    "Aimv2TextConfig",
    "Aimv2VisionModel",
    "Aimv2Model",
    "Aimv2PreTrainedModel",
    "Aimv2TextModel",
]