
"""PyTorch implementation of AIMv2 Model"""

import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple
from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm
from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm
from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput


class Aimv2VisionConfig(SiglipVisionConfig):
    r"""
This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
[apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1024):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2816):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 24):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 8):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 14):
        The size (resolution) of each patch.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the linear layers or not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated normal initializer for initializing all weight matrices.
    use_head (`bool`, *optional*, defaults to `True`):
        Whether to use the attention pooling head on top of the vision encoder or not.
    is_native (`bool`, *optional*, defaults to `False`):
        Whether to use a checkpoint trained for native image resolution or not.
Example:

```python
>>> from transformers import Aimv2VisionConfig, Aimv2VisionModel

>>> # Initializing an Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
>>> configuration = Aimv2VisionConfig()

>>> # Initializing an Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
>>> model = Aimv2VisionModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```
    """

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        initializer_range: float = 0.02,
        use_head: bool = True,
        is_native: bool = False,
        **kwargs,
    ):
        super().__init__(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_channels=num_channels,
            image_size=image_size,
            patch_size=patch_size,
            hidden_act=hidden_act,
            **kwargs,
        )

        self.use_head = use_head
        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.qkv_bias = qkv_bias
        self.mlp_bias = mlp_bias
        self.rms_norm_eps = rms_norm_eps
        self.is_native = is_native

        # AIMv2 uses RMSNorm, so the LayerNorm epsilon inherited from the Siglip config is dropped.
        del self.layer_norm_eps


class Aimv2TextConfig(SiglipTextConfig):
    r"""
This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 49408):
        Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
        the `input_ids` passed when calling [`Aimv2Model`].
    hidden_size (`int`, *optional*, defaults to 768):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 2048):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 6):
        Number of attention heads for each attention layer in the Transformer encoder.
    rms_norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    qkv_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the queries, keys and values.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to add a bias to the linear layers or not.
    hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
    pad_token_id (`int`, *optional*, defaults to 1):
        The id of the padding token in the vocabulary.
    bos_token_id (`int`, *optional*, defaults to 49406):
        The id of the beginning-of-sequence token in the vocabulary.
    eos_token_id (`int`, *optional*, defaults to 49407):
        The id of the end-of-sequence token in the vocabulary.
    max_position_embeddings (`int`, *optional*, defaults to 77):
        The maximum sequence length that this model might ever be used with. Typically set this to something large
        just in case (e.g., 512 or 1024 or 2048).
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated normal initializer for initializing all weight matrices.

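Example (a minimal usage sketch, mirroring the vision config example above):

```python
>>> from transformers import Aimv2TextConfig, Aimv2TextModel

>>> # Initializing an Aimv2TextConfig with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2TextConfig()

>>> # Initializing an Aimv2TextModel (with random weights) from that configuration
>>> model = Aimv2TextModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```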
    """

    def __init__(
        self,
        vocab_size: int = 49408,
        hidden_size: int = 768,
        intermediate_size: int = 2048,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 6,
        rms_norm_eps: float = 1e-5,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        pad_token_id: Optional[int] = None,
        bos_token_id: Optional[int] = None,
        eos_token_id: int = 49407,
        max_position_embeddings: int = 77,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.qkv_bias = qkv_bias
        self.mlp_bias = mlp_bias
        self.rms_norm_eps = rms_norm_eps

        # These inherited Siglip attributes are unused by the AIMv2 text model.
        del self.bos_token_id
        del self.pad_token_id
        del self.projection_size


class Aimv2Config(SiglipConfig):
    r"""
[`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
[apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
    projection_dim (`int`, *optional*, defaults to 512):
        Dimensionality of text and vision projection layers.
    logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
        The initial value of the *logit_scale* parameter.
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import Aimv2Config, Aimv2Model

>>> # Initializing an Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
>>> configuration = Aimv2Config()

>>> # Initializing an Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
>>> model = Aimv2Model(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize an Aimv2Config from an Aimv2TextConfig and an Aimv2VisionConfig
>>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

>>> # Initializing an AIMv2Text and AIMv2Vision configuration
>>> config_text = Aimv2TextConfig()
>>> config_vision = Aimv2VisionConfig()

>>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
```
    """

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        projection_dim=512,
        logit_scale_init_value=2.6592,
        **kwargs,
    ):
        super().__init__(text_config, vision_config, **kwargs)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.max_logit_scale = 100.0

        # `initializer_factor` is inherited from the Siglip config but unused by AIMv2.
        del self.initializer_factor


class Aimv2Output(SiglipOutput):
    pass


class Aimv2RMSNorm(LlamaRMSNorm):
    pass


class Aimv2MLP(LlamaMLP):
    pass


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            # Learned position embeddings are only used for fixed-resolution checkpoints.
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ) -> torch.Tensor:
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy")

        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / temperature**omega

        out_h = grid_h.flatten()[..., None] * omega[None, :]
        out_w = grid_w.flatten()[..., None] * omega[None, :]

        return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            # Native-resolution checkpoints use fixed 2D sin-cos position embeddings computed on the fly.
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states


class Aimv2TextEmbeddings(CLIPTextEmbeddings):
    pass


class Aimv2Attention(SiglipAttention):
    def __init__(self, config):
        super().__init__(config)
        # AIMv2 makes the bias on the q/k/v/output projections configurable (disabled by default).
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Pre-norm block: RMSNorm -> attention -> residual, then RMSNorm -> MLP -> residual.
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, attn_weights = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return (hidden_states, attn_weights) if output_attentions else (hidden_states, None)


class Aimv2Encoder(SiglipEncoder):
    pass


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        # A single learned cls token attends over the patch tokens to pool them into one vector.
        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool by taking the hidden state of the last EOS token in each sequence (CLIP-style pooling).
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
            .int()
            .argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class Aimv2Model(CLIPModel, nn.Module):
    def __init__(self, config: Aimv2Config):
        # Bypass `CLIPModel.__init__`, which expects CLIP-specific sub-configs.
        nn.Module.__init__(self)
        self.config = config

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(self.config.max_logit_scale)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # Normalize the features and apply the clamped, learned temperature.
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = [
    "Aimv2Config",
    "Aimv2VisionConfig",
    "Aimv2TextConfig",
    "Aimv2VisionModel",
    "Aimv2Model",
    "Aimv2PreTrainedModel",
    "Aimv2TextModel",
]