
"""Pytorch implementation of AIMv2 Model"""

import math
from typing import Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
from ...utils.deprecation import deprecate_kwarg
from ...utils.generic import check_model_inputs
from ..clip.modeling_clip import CLIPModel, CLIPTextEmbeddings, _get_vector_norm
from ..llama.modeling_llama import LlamaMLP, LlamaRMSNorm
from ..siglip.configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
from ..siglip.modeling_siglip import SiglipAttention, SiglipEncoder, SiglipOutput


class Aimv2VisionConfig(SiglipVisionConfig):
    r"""
    This is the configuration class to store the configuration of a [`Aimv2VisionModel`]. It is used to instantiate a
    AIMv2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the AIMv2
    [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1024):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2816):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 24):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Number of channels in the input images.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the linear layers of the MLP.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        use_head (`bool`, *optional*, defaults to `True`):
            Whether to use an attention pooling head on top of the encoder outputs.
        is_native (`bool`, *optional*, defaults to `False`):
            Whether to use a checkpoint trained at native image resolution. If `True`, position embeddings are
            built on the fly with a 2D sin-cos scheme instead of being learned.
    Example:

    ```python
    >>> from transformers import Aimv2VisionConfig, Aimv2VisionModel

    >>> # Initializing a Aimv2VisionConfig with apple/aimv2-large-patch14-224 style configuration
    >>> configuration = Aimv2VisionConfig()

    >>> # Initializing a Aimv2VisionModel (with random weights) from the apple/aimv2-large-patch14-224 style configuration
    >>> model = Aimv2VisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
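
    >>> # A sketch: the native-resolution variant sets `is_native=True`, so 2D sin-cos position
    >>> # embeddings are built on the fly instead of being learned
    >>> native_config = Aimv2VisionConfig(is_native=True)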
    ```"""

    def __init__(
        self,
        hidden_size: int = 1024,
        intermediate_size: int = 2816,
        num_hidden_layers: int = 24,
        num_attention_heads: int = 8,
        num_channels: int = 3,
        image_size: int = 224,
        patch_size: int = 14,
        rms_norm_eps: float = 1e-05,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        initializer_range: float = 0.02,
        use_head: bool = True,
        is_native: bool = False,
        **kwargs,
    ):
        super().__init__(
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            num_channels=num_channels,
            image_size=image_size,
            patch_size=patch_size,
            hidden_act=hidden_act,
            **kwargs,
        )

        self.use_head = use_head
        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.qkv_bias = qkv_bias
        self.mlp_bias = mlp_bias
        self.rms_norm_eps = rms_norm_eps
        self.is_native = is_native

        # The parent config uses `layer_norm_eps`; AIMv2 relies on RMSNorm with `rms_norm_eps` instead.
        del self.layer_norm_eps


class Aimv2TextConfig(SiglipTextConfig):
    r"""
    This is the configuration class to store the configuration of a [`Aimv2TextModel`]. It is used to instantiate a
    AIMv2 text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text encoder of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the AIMv2 text model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`Aimv2Model`].
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 6):
            Number of attention heads for each attention layer in the Transformer encoder.
        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the rms normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        qkv_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the queries, keys and values.
        mlp_bias (`bool`, *optional*, defaults to `False`):
            Whether to add a bias to the linear layers of the MLP.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        pad_token_id (`int`, *optional*):
            The id of the padding token in the vocabulary.
        bos_token_id (`int`, *optional*):
            The id of the beginning-of-sequence token in the vocabulary.
        eos_token_id (`int`, *optional*, defaults to 49407):
            The id of the end-of-sequence token in the vocabulary.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
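
    Example (a usage sketch mirroring the sibling configuration classes in this file):

    ```python
    >>> from transformers import Aimv2TextConfig, Aimv2TextModel

    >>> # Initializing a Aimv2TextConfig with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2TextConfig()

    >>> # Initializing a Aimv2TextModel (with random weights) from the configuration
    >>> model = Aimv2TextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```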
    
    """

    def __init__(
        self,
        vocab_size: int = 49408,
        hidden_size: int = 768,
        intermediate_size: int = 2048,
        num_hidden_layers: int = 12,
        num_attention_heads: int = 6,
        rms_norm_eps: float = 1e-05,
        attention_dropout: float = 0.0,
        qkv_bias: bool = False,
        mlp_bias: bool = False,
        hidden_act: str = "silu",
        pad_token_id: Optional[int] = None,
        bos_token_id: Optional[int] = None,
        eos_token_id: int = 49407,
        max_position_embeddings: int = 77,
        initializer_range: float = 0.02,
        **kwargs,
    ):
        super().__init__(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            intermediate_size=intermediate_size,
            num_hidden_layers=num_hidden_layers,
            num_attention_heads=num_attention_heads,
            hidden_act=hidden_act,
            max_position_embeddings=max_position_embeddings,
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        self.initializer_range = initializer_range
        self.attention_dropout = attention_dropout
        self.qkv_bias = qkv_bias
        self.mlp_bias = mlp_bias
        self.rms_norm_eps = rms_norm_eps

        # Attributes inherited from the Siglip text config that AIMv2 does not use.
        del self.bos_token_id
        del self.pad_token_id
        del self.projection_size
        del self.layer_norm_eps


class Aimv2Config(SiglipConfig):
    r"""
    [`Aimv2Config`] is the configuration class to store the configuration of a [`Aimv2Model`]. It is used to
    instantiate a AIMv2 model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the AIMv2
    [apple/aimv2-large-patch14-224-lit](https://huggingface.co/apple/aimv2-large-patch14-224-lit) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2TextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`Aimv2VisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import Aimv2Config, Aimv2Model

    >>> # Initializing a Aimv2Config with apple/aimv2-large-patch14-224-lit style configuration
    >>> configuration = Aimv2Config()

    >>> # Initializing a Aimv2Model (with random weights) from the apple/aimv2-large-patch14-224-lit style configuration
    >>> model = Aimv2Model(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a Aimv2Config from a Aimv2TextConfig and a Aimv2VisionConfig
    >>> from transformers import Aimv2TextConfig, Aimv2VisionConfig

    >>> # Initializing a AIMv2Text and AIMv2Vision configuration
    >>> config_text = Aimv2TextConfig()
    >>> config_vision = Aimv2VisionConfig()

    >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision)
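
    >>> # A sketch: the projection dimension of the contrastive heads can be overridden
    >>> config = Aimv2Config(text_config=config_text, vision_config=config_vision, projection_dim=512)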
    ```"""

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        super().__init__(text_config, vision_config, **kwargs)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        # Hard cap on the learned logit scale.
        self.max_logit_scale = 100.0

        del self.initializer_factor


class Aimv2Output(SiglipOutput):
    pass


class Aimv2RMSNorm(LlamaRMSNorm):
    pass


class Aimv2MLP(LlamaMLP):
    pass


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ) -> torch.Tensor:
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy")

        # A quarter of the embedding dimension is used for each of {sin, cos} x {height, width}.
        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / (temperature**omega)

        # Outer product between grid positions and frequencies.
        out_h = grid_h.flatten()[..., None] @ omega[None, :]
        out_w = grid_w.flatten()[..., None] @ omega[None, :]

        return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            # Native-resolution checkpoints build position embeddings on the fly for the actual grid size.
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states


class Aimv2TextEmbeddings(CLIPTextEmbeddings):
    pass


class Aimv2Attention(SiglipAttention):
    def __init__(self, config):
        super().__init__(config)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        # Pre-norm transformer block: RMSNorm -> attention -> residual, then RMSNorm -> MLP -> residual.
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, _ = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask, **kwargs)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return hidden_states


class Aimv2Encoder(SiglipEncoder):
    pass


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        self.output_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        # A learnable [CLS] query attends over the patch tokens to produce a single pooled vector.
        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                # Initialize the logit scale to log(1 / 0.07), the standard CLIP temperature.
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"
    _can_record_outputs = {"hidden_states": Aimv2EncoderLayer, "attentions": Aimv2Attention}

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        # The only norm difference from the Siglip vision tower: RMSNorm instead of LayerNorm.
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @deprecate_kwarg("interpolate_pos_encoding", version="v4.58.0")
    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2VisionModel

        >>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
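
        >>> # A sketch: the pooled output is one feature vector per image
        >>> features_shape = tuple(pooled_output.shape)  # (batch_size, hidden_size)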
        ```"""
        hidden_states = self.embeddings(pixel_values)

        encoder_outputs: BaseModelOutput = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.rms_norm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
        )


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"
    _can_record_outputs = {"hidden_states": Aimv2EncoderLayer, "attentions": Aimv2Attention}

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> BaseModelOutputWithPooling:
        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool by taking the hidden state at the position of each sequence's EOS token.
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
            .int()
            .argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )


class Aimv2Model(CLIPModel):
    def __init__(self, config: Aimv2Config):
        PreTrainedModel.__init__(self, config)

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(config.max_logit_scale)

        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Aimv2Output:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Aimv2Model

        >>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
        >>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
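
        >>> # A sketch: picking the text with the highest similarity to the image
        >>> texts = ["a photo of a cat", "a photo of a dog"]
        >>> best_match = texts[probs.argmax(dim=1).item()]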
        ```"""
        vision_outputs: BaseModelOutputWithPooling = self.vision_model(pixel_values=pixel_values, **kwargs)
        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            **kwargs,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # Normalize features, then scale the cosine similarities by the clamped, exponentiated logit scale.
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = [
    "Aimv2Config",
    "Aimv2VisionConfig",
    "Aimv2TextConfig",
    "Aimv2VisionModel",
    "Aimv2TextModel",
    "Aimv2PreTrainedModel",
    "Aimv2Model",
]