
import math
from dataclasses import dataclass
from typing import Any, Callable, Optional

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...integrations import use_kernel_forward_from_hub
from ...masking_utils import create_causal_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple
from .configuration_aimv2 import Aimv2Config, Aimv2TextConfig, Aimv2VisionConfig


@dataclass
@auto_docstring
class Aimv2Output(ModelOutput):
    r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`Aimv2TextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`Aimv2VisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`Aimv2TextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`Aimv2VisionModel`].
Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   l   >#    U H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r    r!   N)getattrto_tuple).0kselfs     `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/aimv2/modeling_aimv2.py	<genexpr>'Aimv2Output.to_tuple.<locals>.<genexpr>H   s<      
   LLDGRYZ^`aRbRkRkRmm s   14)tuplekeysr)   s   `r*   r&   Aimv2Output.to_tupleG   s#     
YY[
 
 	
     )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r    r   r!   r-   r   r&   __static_attributes__r2   r1   r*   r   r   )   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r1   r   RMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )Aimv2RMSNormN   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z+
Aimv2RMSNorm is equivalent to T5LayerNorm
N)super__init__r   	Parameterr8   onesweightvariance_epsilon)r)   hidden_sizeeps	__class__s      r*   rB   Aimv2RMSNorm.__init__P   s/     	ll5::k#:; #r1   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )N   T)keepdim)	dtypetor8   float32powmeanrsqrtrF   rE   )r)   hidden_statesinput_dtypevariances       r*   forwardAimv2RMSNorm.forwardX   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r1   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r-   rE   shaperF   r/   s    r*   
extra_reprAimv2RMSNorm.extra_repr_   s*    ))*+6$2G2G1HIIr1   )rF   rE   )gư>)	r3   r4   r5   r6   rB   rX   r\   r;   __classcell__rI   s   @r*   r>   r>   N   s    $;J Jr1   r>   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Aimv2MLPc   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
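
# Quick sanity-check sketch for Aimv2RMSNorm (illustrative only, not part of the
# modeling code; the sizes are made up). With its freshly initialized unit weight,
# the layer reduces to x * rsqrt(mean(x**2, dim=-1) + eps) computed in float32:
#
#     norm = Aimv2RMSNorm(hidden_size=8, eps=1e-6)
#     x = torch.randn(2, 4, 8)
#     expected = x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + 1e-6)
#     torch.testing.assert_close(norm(x), expected)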


class Aimv2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size
        self.intermediate_size = config.intermediate_size
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj
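
# Shape sketch for the SwiGLU-style MLP above (illustrative only; the sizes are
# hypothetical, not AIMv2 defaults). gate_proj/up_proj expand the hidden dimension
# to intermediate_size and their gated product is mapped back by down_proj, so the
# output shape always matches the input:
#
#     mlp = Aimv2MLP(config)    # assuming config.hidden_size == 8
#     x = torch.randn(2, 4, 8)  # (batch, seq_len, hidden_size)
#     assert mlp(x).shape == x.shape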


class Aimv2VisionEmbeddings(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.config = config
        self.patch_size = config.patch_size
        self.patch_embed = nn.Conv2d(
            config.num_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size
        )
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        num_patches = (config.image_size // config.patch_size) ** 2
        if not self.config.is_native:
            self.position_embedding = nn.Embedding(num_patches, config.hidden_size)
        self.register_buffer("position_ids", torch.arange(num_patches).expand((1, -1)), persistent=False)

    @staticmethod
    def build_2d_sincos_position_embedding(
        height, width, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
    ) -> torch.Tensor:
        grid_w = torch.arange(int(width), dtype=dtype, device=device)
        grid_h = torch.arange(int(height), dtype=dtype, device=device)
        grid_h, grid_w = torch.meshgrid(grid_w, grid_h, indexing="xy")

        pos_dim = embed_dim // 4
        omega = torch.arange(pos_dim, dtype=dtype, device=device) / pos_dim
        omega = 1.0 / (temperature**omega)
        out_h = grid_h.flatten()[..., None] * omega[None, :]
        out_w = grid_w.flatten()[..., None] * omega[None, :]

        return torch.concat([out_h.sin(), out_h.cos(), out_w.sin(), out_w.cos()], dim=1)[None, :, :]

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        _, _, height, width = pixel_values.size()
        hidden_states = self.patch_embed(pixel_values).flatten(2).transpose(1, 2)
        hidden_states = self.rms_norm(hidden_states)

        if self.config.is_native:
            pos_embed = self.build_2d_sincos_position_embedding(
                height // self.patch_size,
                width // self.patch_size,
                embed_dim=self.config.hidden_size,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )
        else:
            pos_embed = self.position_embedding(self.position_ids)

        hidden_states = hidden_states + pos_embed
        return hidden_states


class Aimv2TextEmbeddings(nn.Module):
    def __init__(self, config: Aimv2TextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Aimv2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.qkv_bias)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""
        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights


class Aimv2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.attention = Aimv2Attention(config)
        self.ffn = Aimv2MLP(config)
        self.rms_norm1 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)
        self.rms_norm2 = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        # Pre-norm residual block: RMSNorm -> attention -> add, then RMSNorm -> MLP -> add.
        norm_hidden_states = self.rms_norm1(hidden_states)
        attn_output, attn_weights = self.attention(hidden_states=norm_hidden_states, attention_mask=attention_mask)

        hidden_states = hidden_states + attn_output
        norm_hidden_states = self.rms_norm2(hidden_states)
        mlp_output = self.ffn(norm_hidden_states)

        hidden_states = hidden_states + mlp_output
        return (hidden_states, attn_weights) if output_attentions else (hidden_states, None)


class Aimv2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Aimv2EncoderLayer`].

    Args:
        config: Aimv2Config
    """

    def __init__(self, config: Aimv2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Aimv2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class Aimv2AttentionPoolingHead(nn.Module):
    def __init__(self, config: Aimv2VisionConfig):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads

        self.k_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)
        self.v_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.qkv_bias)

        self.cls_token = nn.Parameter(torch.zeros(1, 1, self.hidden_size))
        self.output_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=True)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, seq_len, hidden_dim = hidden_states.shape

        # A learned cls token attends over all patch tokens to produce the pooled output.
        cls_token = self.cls_token.expand(batch_size, -1, -1)

        key = self.k_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        value = self.v_proj(hidden_states).reshape(batch_size, seq_len, self.num_heads, hidden_dim // self.num_heads)
        query = cls_token.reshape(batch_size, 1, self.num_heads, hidden_dim // self.num_heads)

        key = key.permute(0, 2, 1, 3)
        value = value.permute(0, 2, 1, 3)
        query = query.permute(0, 2, 1, 3)

        attn_output = F.scaled_dot_product_attention(query, key, value)

        attn_output = attn_output.transpose(1, 2).reshape(batch_size, 1, hidden_dim)
        attn_output = attn_output.mean(dim=1)

        output = self.output_proj(attn_output)
        return output


@auto_docstring
class Aimv2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models. The model is only intended for inference and doesn't support finetuning.
    """

    config: Aimv2Config
    base_model_prefix = "aimv2"
    supports_gradient_checkpointing = True
    _no_split_modules = [
        "Aimv2EncoderLayer",
        "Aimv2AttentionPoolingHead",
        "Aimv2VisionEmbeddings",
        "Aimv2TextEmbeddings",
    ]
    _supports_sdpa = True
    _supports_flash_attn = True
    _supports_flex_attn = True

    def _init_weights(self, module):
        super()._init_weights(module)
        if hasattr(module, "logit_scale"):
            if isinstance(module.logit_scale, nn.Parameter):
                module.logit_scale.data.fill_(math.log(1 / 0.07))
        elif isinstance(module, Aimv2AttentionPoolingHead):
            module.cls_token.data.normal_(mean=0.0, std=self.config.initializer_range)


@auto_docstring(
    custom_intro="""
    The Vision model from AIMv2 without any head or projection on top.
    """
)
class Aimv2VisionModel(Aimv2PreTrainedModel):
    config: Aimv2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Aimv2VisionConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2VisionEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.use_head = config.use_head
        if self.use_head:
            self.head = Aimv2AttentionPoolingHead(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.patch_embed

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Aimv2VisionModel

>>> model = Aimv2VisionModel.from_pretrained("apple/aimv2-large-patch14-native")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-native")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled features
```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        pooler_output = self.head(last_hidden_state) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    The text model from AIMv2 without any head or projection on top.
    """
)
class Aimv2TextModel(Aimv2PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: Aimv2TextConfig):
        super().__init__(config)
        self.config = config
        self.embeddings = Aimv2TextEmbeddings(config)
        self.encoder = Aimv2Encoder(config)
        self.rms_norm = Aimv2RMSNorm(config.hidden_size, config.rms_norm_eps)

        self.eos_token_id = config.eos_token_id

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(input_ids)
        batch_size, seq_len, _ = hidden_states.shape

        cache_position = torch.arange(seq_len, dtype=torch.long, device=hidden_states.device)
        position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)

        if attention_mask is not None:
            attention_mask = create_causal_mask(
                config=self.config,
                input_embeds=hidden_states,
                position_ids=position_ids,
                attention_mask=attention_mask,
                cache_position=cache_position,
                past_key_values=None,
            )

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs[0]
        last_hidden_state = self.rms_norm(last_hidden_state)

        # Pool by taking the hidden state of the eos token of each sequence in the batch.
        pooled_output = last_hidden_state[
            torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
            (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
            .int()
            .argmax(dim=-1),
        ]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


def _get_vector_norm(tensor: torch.Tensor) -> torch.Tensor:
    """
    This method is equivalent to tensor.norm(p=2, dim=-1, keepdim=True) and used to make
    model `executorch` exportable. See issue https://github.com/pytorch/executorch/issues/3566
    """
    square_tensor = torch.pow(tensor, 2)
    sum_tensor = torch.sum(square_tensor, dim=-1, keepdim=True)
    normed_tensor = torch.pow(sum_tensor, 0.5)
    return normed_tensor


@auto_docstring
class Aimv2Model(Aimv2PreTrainedModel):
    config: Aimv2Config
    _no_split_modules = ["Aimv2TextEmbeddings", "Aimv2EncoderLayer", "Aimv2VisionEmbeddings"]

    def __init__(self, config: Aimv2Config):
        super().__init__(config)

        self.projection_dim = config.projection_dim
        self.vision_embed_dim = config.vision_config.hidden_size
        self.text_embed_dim = config.text_config.hidden_size

        self.vision_model = Aimv2VisionModel._from_config(config.vision_config)
        self.text_model = Aimv2TextModel._from_config(config.text_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)

        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
        self.max_log_logit_scale = math.log(config.max_logit_scale)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`Aimv2TextModel`].

Examples:

```python
>>> from transformers import AutoTokenizer, Aimv2Model

>>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
>>> tokenizer = AutoTokenizer.from_pretrained("apple/aimv2-large-patch14-224-lit")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)

        return text_features

    @can_return_tuple
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`Aimv2VisionModel`].

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Aimv2Model

>>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> image_features = model.get_image_features(**inputs)
```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Aimv2Output:
        r"""
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Aimv2Model

>>> model = Aimv2Model.from_pretrained("apple/aimv2-large-patch14-224-lit")
>>> processor = AutoProcessor.from_pretrained("apple/aimv2-large-patch14-224-lit")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # Normalize the features so the logits below are scaled cosine similarities.
        image_embeds = image_embeds / _get_vector_norm(image_embeds)
        text_embeds = text_embeds / _get_vector_norm(text_embeds)

        # The learned temperature is clamped in log space before being exponentiated.
        logit_scale = self.logit_scale.clamp(0.0, self.max_log_logit_scale).exp().to(text_embeds.device)
        logits_per_text = (logit_scale * text_embeds) @ image_embeds.t()
        logits_per_image = logits_per_text.t()

        return Aimv2Output(
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


__all__ = ["Aimv2VisionModel", "Aimv2Model", "Aimv2PreTrainedModel", "Aimv2TextModel"]