
"""PyTorch AltCLIP model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
    BaseModelOutputWithPoolingAndProjection,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    auto_docstring,
    can_return_tuple,
    filter_out_non_signature_kwargs,
    logging,
    torch_int,
)
from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig


logger = logging.get_logger(__name__)


def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0
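

# Illustrative note (not part of the original module): `clip_loss` expects the scaled
# (text, image) similarity matrix for a batch of N matched pairs. The correct "class"
# for row i is column i, so the targets are the diagonal indices 0..N-1, and the two
# cross-entropy terms average the text-to-image and image-to-text directions, e.g.:
#
#     sim = logit_scale * text_embeds @ image_embeds.t()  # shape (N, N)
#     loss = clip_loss(sim)                               # symmetric InfoNCE loss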
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed<   dZeed	<   d
ee   fdZy)AltCLIPOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AltCLIPVisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: Optional[BaseModelOutputWithPooling] = None
    vision_model_output: Optional[BaseModelOutputWithPooling] = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class AltRobertaEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        # RoBERTa-style position embeddings reserve `padding_idx`, so the table is rebuilt with it
        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Fall back to the registered all-zeros buffer when no token_type_ids are passed,
        # cropped and expanded to the current input shape.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


class AltRobertaSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.attention_head_size)

        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the AltRobertaModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, following the original Transformer.
        attention_probs = self.dropout(attention_probs)

        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs
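

# Illustrative note on the relative-position branch above (a reading of the code, not
# new behavior): `distance` is an (L, L) grid of signed offsets between query and key
# positions; adding `max_position_embeddings - 1` shifts it into valid embedding
# indices, and the einsum "bhld,lrd->bhlr" contracts the head dimension to produce one
# additive score per (head, query, key) pair.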
                  dej
                  dej
                  fdZ xZS )AltRobertaSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrQ   )r]   r^   r#   r   ra   denserh   ri   rj   rk   rl   rr   s     r)   r^   zAltRobertaSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r+   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   rl   rh   rA   r   r   s      r)   r   zAltRobertaSelfOutput.forward  7    

=1]3}|'CDr+   rF   rG   rH   r^   r&   r   r   r   r   s   @r)   r   r     1    >U\\  RWR^R^ r+   r   eagerc                        e Zd Zd	 fd	Zd Z	 	 	 d
dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )AltRobertaAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )N)rS   )	r]   r^   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationrA   r   outputsetpruned_headsr   s      r)   r^   zAltRobertaAttention.__init__   sC    6v7R7RS,C
	 +62Er+   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )r(   r   rA   r   r   r   r   r   r   r   r   r   r   union)rA   headsindexs      r)   prune_headszAltRobertaAttention.prune_heads(  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r+   r   r   r   r   r   c                 l    | j                  ||||      }| j                  |d   |      }|f|dd  z   }|S N)r   r   r   r   r   )rA   r   )rA   r   r   r   r   self_outputsattention_outputr   s           r)   r   zAltRobertaAttention.forward:  sS     yy)/	 ! 
  ;;|AF#%QR(88r+   r   r   )rF   rG   rH   r^   r   r&   r   r   rJ   r   rC   r   r   r   s   @r)   r   r     st    ";* 7;15,1|| !!2!23 E--.	
 $D> 
u||	r+   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AltRobertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r]   r^   r#   r   ra   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnrr   s     r)   r^   zAltRobertaIntermediate.__init__N  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r+   r   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r   rA   r   s     r)   r   zAltRobertaIntermediate.forwardV  s&    

=100?r+   r   r   s   @r)   r   r   M  s#    9U\\ ell r+   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AltRobertaOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r]   r^   r#   r   r   ra   r   rh   ri   rj   rk   rl   rr   s     r)   r^   zAltRobertaOutput.__init__^  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r+   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r)   r   zAltRobertaOutput.forwardd  r   r+   r   r   s   @r)   r   r   ]  r   r+   r   c                        e Zd Z fdZ	 	 	 d	dej
                  deej                     deej                     dee   de	ej
                     f
dZ
d Z xZS )
AltRobertaLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y )Nr   )
r]   r^   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   rr   s     r)   r^   zAltRobertaLayer.__init__m  sI    '-'E'E$,V426:&v.r+   r   r   r   r   r   c                      | j                   |f|||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }	|	f|z   }|S r   )r   r   feed_forward_chunkr   r   )
rA   r   r   r   r   kwargsself_attention_outputsr   r   layer_outputs
             r)   r   zAltRobertaLayer.forwardu  s     "0"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r+   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )rA   r   intermediate_outputr   s       r)   r   z"AltRobertaLayer.feed_forward_chunk  s,    "//0@A{{#68HIr+   r   )rF   rG   rH   r^   r&   r   r   rJ   r   rC   r   r   r   r   s   @r)   r   r   l  st    / 7;15,1|| !!2!23 E--.	
 $D> 
u||	2r+   r   c                        e Zd Z fdZe	 	 	 	 	 d
dej                  deej                     deej                     dee	   dee	   dee	   de
eej                     ef   fd	       Z xZS )AltRobertaEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r]   r^   rs   r#   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rA   rs   irt   s      r)   r^   zAltRobertaEncoder.__init__  sT    ]]U6KcKcEd#eEdOF$;Ed#ef
&+# $f   A#r   r   r   r   output_hidden_statesreturn_dictr   c           	          |rdnd }|rdnd }	t        | j                        D ]4  \  }
}|r||fz   }|||
   nd } |d||||d|}|d   }|s,|	|d   fz   }	6 |r||fz   }t        |||	      S )NrL   )r   r   r   r   r   r   last_hidden_stater   
attentions)	enumerater  r   )rA   r   r   r   r   r
  r  r   all_hidden_statesall_self_attentionsr  layer_modulelayer_head_masklayer_outputss                 r)   r   zAltRobertaEncoder.forward  s     #7BD$5b4(4OA|#$58H$H!.7.CilO( +-)"3	
 M *!,M &9]1=M<O&O#!  5$   1]4D D++*
 	
r+   )NNFFT)rF   rG   rH   r^   r   r&   r   r   rJ   r   r   rC   r   r   r   r   s   @r)   r   r     s    ,  7;15,1/4&*&
||&
 !!2!23&
 E--.	&

 $D>&
 'tn&
 d^&
 
uU\\"O3	4&
 &
r+   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AltRobertaPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r]   r^   r#   r   ra   r   Tanh
activationrr   s     r)   r^   zAltRobertaPooler.__init__  s9    YYv1163E3EF
'')r+   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r  )rA   r   first_token_tensorpooled_outputs       r)   r   zAltRobertaPooler.forward  s6     +1a40

#566r+   r   r   s   @r)   r  r    s#    $
U\\ ell r+   r  moduler   r   r   r   scalingrl   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrW   r   )r   r\   )ptrainingr   r   )r&   r   r   r#   r$   r   float32r   r\   rl   r"  r   )
r  r   r   r   r   r  rl   r   attn_weightsattn_outputs
             r)   eager_attention_forwardr&    s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r+   c                        e Zd ZdZ fdZ	 	 	 d	dej                  deej                     deej                     dee   de	ej                  eej                     f   f
dZ
 xZS )
AltCLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)r]   r^   rs   ra   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutrl   	is_causalr#   r   k_projv_projq_projout_projrr   s     r)   r^   zAltCLIPAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar+   r   r   causal_attention_maskr   r   c           
         |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
| j                  j                  dk7  r||||z   }n||}n	|du| _
        t        }| j                  j                  dk7  rt        | j                  j                     } || ||	|
|| j                  | j                  | j                  sdn| j                        \  }}|j!                  |||      j#                         }| j%                  |      }|sd}||fS )z#Input shape: Batch x Time x Channelr   r   flash_attention_2Nr           )r1  r  rl   )r   r4  r2  r3  r   r-  r.  r   rs   r   r1  r&  r   r/  r"  rl   reshaper   r5  )rA   r   r   r6  r   
batch_sizer~   r,  queriesrD   valuesattention_interfacer%  r$  s                 r)   r   zAltCLIPAttention.forward  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/2G!G&2!62$>DN(?;;++w6"9$++:Z:Z"[$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r+   r   )rF   rG   rH   rI   r^   r&   r   r   r   rC   r   r   r   s   @r)   r(  r(    s}    GB. 268<,1/)||/) !./)  (5	/)
 $D>/) 
u||Xell33	4/)r+   r(  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
AltCLIPMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )r]   r^   rs   r	   r   activation_fnr#   r   ra   r   fc1fc2rr   s     r)   r^   zAltCLIPMLP.__init__9  sd    #F$5$5699V//1I1IJ99V55v7I7IJr+   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rC  rB  rD  r   s     r)   r   zAltCLIPMLP.forward@  s4    /**=9/r+   r   r   s   @r)   r@  r@  8  s$    KU\\ ell r+   r@  c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dee   de	ej                     f
dZ xZS )
AltCLIPEncoderLayerrs   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   )r]   r^   ra   r,  r(  	self_attnr#   rh   ri   layer_norm1r@  mlplayer_norm2rr   s     r)   r^   zAltCLIPEncoderLayer.__init__H  sm    ++)&1<<F<Q<QRf%<<F<Q<QRr+   r   r   r6  r   r   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            causal_attention_mask (`torch.FloatTensor`): causal attention mask of size
                `(batch, 1, tgt_len, src_len)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class AltCLIPEncoder(nn.Module):
    r"""
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    """

    def __init__(self, config: AltCLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutput]:
        r"""
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                causal_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class AltCLIPVisionEmbeddings(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """

        num_patches = embeddings.shape[1] - 1
        position_embedding = self.position_embedding.weight.unsqueeze(0)
        num_positions = position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding(self.position_ids)

        class_pos_embed = position_embedding[:, :1]
        patch_pos_embed = position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
            raise ValueError(
                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
            )
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
        else:
            embeddings = embeddings + self.position_embedding(self.position_ids)
        return embeddings


@auto_docstring
class AltCLIPPreTrainedModel(PreTrainedModel):
    config: AltCLIPConfig
    base_model_prefix = "altclip"
    supports_gradient_checkpointing = True
    _no_split_modules = []

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_factor
        if isinstance(module, AltCLIPVisionEmbeddings):
            factor = self.config.initializer_factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
            nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
        elif isinstance(module, AltCLIPAttention):
            factor = self.config.initializer_factor
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            nn.init.normal_(module.q_proj.weight, std=in_proj_std)
            nn.init.normal_(module.k_proj.weight, std=in_proj_std)
            nn.init.normal_(module.v_proj.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, AltCLIPMLP):
            factor = self.config.initializer_factor
            in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, AltCLIPModel):
            nn.init.normal_(
                module.text_projection.weight,
                std=module.text_embed_dim**-0.5 * self.config.initializer_factor,
            )
            module.text_projection._is_hf_initialized = True
            nn.init.normal_(
                module.visual_projection.weight,
                std=module.vision_embed_dim**-0.5 * self.config.initializer_factor,
            )
            module.visual_projection._is_hf_initialized = True
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class AltCLIPVisionTransformer(nn.Module):
    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = AltCLIPVisionEmbeddings(config)
        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.encoder = AltCLIPEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: Optional[bool] = False,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
        hidden_states = self.pre_layrnorm(hidden_states)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        last_hidden_state = encoder_outputs[0]
        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class AltCLIPVisionModel(AltCLIPPreTrainedModel):
    config: AltCLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: AltCLIPVisionConfig):
        super().__init__(config)
        self.vision_model = AltCLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        r"""
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )


@auto_docstring(
    custom_intro="""
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    """
)
class AltRobertaModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = AltRobertaEmbeddings(config)
        self.encoder = AltRobertaEncoder(config)

        self.pooler = AltRobertaPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class AltCLIPTextModel(AltCLIPPreTrainedModel):
    config: AltCLIPTextConfig

    def __init__(self, config):
        super().__init__(config)
        self.roberta = AltRobertaModel(config, add_pooling_layer=False)
        self.transformation = nn.Linear(config.hidden_size, config.project_dim)
        self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.roberta.embeddings.word_embeddings

    def set_input_embeddings(self, value: nn.Embedding) -> None:
        self.roberta.embeddings.word_embeddings = value

    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding:
        return super().resize_token_embeddings(new_num_tokens)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[tuple, BaseModelOutputWithPoolingAndProjection]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # normalize the last hidden state, project it, and pool the first (CLS) token
        sequence_output = outputs[0]
        sequence_output = self.pre_LN(sequence_output)
        projection_state = self.transformation(sequence_output)
        pooler_output = projection_state[:, 0]

        return BaseModelOutputWithPoolingAndProjection(
            last_hidden_state=projection_state,
            pooler_output=pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AltCLIPModel(AltCLIPPreTrainedModel):
    config: AltCLIPConfig

    def __init__(self, config: AltCLIPConfig):
        super().__init__(config)

        if not isinstance(config.vision_config, AltCLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )
        if not isinstance(config.text_config, AltCLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type AltCLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.project_dim
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = AltCLIPTextModel(text_config)
        self.vision_model = AltCLIPVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @filter_out_non_signature_kwargs()
    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
        )
        pooled_output = text_outputs.pooler_output
        text_features = self.text_projection(pooled_output)

        return text_features

    @filter_out_non_signature_kwargs()
    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AltCLIPModel
        >>> from transformers.image_utils import load_image

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooled_output = vision_outputs.pooler_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, AltCLIPOutput]:
        r"""
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.T

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return AltCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        input_ids (`torch.Tensor`): Tensor of input token ids.
        padding_idx (`int`): Id of the padding token.
        past_key_values_length (`int`, *optional*, defaults to 0): Offset added to the computed positions.

    Returns: torch.Tensor
    """
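    # Worked example (illustrative, not in the original source): with padding_idx = 1
    # and input_ids = [[5, 7, 9, 1, 1]], `mask` is [[1, 1, 1, 0, 0]], the cumulative
    # sum gives [[1, 2, 3, 3, 3]], multiplying by `mask` zeroes the padded slots to
    # [[1, 2, 3, 0, 0]], and adding `padding_idx` yields [[2, 3, 4, 1, 1]]; pads keep
    # position `padding_idx` while real tokens start at `padding_idx + 1`.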
    # The series of casts and type conversions here are balanced to work with both ONNX export and XLA tracing.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = ["AltCLIPPreTrainedModel", "AltCLIPVisionModel", "AltCLIPTextModel", "AltCLIPModel"]