
"""PyTorch AltCLIP model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
    BaseModelOutputWithPoolingAndProjection,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig


logger = logging.get_logger(__name__)


# Contrastive loss helpers used by `AltCLIPModel.forward`: cross-entropy over the similarity logits,
# averaged over the image->text and text->image directions.
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))


def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    caption_loss = contrastive_loss(similarity)
    image_loss = contrastive_loss(similarity.t())
    return (caption_loss + image_loss) / 2.0


@dataclass
@auto_docstring
class AltCLIPOutput(ModelOutput):
    r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
    The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AltCLIPTextModel`].
vision_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AltCLIPVisionModel`].
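
Example (illustrative; follows the same pattern as the other usage examples in this file):

```python
>>> import requests
>>> from PIL import Image
>>> from transformers import AutoProcessor, AltCLIPModel

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, padding=True, return_tensors="pt"
... )

>>> outputs = model(**inputs)  # an `AltCLIPOutput` when `return_dict=True` (the default)
>>> probs = outputs.logits_per_image.softmax(dim=1)  # per-image probabilities over the text prompts
```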
Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   l   >#    U H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r9   r:   N)getattrto_tuple).0kselfs     r(   	<genexpr>)AltCLIPOutput.to_tuple.<locals>.<genexpr>V   s<      
   LLDGRYZ^`aRbRkRkRmm s   14)tuplekeysrA   s   `r(   r>   AltCLIPOutput.to_tupleU   s#     
YY[
 
 	
r*    )__name__
__module____qualname____firstlineno____doc__r4   r   r%   FloatTensor__annotations__r5   r6   r7   r8   r9   r   r:   rD   r   r>   __static_attributes__rH   r*   r(   r2   r2   6   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r*   r2   c                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )AltRobertaEmbeddings]   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
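
The tweak: when position ids are derived from `input_ids`, padding positions keep `padding_idx` as their position id
and real tokens are numbered starting from `padding_idx + 1`. Illustrative example: with `padding_idx = 1`, the input
ids `[[5, 6, 1, 1]]` are assigned position ids `[[2, 3, 1, 1]]`.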
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  UR                  U l        [        R                  " UR                  UR
                  U R6                  S9U l	        g )N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistenttoken_type_idsdtype)super__init__r"   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutr=   rX   register_bufferr%   r&   expandzerosrZ   sizelongrU   rA   config	__class__s     r(   rc   AltRobertaEmbeddings.__init__c   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
 r*   c                    Uc+  Ub  [        XR                  U5      nOU R                  U5      nUb  UR                  5       nOUR                  5       S S nUS   nUcv  [	        U S5      (       a-  U R
                  S S 2S U24   nUR                  US   U5      n	U	nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R!                  U5      nU R#                  U5      nU$ )Nr\   r   r_   r   ra   r!   rY   )"create_position_ids_from_input_idsrU   &create_position_ids_from_inputs_embedsru   hasattrr_   rs   r%   rt   rv   rZ   r!   rh   rl   rX   rj   rm   rq   )rA   	input_idsr_   rZ   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrl   
embeddingsrj   s                r(   forwardAltRobertaEmbeddings.forward|   sM    $A)M]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r*   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr\   r   r|   r   )ru   r%   r&   rU   rv   r!   	unsqueezers   )rA   r   r   sequence_lengthrZ   s        r(   r~   ;AltRobertaEmbeddings.create_position_ids_from_inputs_embeds   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r*   )rm   rq   rU   rX   rj   rl   rh   )NNNNr   )
rI   rJ   rK   rL   rM   rc   r   r~   rP   __classcell__ry   s   @r(   rR   rR   ]   s$    

4 rs&P= =r*   rR   c                      ^  \ rS rSrS
U 4S jjr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	rU =r$ )AltRobertaSelfAttention   c                   > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  5      U l        U=(       d    [#        USS5      U l        U R$                  S:X  d  U R$                  S	:X  aH  UR&                  U l        [        R(                  " S
UR&                  -  S-
  U R                  5      U l        g g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rX   rY   relative_keyrelative_key_query   r   )rb   rc   rf   num_attention_headsr   
ValueErrorintattention_head_sizeall_head_sizer"   Linearquerykeyvaluero   attention_probs_dropout_probrq   r=   rX   ri   rd   distance_embeddingrA   rx   rX   ry   s      r(   rc    AltRobertaSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# >rr*   hidden_statesattention_mask	head_maskoutput_attentionsr   c                 H   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	[        R                  " XxR	                  SS5      5      n
U R                  S:X  d  U R                  S:X  GaL  UR                   S   UR                   S   p[        R                  " U[        R                  UR                  S9R                  SS5      n[        R                  " U[        R                  UR                  S9R                  SS5      nX-
  nU R                  XR                  -   S-
  5      nUR                  UR                   S9nU R                  S:X  a  [        R"                  " S	UU5      nU
U-   n
OHU R                  S:X  a8  [        R"                  " S	UU5      n[        R"                  " S
UU5      nU
U-   U-   n
U
[$        R&                  " U R                  5      -  n
Ub  X-   n
[(        R*                  R-                  U
SS9nU R/                  U5      nUb  UU-  n[        R                  " UU	5      nUR1                  SSSS5      R3                  5       nUR5                  5       S S U R6                  4-   nUR                  U5      nU(       a  UU4nU$ U4nU$ )Nr\   r   r   r   r   r|   r`   zbhld,lrd->bhlrzbhrd,lrd->bhlrdimr   r   )shaper   r   view	transposer   r   r%   matmulrX   r&   rv   r!   r   ri   tora   einsummathsqrtr"   r#   softmaxrq   permute
contiguousru   r   )rA   r   r   r   r   r   hidden_shapequery_layer	key_layervalue_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                           r(   r   AltRobertaSelfAttention.forward   s    $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR !<<5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*"\\,ejjQ^QeQefkklnpqrN"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8FbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r*   )
r   r   r   rq   r   ri   r   rX   r   r   NNNF)rI   rJ   rK   rL   rc   r%   Tensorr   rN   boolrD   r   rP   r   r   s   @r(   r   r      su    u6 7;15,1:||: !!2!23: E--.	:
 $D>: 
u||	: :r*   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AltRobertaSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g NrV   )rb   rc   r"   r   rf   denserm   rn   ro   rp   rq   rw   s     r(   rc   AltRobertaSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r*   r   input_tensorr   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   rq   rm   rA   r   r   s      r(   r   AltRobertaSelfOutput.forward  5    

=1]3}'CDr*   rm   r   rq   
rI   rJ   rK   rL   rc   r%   r   r   rP   r   r   s   @r(   r   r     6    >U\\  RWR^R^  r*   r   eagerc                      ^  \ rS rSrSU 4S jjrS r   SS\R                  S\\R                     S\\R                     S\\
   S\\R                     4
S	 jjrS
rU =r$ )AltRobertaAttentioni   c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        g )N)rX   )	rb   rc   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationrA   r   outputsetpruned_headsr   s      r(   rc   AltRobertaAttention.__init__!  s@    6v7R7RS
	 +62Er*   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   r   )r'   r   rA   r   r   r   r   r   r   r   r   r   r   union)rA   headsindexs      r(   prune_headsAltRobertaAttention.prune_heads)  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r*   r   r   r   r   r   c                 f    U R                  UUUUS9nU R                  US   U5      nU4USS  -   nU$ N)r   r   r   r   r   )rA   r   )rA   r   r   r   r   self_outputsattention_outputr   s           r(   r   AltRobertaAttention.forward;  sS     yy)/	 ! 
  ;;|AF#%QR(88r*   )r   r   rA   r   r   )rI   rJ   rK   rL   rc   r   r%   r   r   rN   r   rD   r   rP   r   r   s   @r(   r   r      sy    ";* 7;15,1|| !!2!23 E--.	
 $D> 
u||	 r*   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AltRobertaIntermediateiN  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )rb   rc   r"   r   rf   intermediate_sizer   
isinstance
hidden_actstrr	   intermediate_act_fnrw   s     r(   rc   AltRobertaIntermediate.__init__O  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r*   r   r   c                 J    U R                  U5      nU R                  U5      nU$ r   r   r   rA   r   s     r(   r   AltRobertaIntermediate.forwardW  s&    

=100?r*   r   r   r   s   @r(   r   r   N  s(    9U\\ ell  r*   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AltRobertaOutputi^  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )rb   rc   r"   r   r   rf   r   rm   rn   ro   rp   rq   rw   s     r(   rc   AltRobertaOutput.__init___  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r*   r   r   r   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r   r   s      r(   r   AltRobertaOutput.forwarde  r   r*   r   r   r   s   @r(   r  r  ^  r   r*   r  c                      ^  \ rS rSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	 rS
rU =r$ )AltRobertaLayerim  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g )Nr   )
rb   rc   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater  r   rw   s     r(   rc   AltRobertaLayer.__init__n  sI    '-'E'E$,V426:&v.r*   r   r   r   r   r   c                     U R                   " U4UUUS.UD6nUS   nUSS  n[        U R                  U R                  U R                  U5      n	U	4U-   nU$ r   )r  r   feed_forward_chunkr
  r  )
rA   r   r   r   r   kwargsself_attention_outputsr   r   layer_outputs
             r(   r   AltRobertaLayer.forwardv  s     "&"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r*   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r   )rA   r   intermediate_outputr  s       r(   r  "AltRobertaLayer.feed_forward_chunk  s)    "//0@A{{#6Ir*   )r  r
  r  r   r  r   )rI   rJ   rK   rL   rc   r%   r   r   rN   r   rD   r   r  rP   r   r   s   @r(   r  r  m  sy    / 7;15,1|| !!2!23 E--.	
 $D> 
u||	2 r*   r  c                      ^  \ rS rSrU 4S jr\     SS\R                  S\\R                     S\\R                     S\\
   S\\
   S\\
   S	\\\R                     \4   4S
 jj5       rSrU =r$ )AltRobertaEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        g s  snf NF)
rb   rc   rx   r"   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)rA   rx   iry   s      r(   rc   AltRobertaEncoder.__init__  sR    ]]U6KcKcEd#eEdOF$;Ed#ef
&+# $f   A%r   r   r   r   output_hidden_statesreturn_dictr   c           	         U(       a  SOS nU(       a  SOS n	[        U R                  5       H=  u  pU(       a  X4-   nUb  X:   OS nU" SUUUUS.UD6nUS   nU(       d  M5  XS   4-   n	M?     U(       a  X4-   n[        UUU	S9$ )NrH   )r   r   r   r   r   r   last_hidden_stater   
attentions)	enumerater  r   )rA   r   r   r   r   r$  r%  r  all_hidden_statesall_self_attentionsr!  layer_modulelayer_head_masklayer_outputss                 r(   r   AltRobertaEncoder.forward  s     #7BD$5b4(4OA#$58H$H!.7.CilO( +-)"3	
 M *!,M  &91=M<O&O#!  5$   14D D++*
 	
r*   )rx   r   r  )NNFFT)rI   rJ   rK   rL   rc   r   r%   r   r   rN   r   r   rD   r   r   rP   r   r   s   @r(   r  r    s    ,  7;15,1/4&*&
||&
 !!2!23&
 E--.	&

 $D>&
 'tn&
 d^&
 
uU\\"O3	4&
 &
r*   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AltRobertaPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rb   rc   r"   r   rf   r   Tanh
activationrw   s     r(   rc   AltRobertaPooler.__init__  s9    YYv1163E3EF
'')r*   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r5  )rA   r   first_token_tensorpooled_outputs       r(   r   AltRobertaPooler.forward  s6     +1a40

#566r*   )r5  r   r   r   s   @r(   r2  r2    s(    $
U\\ ell  r*   r2  moduler   r   r   r   scalingrq   c                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr\   r   )r   ra   )ptrainingr   r   )r%   r   r   r"   r#   r   float32r   ra   rq   r?  r   )
r;  r   r   r   r   r<  rq   r  attn_weightsattn_outputs
             r(   eager_attention_forwardrC    s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r*   c                      ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                  \\R                     4   4
S	 jjrS
rU =r$ )AltCLIPAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paperc                    > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l        SU l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        [        R                  " U R                  U R                  5      U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)rb   rc   rx   rf   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutrq   	is_causalr"   r   k_projv_projq_projout_projrw   s     r(   rc   AltCLIPAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar*   r   r   causal_attention_maskr   r   c                    UR                   u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
UR	                  XVU R
                  U R                  5      R                  SS5      nU	R	                  XVU R
                  U R                  5      R                  SS5      n	U
R	                  XVU R
                  U R                  5      R                  SS5      n
U R                  R                  S:w  a  Ub  Ub  X#-   nOUb  UnO	USLU l
        [        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU" U UU	U
UU R                  U R                  U R                   (       d  SOU R"                  S	9u  pUR%                  XVU5      R'                  5       nU R)                  U5      nU(       d  SnX4$ )
z#Input shape: Batch x Time x Channelr   r   flash_attention_2Nr   sdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )rN  r<  rq   )r   rQ  rO  rP  r   rJ  rK  r   rx   r   rN  rC  loggerwarning_oncer   rL  r?  rq   reshaper   rR  )rA   r   r   rT  r   
batch_sizer   rI  queriesrE   valuesattention_interfacerB  rA  s                 r(   r   AltCLIPAttention.forward  s    -:,?,?)
	++m,{{=)]+,,zt~~t}}U__`acdeyyOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/!G&2!62$>DN(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
! "))*)LWWYmmK0 L((r*   )rx   rq   rI  rK  rN  rO  rJ  rR  rQ  rL  rP  r   )rI   rJ   rK   rL   rM   rc   r%   r   r   r   rD   r   rP   r   r   s   @r(   rE  rE    s    GB. 268<,15)||5) !.5)  (5	5)
 $D>5) 
u||Xell33	45) 5)r*   rE  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )
AltCLIPMLPi?  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )rb   rc   rx   r	   r   activation_fnr"   r   rf   r   fc1fc2rw   s     r(   rc   AltCLIPMLP.__init__@  sb    #F$5$5699V//1I1IJ99V55v7I7IJr*   r   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )re  rd  rf  r   s     r(   r   AltCLIPMLP.forwardG  s4    /**=9/r*   )rd  rx   re  rf  r   r   s   @r(   rb  rb  ?  s)    KU\\ ell  r*   rb  c                      ^  \ rS rSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\\	   S\
\R                     4
S	 jjrS
rU =r$ )AltCLIPEncoderLayeriN  rx   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g r   )rb   rc   rf   rI  rE  	self_attnr"   rm   rn   layer_norm1rb  mlplayer_norm2rw   s     r(   rc   AltCLIPEncoderLayer.__init__O  sm    ++)&1<<F<Q<QRf%<<F<Q<QRr*   r   r   rT  r   r   c                     UnU R                  U5      nU R                  UUUUS9u  pXQ-   nUnU R                  U5      nU R                  U5      nXQ-   nU4nU(       a  Xv4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    causal_attention_mask (`torch.FloatTensor`): causal attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r   rT  r   )rn  rm  rp  ro  )rA   r   r   rT  r   residualrA  r   s           r(   r   AltCLIPEncoderLayer.forwardW  s    " !((7&*nn')"7/	 '5 '
# !0 ((7/ 0 "&Gr*   )rI  rn  rp  ro  rm  F)rI   rJ   rK   rL   r   rc   r%   r   r   r   rD   rN   r   rP   r   r   s   @r(   rk  rk  N  sk    S} S -2&||& &  %||	&
 $D>& 
u  	!& &r*   rk  c                      ^  \ rS rSrSrS\4U 4S jjr\     SS\\	R                     S\\	R                     S\\   S\\   S	\\   S
\\\4   4S jj5       rSrU =r$ )AltCLIPEncoderi  z
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is an
[`AltCLIPEncoderLayer`].

Args:
    config: AltCLIPConfig
rx   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        g s  snf r  )
rb   rc   rx   r"   r  r  r  rk  layersr   )rA   rx   _ry   s      r(   rc   AltCLIPEncoder.__init__  sT    mm%PVPhPhJi$jJiQ%8%@Ji$jk&+# %kr#  r   rT  r   r$  r%  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn	[	        U R
                  5       H0  u  pU(       a  Xy4-   nU" U	UUUS9nUS   n	U(       d  M(  XS   4-   nM2     U(       a  Xy4-   n[        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Causal mask for the text model. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
NrH   )r   r   r   r'  )rx   r   r$  use_return_dictr*  ry  r   )rA   r   r   rT  r   r$  r%  encoder_statesall_attentionsr   idxencoder_layerr/  s                r(   r   AltCLIPEncoder.forward  s    N 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B)%"3	M *!,M  !/3C2E!E #9  +.>>N+Vd
 	
r*   )rx   r   ry  )NNNNN)rI   rJ   rK   rL   rM   r   rc   r   r   r%   r   r   r   rD   r   r   rP   r   r   s   @r(   rw  rw    s    ,} ,  268<,0/3&*D
 !.D
  (5	D

 $D>D
 'tnD
 d^D
 
uo%	&D
 D
r*   rw  c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )AltCLIPVisionEmbeddingsi  rx   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebiasr   r   rZ   r[   r]   )rb   rc   rx   rf   rI  
image_size
patch_sizer"   	Parameterr%   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsrd   position_embeddingrr   r&   rs   rw   s     r(   rc    AltCLIPVisionEmbeddings.__init__  s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr*   r   heightwidthr   c                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
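
Example (illustrative sketch; assumes the "BAAI/AltCLIP" checkpoint, which was pre-trained at 224x224):

```python
>>> import torch
>>> from transformers import AltCLIPVisionModel

>>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
>>> pixel_values = torch.randn(1, 3, 336, 336)  # higher resolution than the pre-training size
>>> outputs = model(pixel_values=pixel_values, interpolate_pos_encoding=True)
>>> seq_len = outputs.last_hidden_state.shape[1]  # 1 CLS token + (336 // patch_size) ** 2 patch tokens
```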
r   r   Nr\   g      ?r   r   bicubicF)ru   modealign_cornersr   )r   r  weightr   r%   jit
is_tracingrZ   r  r   r[  r   r"   r#   interpolater   cat)rA   r   r  r  r  r  r  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r(   interpolate_pos_encoding0AltCLIPVisionEmbeddings.interpolate_pos_encoding  si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr*   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model (rG  r`   r   r   r\   r   )r   r  r   r  r  ra   r   flattenr   r  rs   r%   r  r  r  rZ   )rA   r  r  r\  rz  r  r  target_dtypepatch_embedsclass_embedsr   s              r(   r   AltCLIPVisionEmbeddings.forward  s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr*   )	r  rx   rI  r  r  r  r  r  r  ru  )rI   rJ   rK   rL   r   rc   r%   r   r   r  rN   r   rP   r   r   s   @r(   r  r    sj    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r*   r  c                   2    \ rS rSr% \\S'   SrSr/ rS r	Sr
g)AltCLIPPreTrainedModeli+  rx   altclipTc                 6   U R                   R                  n[        U[        5      (       a  U R                   R                  n[        R
                  R                  UR                  SUR                  S-  U-  S9  [        R
                  R                  UR                  R                  UR                   R                  U-  S9  [        R
                  R                  UR                  R                  UR                   R                  U-  S9  g[        U[        5      (       Ga   U R                   R                  nUR                  S-  SUR                   R                  -  S-  -  U-  nUR                  S-  U-  n[        R
                  R                  UR                  R                  US9  [        R
                  R                  UR                   R                  US9  [        R
                  R                  UR"                  R                  US9  [        R
                  R                  UR$                  R                  US9  g[        U[&        5      (       a  U R                   R                  nUR                   R(                  S-  SUR                   R                  -  S-  -  U-  nSUR                   R(                  -  S-  U-  n[        R
                  R                  UR*                  R                  US9  [        R
                  R                  UR,                  R                  US9  g[        U[.        5      (       a  [        R
                  R                  UR0                  R                  UR2                  S-  U R                   R                  -  S9  SUR0                  l        [        R
                  R                  UR6                  R                  UR8                  S-  U R                   R                  -  S9  SUR6                  l        g[        U[        R:                  5      (       aJ  UR<                  R>                  RA                  5         UR                  R>                  RC                  S5        g[        U[        RD                  5      (       ak  UR                  R>                  R                  SU R                   R                  S9  UR<                  b%  UR<                  R>                  RA                  5         gg[        U[        RF                  5      (       ax  UR                  R>                  R                  SU R                   R                  S9  URH                  b2  UR                  R>                  URH                     RA                  5         ggg)	zInitialize the weightsrX  rH  )meanstd)r  r   Tg      ?N)%rx   initializer_factorr   r  r"   initnormal_r  rI  r  r  initializer_ranger  rE  r  rQ  rO  rP  rR  rb  rf   re  rf  AltCLIPModeltext_projectiontext_embed_dim_is_hf_initializedvisual_projectionvision_embed_dimrm   r  datazero_fill_r   rd   rU   )rA   r;  factorin_proj_stdout_proj_stdfc_stds         r(   _init_weights$AltCLIPPreTrainedModel._init_weights2  s   //f566[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 011[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE
++[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?--GGOO&&--))4/$++2P2PP   9=F""5GGOO((//++T1DKK4R4RR   ;?F$$7--KK""$MM$$S)		**MM&&CT[[5S5S&T{{&  &&( '--MM&&CT[[5S5S&T!!-""6#5#56<<> . .r*   rH   N)rI   rJ   rK   rL   r   rO   base_model_prefixsupports_gradient_checkpointing_no_split_moduler  rP   rH   r*   r(   r  r  +  s    !&*#+?r*   r  c                      ^  \ rS rSrS\4U 4S jjr\\     SS\\	R                     S\\   S\\   S\\   S\\   S	\\\4   4S
 jj5       5       rSrU =r$ )AltCLIPVisionTransformeri`  rx   c                   > [         TU ]  5         Xl        UR                  n[	        U5      U l        [        R                  " X!R                  S9U l	        [        U5      U l        [        R                  " X!R                  S9U l        g r   )rb   rc   rx   rf   r  r   r"   rm   rn   pre_layrnormrw  encoderpost_layernorm)rA   rx   rI  ry   s      r(   rc   !AltCLIPVisionTransformer.__init__a  sd    &&	1&9LL8M8MN%f- ll9:O:OPr*   r  r   r$  r%  r  r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  U5      nU R                  UUUSS9nUS   nUS S 2SS S 24   n	U R                  U	5      n	[        UU	UR                  UR                  S9$ )Nz You have to specify pixel_values)r  T)r   r   r$  r%  r   r(  pooler_outputr   r)  )rx   r   r$  r}  r   r   r  r  r  r   r   r)  )
rA   r  r   r$  r%  r  r   encoder_outputsr(  r9  s
             r(   r    AltCLIPVisionTransformer.forwardk  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h))-8,,'/!5	 ' 
 ,A.)!Q'2++M:)/')77&11	
 	
r*   )rx   r   r  r  r  )NNNNF)rI   rJ   rK   rL   r   rc   r   r   r   r%   rN   r   r   rD   r   r   rP   r   r   s   @r(   r  r  `  s    Q2 Q  59,0/3&*38$
u001$
 $D>$
 'tn	$

 d^$
 #+4.$
 
u00	1$
  $
r*   r  c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
\     SS\\R                     S\\   S\\   S	\S
\\   S\\\4   4S jj5       rSrU =r$ )AltCLIPVisionModeli  rx   r  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rb   rc   r  vision_model	post_initrw   s     r(   rc   AltCLIPVisionModel.__init__  s'     4V<r*   r   c                 B    U R                   R                  R                  $ r   )r  r   r  rF   s    r(   get_input_embeddings'AltCLIPVisionModel.get_input_embeddings  s      ++;;;r*   r   r$  r  r%  c                 ^    Ub  UOU R                   R                  nU R                  UUUUUS9$ )aN  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AltCLIPVisionModel

>>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```r  r   r$  r  r%  )rx   r}  r  )rA   r  r   r$  r  r%  s         r(   r   AltCLIPVisionModel.forward  sA    : &1%<k$++B]B]  %/!5%=# ! 
 	
r*   )r  NNNFN)rI   rJ   rK   rL   r   rO   main_input_namerc   r"   Moduler  r   r   r%   rN   r   r   rD   r   r   rP   r   r   s   @r(   r  r    s    $O2 <bii <  59,0/3).&*$
u001$
 $D>$
 'tn	$

 #'$
 d^$
 
u00	1$
 $
r*   r  a=  
    The model behaves as an encoder following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    )custom_introc                   n  ^  \ rS rSr% \\S'   SU 4S jjrS rS rS r	\
         SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )AltRobertaModeli  rx   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
rb   rc   rx   rR   r   r  r  r2  poolerr  )rA   rx   add_pooling_layerry   s      r(   rc   AltRobertaModel.__init__  sL    
 	 .v6(02C&v. 	r*   c                 .    U R                   R                  $ r   r   rh   rF   s    r(   r  $AltRobertaModel.get_input_embeddings  s    ...r*   c                 $    XR                   l        g r   r  rA   r   s     r(   set_input_embeddings$AltRobertaModel.set_input_embeddings  s    */'r*   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. `heads_to_prune` is a dict of `{layer_num: list of heads to prune in this layer}`. See the
base class [`PreTrainedModel`].
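For example, `{0: [0, 2], 5: [1]}` prunes heads 0 and 2 of layer 0 and head 1 of layer 5.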
N)itemsr  r  r  r   )rA   heads_to_pruner  r   s       r(   _prune_headsAltRobertaModel._prune_heads  s<    
 +002LELLu%//;;EB 3r*   r   r   r_   rZ   r   r   r   r$  r%  r   c
           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       n
O"Ub  UR                  5       S S n
O[	        S5      eU
u  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  S S 2S U24   nUR                  X5      nUnO$[        R                  " U
[        R                  US9nU R!                  X*5      nU R#                  XPR                   R$                  5      nU R                  UUUUS9nU R'                  UUUUUSS	9nUS
   nU R(                  b  U R)                  U5      OS n[+        UUUR,                  UR.                  S9$ )NzDYou cannot specify both input_ids and inputs_embeds at the same timer\   z5You have to specify either input_ids or inputs_embedsr    r_   r|   )r   rZ   r_   r   T)r   r   r   r$  r%  r   r  )rx   r   r$  r}  r   %warn_if_padding_and_no_attention_maskru   r!   r%   onesr   r   r_   rs   rt   rv   get_extended_attention_maskget_head_maskr  r  r  r   r   r)  )rA   r   r   r_   rZ   r   r   r   r$  r%  r   r\  r   r!   r   r   extended_attention_maskembedding_outputr  sequence_outputr9  s                        r(   r   AltRobertaModel.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z 150P0PQ_0m &&y++2O2OP	??%)'	 + 
 ,,2/!5 ' 
 *!,8<8OO4UY)-')77&11	
 	
r*   )rx   r   r  r  )T	NNNNNNNNN)rI   rJ   rK   rL   r   rO   rc   r  r  r  r   r   r%   r   r   r   rD   r   r   rP   r   r   s   @r(   r  r    s     /0C  -11515/3,004,0/3&*G
ELL)G
 !.G
 !.	G

 u||,G
 ELL)G
  -G
 $D>G
 'tnG
 d^G
 
uU\\"$PP	QG
 G
r*   r  c                     ^  \ rS rSr% \\S'   U 4S jrS\R                  4S jr	S\R                  SS4S jrSS	\\   S\R                  4U 4S
 jjjr\\         SS\\R$                     S\\R$                     S\\R$                     S\\R$                     S\\R$                     S\\R$                     S\\   S\\   S\\   S\\\4   4S jj5       5       rSrU =r$ )AltCLIPTextModeli@  rx   c                   > [         TU ]  U5        [        USS9U l        [        R
                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        U R                  5         g )NF)r  rV   )rb   rc   r  robertar"   r   rf   project_dimtransformationrm   rn   pre_LNr  rw   s     r(   rc   AltCLIPTextModel.__init__C  se     &vG ii(:(:F<N<NOll6#5#56;P;PQr*   r   c                 B    U R                   R                  R                  $ r   r  r   rh   rF   s    r(   r  %AltCLIPTextModel.get_input_embeddingsJ  s    ||&&666r*   r   Nc                 8    XR                   R                  l        g r   r  r  s     r(   r  %AltCLIPTextModel.set_input_embeddingsM  s    27/r*   new_num_tokensc                 "   > [         TU ]  U5      $ r   )rb   resize_token_embeddings)rA   r  ry   s     r(   r  (AltCLIPTextModel.resize_token_embeddingsP  s    w.~>>r*   r   r   r_   rZ   r   r   r   r%  r$  c
                    Ub  UOU R                   R                  nU R                  UUUUUUUU	SS9	n
U
S   nU R                  U5      nU R	                  U5      nUSS2S4   n[        UUU
R                  U
R                  S9$ )a  
Examples:

```python
>>> from transformers import AutoProcessor, AltCLIPTextModel

>>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

>>> texts = ["it's a cat", "it's a dog"]

>>> inputs = processor(text=texts, padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        # project the layer-normalized last hidden state and pool the [CLS] token
        sequence_output = outputs[0]
        sequence_output = self.pre_LN(sequence_output)
        projection_state = self.transformation(sequence_output)
        pooler_output = projection_state[:, 0]

        return BaseModelOutputWithPoolingAndProjection(
            last_hidden_state=projection_state,
            pooler_output=pooler_output,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class AltCLIPModel(AltCLIPPreTrainedModel):
    config: AltCLIPConfig

    def __init__(self, config: AltCLIPConfig):
        super().__init__(config)

        if not isinstance(config.vision_config, AltCLIPVisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )
        if not isinstance(config.text_config, AltCLIPTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type AltCLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.project_dim
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = AltCLIPTextModel(text_config)
        self.vision_model = AltCLIPVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`AltCLIPTextModel`].

Examples:

```python
>>> from transformers import AutoProcessor, AltCLIPModel

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
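>>> # Illustrative, not part of the original example: the returned features are the raw
>>> # projection outputs; L2-normalize them before computing cosine similarities
>>> text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)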
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AltCLIPModel

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(images=image, return_tensors="pt")
>>> image_features = model.get_image_features(**inputs)
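>>> # Illustrative, not part of the original example: normalize before comparing
>>> # against text features with a dot product
>>> image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)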
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, AltCLIPOutput]:
        r"""
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AltCLIPModel

>>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
>>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> inputs = processor(
...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
... )
>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
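>>> # Illustrative, not part of the original example: probs has shape (num_images, num_texts)
>>> # and each row sums to 1 over the candidate captions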
        ```"""
        # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components.
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale = self.logit_scale.exp()
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return AltCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    input_ids (`torch.Tensor`): Token ids in which padding positions are marked with `padding_idx`.
    padding_idx (`int`): Id of the padding token.
    past_key_values_length (`int`, *optional*, defaults to 0): Length of previously cached tokens, added as an offset.
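
Example (illustrative, not part of the original docstring), assuming `padding_idx = 1`:
    input_ids                  -> [[0, 5, 6, 1, 1]]
    mask = input_ids.ne(1)     -> [[1, 1, 1, 0, 0]]
    cumsum(mask, dim=1) * mask -> [[1, 2, 3, 0, 0]]
    result (+ padding_idx)     -> [[2, 3, 4, 1, 1]]
Non-padding tokens are numbered from `padding_idx + 1`; padding positions keep `padding_idx`.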

Returns: torch.Tensor
    """
    # The series of casts and type-conversions here keeps the result an integer tensor (for ONNX export).
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx


__all__ = ["AltCLIPPreTrainedModel", "AltCLIPVisionModel", "AltCLIPTextModel", "AltCLIPModel"]