
    <h                     ,   S r SSKrSSKJr  SSKJrJrJrJr  SSK	r	SSK
r	SSK	Jr  SSKJr  SSKJr  SS	KJrJrJrJr  SS
KJrJr  SSKJrJrJr  SSKJrJrJrJ r   SSK!J"r"J#r#J$r$  \ RJ                  " \&5      r'\\" SS9 " S S\5      5       5       r(\\" SS9 " S S\5      5       5       r)\\ " S S\5      5       5       r*S\	RV                  S\	RV                  4S jr,S\	RV                  S\	RV                  4S jr-S\$S\.4S jr/SYS \\.\04   S!\14S" jjr2 " S# S$\Rf                  5      r4 " S% S&\Rj                  5      r6 " S' S(\Rf                  5      r7 " S) S*\Rf                  5      r8 " S+ S,\Rf                  5      r9 " S- S.\Rf                  5      r: " S/ S0\Rf                  5      r; " S1 S2\Rf                  5      r< " S3 S4\Rf                  5      r=  SZS5\Rf                  S6\	RV                  S7\	RV                  S8\	RV                  S9\\	RV                     S:\>S;\>S<\\	RV                     4S= jjr? " S> S?\Rf                  5      r@ " S@ SA\Rf                  5      rA " SB SC\Rf                  5      rB " SD SE\Rf                  5      rC " SF SG\Rf                  5      rD " SH SI\5      rE " SJ SK\Rf                  5      rF " SL SM\Rf                  5      rG\ " SN SO\5      5       rH\" SPS9 " SQ SR\H5      5       rI\" SSS9 " ST SU\H5      5       rJ\ " SV SW\H5      5       rK/ SXQrLg)[zPyTorch ALIGN model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplelogging   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Srg)AlignVisionModelOutput*   z
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
Nimage_embedslast_hidden_statehidden_states )__name__
__module____qualname____firstlineno____doc__r!   r   torchFloatTensor__annotations__r"   r#   tuple__static_attributes__r$       `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/align/modeling_align.pyr   r   *   sN    
 15L(5,,-459x 1 1298<M8E%"3"345<r/   r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Srg)	AlignTextModelOutput;   z
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
Ntext_embedsr"   r#   
attentionsr$   )r%   r&   r'   r(   r)   r4   r   r*   r+   r,   r"   r#   r-   r5   r.   r$   r/   r0   r2   r2   ;   sh    
 04K%++,359x 1 1298<M8E%"3"345<59Ju00129r/   r2   c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\	S	'   Sr\\	S
'   S\\   4S jrSrg)AlignOutputM   a.  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Contrastive loss for image-text similarity.
logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
    The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
    similarity scores.
logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
    The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
    similarity scores.
text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
    The output of [`AlignVisionModel`].
text_model_output (`BaseModelOutputWithPooling`):
    The output of the [`AlignTextModel`].
vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
    The output of the [`AlignVisionModel`].
Nlosslogits_per_imagelogits_per_textr4   r!   text_model_outputvision_model_outputreturnc                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   l   >#    U H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f))r<   r=   N)getattrto_tuple).0kselfs     r0   	<genexpr>'AlignOutput.to_tuple.<locals>.<genexpr>l   s<      
   LLDGRYZ^`aRbRkRkRmm s   14)r-   keysrE   s   `r0   rB   AlignOutput.to_tuplek   s#     
YY[
 
 	
r/   r$   )r%   r&   r'   r(   r)   r9   r   r*   r+   r,   r:   r;   r4   r!   r<   r   r=   r   r-   r   rB   r.   r$   r/   r0   r7   r7   M   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-44818DHAH
%* 
r/   r7   logitsr>   c                     [         R                  R                  U [        R                  " [        U 5      U R                  S9SS9$ )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr*   arangelenrN   )rK   s    r0   contrastive_lossrT   t   s5    ==&&vu||CKPVP]P]/^ps&ttr/   
similarityc                 X    [        U 5      n[        U R                  5       5      nX-   S-  $ )Ng       @)rT   t)rU   caption_loss
image_losss      r0   
align_lossrZ   x   s*    #J/L!*,,.1J%,,r/   confignum_channelsc                     U R                   nXR                  -  n[        U[        XS-  -   5      U-  U-  5      nUSU-  :  a  X2-  n[        U5      $ )z4
Round number of filters based on depth multiplier.
   g?)depth_divisorwidth_coefficientmaxint)r[   r\   divisornew_dims       r0   round_filtersre      s`     ""G,,,L'3|k9:gEOPG |##w<r/   kernel_sizeadjustc                     [        U [        5      (       a  X 4n U S   S-  U S   S-  4nU(       a  US   S-
  US   US   S-
  US   4$ US   US   US   US   4$ )a.  
Utility function to get the tuple padding value for the depthwise convolution.

Args:
    kernel_size (`int` or `tuple`):
        Kernel size of the convolution layers.
    adjust (`bool`, *optional*, defaults to `True`):
        Adjusts padding value to apply to right and bottom sides of the input.
r   r^   r   )
isinstancerb   )rf   rg   corrects      r0   correct_padrk      s~     +s##"01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r/   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	AlignVisionEmbeddings   zD
A module that corresponds to the stem module of the original work.
r[   c           	      |  > [         TU ]  5         [        US5      U l        [        R
                  " SS9U l        [        R                  " UR                  U R                  SSSSS9U l	        [        R                  " U R                  UR                  UR                  S	9U l        [        UR                     U l        g )
N    )r   r   r   r   paddingr	   r^   validFrf   striderr   bias)epsmomentum)super__init__re   out_dimr   	ZeroPad2drr   Conv2dr\   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr
   
hidden_act
activationrE   r[   	__class__s     r0   rz   AlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r/   pixel_valuesr>   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ N)rr   r~   r   r   )rE   r   featuress      r0   forwardAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r/   )r   r   r~   r{   rr   )r%   r&   r'   r(   r)   r   rz   r*   Tensorr   r.   __classcell__r   s   @r0   rm   rm      s5    	40 	4ELL U\\  r/   rm   c                   :   ^  \ rS rSr       SU 4S jjrSrU =r$ )AlignVisionDepthwiseConv2d   c	                 8   > X-  n	[         T
U ]  UU	UUUUUUUS9	  g )N)	in_channelsout_channelsrf   ru   rr   dilationgroupsrv   padding_mode)ry   rz   )rE   r   depth_multiplierrf   ru   rr   r   rv   r   r   r   s             r0   rz   #AlignVisionDepthwiseConv2d.__init__   s:     #5#%#% 	 
	
r/   r$   )r   r	   r   r   r   Tzeros)r%   r&   r'   r(   rz   r.   r   r   s   @r0   r   r      s$     
 
r/   r   c                   z   ^  \ rS rSrSrS\S\S\S\4U 4S jjrS\R                  S	\R                  4S
 jrSrU =r$ )AlignVisionExpansionLayer   zW
This corresponds to the expansion phase of each block in the original implementation.
r[   in_dimr{   ru   c                    > [         TU ]  5         [        R                  " UUSSSS9U l        [        R
                  " X1R                  S9U l        [        UR                     U l
        g )Nr   sameFr   r   rf   rr   rv   )num_featuresrw   )ry   rz   r   r}   expand_convr   r   	expand_bnr
   r   
expand_act)rE   r[   r   r{   ru   r   s        r0   rz   "AlignVisionExpansionLayer.__init__   sX    99 
 WBWBWX !2!23r/   r#   r>   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   rE   r#   s     r0   r   !AlignVisionExpansionLayer.forward   s4    ((7}56r/   )r   r   r   )r%   r&   r'   r(   r)   r   rb   rz   r*   r+   r   r   r.   r   r   s   @r0   r   r      sM    
40 
4# 
4 
4UX 
4U%6%6 5<<  r/   r   c            
       ~   ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjrS	\	R                  S
\	R                  4S jrSrU =r$ )AlignVisionDepthwiseLayer   zc
This corresponds to the depthwise convolution phase of each block in the original implementation.
r[   r   ru   rf   adjust_paddingc                 F  > [         TU ]  5         X0l        U R                  S:X  a  SOSn[        XES9n[        R
                  " US9U l        [        X$X6SS9U l        [        R                  " X!R                  UR                  S9U l        [        UR                     U l        g )	Nr^   rs   r   )rg   rq   Frt   r   rw   rx   )ry   rz   ru   rk   r   r|   depthwise_conv_padr   depthwise_convr   r   r   depthwise_normr
   r   depthwise_act)	rE   r[   r   ru   rf   r   conv_padrr   r   s	           r0   rz   "AlignVisionDepthwiseLayer.__init__   s     	"kkQ.7FkA"$,,w"?8FSX
 !nn%:%:VE_E_
 $F$5$56r/   r#   r>   c                     U R                   S:X  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ )Nr^   )ru   r   r   r   r   r   s     r0   r   !AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r/   )r   r   r   r   ru   r%   r&   r'   r(   r)   r   rb   boolrz   r*   r+   r   r   r.   r   r   s   @r0   r   r      s_    7!7 7 	7
 7 7,	U%6%6 	5<< 	 	r/   r   c            	       ~   ^  \ rS rSrSrSS\S\S\S\4U 4S jjjrS\	R                  S	\	R                  4S
 jrSrU =r$ )AlignVisionSqueezeExciteLayeri  zd
This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
r[   r   
expand_dimexpandc                   > [         TU ]  5         U(       a  UOUU l        [        S[	        X!R
                  -  5      5      U l        [        R                  " SS9U l	        [        R                  " U R                  U R                  SSS9U l        [        R                  " U R                  U R                  SSS9U l        [        UR                     U l        [        R                   " 5       U l        g )Nr   )output_sizer   )r   r   rf   rr   )ry   rz   dimra   rb   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezer}   reducer   r
   r   
act_reduceSigmoid
act_expand)rE   r[   r   r   r   r   s        r0   rz   &AlignVisionSqueezeExciteLayer.__init__!  s    !':V!S*H*H!HIJ++:ii	
 ii	
 !!2!23**,r/   r#   r>   c                     UnU R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      n[
        R                  " X!5      nU$ r   )r   r   r   r   r   r*   mul)rE   r#   inputss      r0   r   %AlignVisionSqueezeExciteLayer.forward6  sa    ]3M26M26		&8r/   )r   r   r   r   r   r   r   )Fr   r   s   @r0   r   r     sR    '0 '# '3 'X\ ' '*
U%6%6 
5<< 
 
r/   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\4U 4S	 jjr	S
\
R                  S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionFinalBlockLayeriC  zS
This corresponds to the final phase of each block in the original implementation.
r[   r   r{   ru   	drop_rateid_skipc                   > [         TU ]  5         US:H  =(       a    U(       + U l        [        R                  " UUSSSS9U l        [        R                  " X1R                  UR                  S9U l	        [        R                  " US9U l        g )Nr   r   Fr   r   )p)ry   rz   apply_dropoutr   r}   project_convr   r   r   
project_bnDropoutdropout)rE   r[   r   r{   ru   r   r   r   s          r0   rz   #AlignVisionFinalBlockLayer.__init__H  sx     	#q[8[II 
 .. &;&;fF`F`
 zzI.r/   
embeddingsr#   r>   c                     U R                  U5      nU R                  U5      nU R                  (       a  U R                  U5      nX!-   nU$ r   )r   r   r   r   )rE   r   r#   s      r0   r   "AlignVisionFinalBlockLayer.forwardY  sE    ))-86 LL7M)6Mr/   )r   r   r   r   r%   r&   r'   r(   r)   r   rb   floatr   rz   r*   r+   r   r   r.   r   r   s   @r0   r   r   C  so    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf  r/   r   c                      ^  \ rS rSrSrS\S\S\S\S\S\S	\S
\S\4U 4S jjr	S\
R                  S\
R                  4S jrSrU =r$ )AlignVisionBlockid  a1  
This corresponds to the block module of original the EfficientNet vision encoder implementation.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
    in_dim (`int`):
        Number of input channels.
    out_dim (`int`):
        Number of output channels.
    stride (`int`):
        Stride size to be used in convolution layers.
    expand_ratio (`int`):
        Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
    kernel_size (`int`):
        Kernel size for the depthwise convolution layer.
    drop_rate (`float`):
        Dropout rate to be used in the final phase of each block.
    id_skip (`bool`):
        Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
        of each block. Set to `True` for the first block of each stage.
    adjust_padding (`bool`):
        Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
        operation, set to `True` for inputs with odd input sizes.
r[   r   r{   ru   expand_ratiorf   r   r   r   c
           	      f  > [         TU ]  5         XPl        U R                  S:g  U l        X%-  n
U R                  (       a  [	        XXS9U l        [        UU R                  (       a  U
OUUUU	S9U l        [        XXR                  S9U l	        [        UU R                  (       a  U
OUUUUUS9U l        g )Nr   )r[   r   r{   ru   )r[   r   ru   rf   r   )r[   r   r   r   )r[   r   r{   ru   r   r   )ry   rz   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)rE   r[   r   r{   ru   r   rf   r   r   r   expand_in_dimr   s              r0   rz   AlignVisionBlock.__init__  s     	(''1,-;;6mDN 8$(KK=V#)
 <];;
 5$(KK=V
r/   r#   r>   c                     UnU R                   S:w  a  U R                  U5      nU R                  U5      nU R                  U5      nU R	                  X!5      nU$ Nr   )r   r   r   r   r   )rE   r#   r   s      r0   r   AlignVisionBlock.forward  sY    "
! NN=9M++M: ++M:
Br/   )r   r   r   r   r   r   r   r   s   @r0   r   r   d  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
 
r/   r   c            	       v   ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\
   S\	\
   S\4S	 jjrS
rU =r$ )AlignVisionEncoderi  z
Forward propagates the embeddings through each vision encoder (EfficientNet) block.

Args:
    config ([`AlignVisionConfig`]):
        Model configuration class.
r[   c                   >^ ^ [         TT ]  5         UR                  T l        U 4S jm[        UR                  5      n[        U4S jUR                   5       5      nSn/ n[        U5       H  n[        XR                  U   5      n[        XR                  U   5      nUR                  U   n	UR                  U   n
UR                  U   n[        T" UR                  U   5      5       Hc  nUS:H  nUS:  a  SOU	n	US:  a  UOUnXAR                  ;  nUR                  U-  U-  n[        UUUU	U
UUUUS9	nUR!                  U5        US-  nMe     M     ["        R$                  " U5      T l        g )Nc                 \   > [        [        R                  " TR                  U -  5      5      $ r   )rb   mathceildepth_coefficient)repeatsrE   s    r0   round_repeats2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr/   c              3   2   >#    U H  nT" U5      v   M     g 7fr   r$   )rC   nr   s     r0   rF   .AlignVisionEncoder.__init__.<locals>.<genexpr>  s     L3Kaq))3Ks   r   r   )	r[   r   r{   ru   rf   r   r   r   r   )ry   rz   r   rS   r   sumnum_block_repeatsrangere   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)rE   r[   num_base_blocks
num_blockscurr_block_numr  ir   r{   ru   rf   r   jr   r   r   blockr   r   s   `                @r0   rz   AlignVisionEncoder.__init__  sp   !'!9!9	D f001L63K3KLL
'A"6+=+=a+@AF#F,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEFq&!e$%Ev!/7O7O!O"44~E
R	(!!#! +!-'##1
 e$!#' G (8 mmF+r/   r#   output_hidden_statesreturn_dictr>   c                     U(       a  U4OS nU R                    H  nU" U5      nU(       d  M  XA4-  nM     U(       d  [        S X4 5       5      $ [        UUS9$ )Nc              3   ,   #    U H  oc  M  Uv   M     g 7fr   r$   )rC   vs     r0   rF   -AlignVisionEncoder.forward.<locals>.<genexpr>  s     X$Fq$Fs   	)r"   r#   )r  r-   r   )rE   r#   r  r  all_hidden_statesr
  s         r0   r   AlignVisionEncoder.forward  sh     1E],$[[E!-0M##!%55! !
 X]$FXXX-++
 	
r/   )r  r   )FT)r%   r&   r'   r(   r)   r   rz   r*   r+   r   r   r   r   r.   r   r   s   @r0   r   r     s\    ),0 ),\ 05&*	
((
 'tn
 d^	

 
2
 
r/   r   c                      ^  \ rS rSrSrU 4S jr    SS\\R                     S\\R                     S\\R                     S\\R                     S\R                  4
S	 jjrS
rU =r$ )AlignTextEmbeddingsi  zGConstruct the embeddings from word, position and token_type embeddings.c                 .  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  g )N)padding_idxrw   position_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_ids)dtype)ry   rz   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   rA   r  register_bufferr*   rR   r   r   r  sizelongr   s     r0   rz   AlignTextEmbeddings.__init__  s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r/   	input_idsr  r  inputs_embedsr>   c                 `   Ub  UR                  5       nOUR                  5       S S nUS   nUc  U R                  S S 2S U24   nUcv  [        U S5      (       a-  U R                  S S 2S U24   nUR	                  US   U5      nUnO8[
        R                  " U[
        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n	XI-   n
U R                  S:X  a  U R                  U5      nX-  n
U R                  U
5      n
U R                  U
5      n
U
$ )Nr  r   r  r   r  rN   r  )r-  r  hasattrr  r   r*   r   r.  rN   r$  r(  r  r&  r)  r   )rE   r0  r  r  r1  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr(  r   r&  s               r0   r   AlignTextEmbeddings.forward  s<     #..*K',,.s3K ^
,,Q^<L
 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r/   )r)  r   r  r&  r(  r$  )NNNN)r%   r&   r'   r(   r)   rz   r   r*   
LongTensorr+   r   r   r.   r   r   s   @r0   r  r    s    Q
* 15593759&E,,-& !!1!12& u//0	&
   1 12& 
& &r/   r  modulequerykeyvalueattention_maskscalingr   	head_maskc                    [         R                  " XR                  SS5      5      U-  n	Ub"  US S 2S S 2S S 2S UR                  S   24   n
X-   n	[        R
                  R                  U	S[         R                  S9R                  UR                  5      n	[        R
                  R                  XU R                  S9n	Ub  XR                  SSSS5      -  n	[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr^   r	   r  )r   r  )r   trainingr   )r*   matmul	transposeshaper   rP   softmaxfloat32tor  r   rD  view
contiguous)r;  r<  r=  r>  r?  r@  r   rA  kwargsattn_weightscausal_maskattn_outputs               r0   eager_attention_forwardrQ  >  s     <<}}Q':;gEL!$Q1o		"o%=>#1==((2U]](SVVW\WbWbcL==((6??([L#nnQAq&AA,,|3K''1-88:K$$r/   c                      ^  \ rS rSrU 4S jr   S
S\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	rU =r$ )AlignTextSelfAttentioniY  c                 6  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        UR                   U l        U R                  S-  U l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )ry   rz   r"  num_attention_headsr4  
ValueErrorr[   rb   attention_head_sizeall_head_sizer   Linearr<  r=  r>  r   attention_probs_dropout_probr   attention_dropoutr@  r   s     r0   rz   AlignTextSelfAttention.__init__Z  sD    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r/   r#   r?  rA  output_attentionsr>   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UU	U
U4U R                  (       d  SOU R                  U R                  US.UD6u  pUR                  " / UQSP76 R                  5       nU(       a  X4nU$ U4nU$ )Nr  r   r^   eager        )r   r@  rA  )rG  rY  r<  rK  rF  r=  r>  rQ  r[   _attn_implementationr   rD  r]  r@  reshaperL  )rE   r#   r?  rA  r_  rM  r5  hidden_shapequery_states
key_statesvalue_statesattention_interfacerP  rN  outputss                  r0   r   AlignTextSelfAttention.forwardo  s[    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL
%
 
%
! "));;;;FFH1B;- JUr/   )
rZ  r]  rY  r[   r   r=  rW  r<  r@  r>  NNF)r%   r&   r'   r(   rz   r*   r   r   r+   r   r-   r   r.   r   r   s   @r0   rS  rS  Y  st    60 7;15,1!||! !!2!23! E--.	!
 $D>! 
u||	! !r/   rS  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr  )ry   rz   r   r[  r"  denser)  r*  r   r+  r   r   s     r0   rz   AlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r/   r#   input_tensorr>   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rq  r   r)  rE   r#   rs  s      r0   r   AlignTextSelfOutput.forward  5    

=1]3}'CDr/   r)  rq  r   
r%   r&   r'   r(   rz   r*   r   r   r.   r   r   s   @r0   rn  rn    6    >U\\  RWR^R^  r/   rn  c                      ^  \ rS rSrU 4S jrS r   SS\R                  S\\R                     S\\R                     S\\
   S\\R                     4
S	 jjrS
rU =r$ )AlignTextAttentioni  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )ry   rz   rS  rE   rn  outputsetpruned_headsr   s     r0   rz   AlignTextAttention.__init__  s0    *62	)&1Er/   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r   )r   )rS   r   rE   rW  rY  r  r   r<  r=  r>  r  rq  rZ  union)rE   headsindexs      r0   prune_headsAlignTextAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r/   r#   r?  rA  r_  r>   c                 p    U R                   " U4UUUS.UD6nU R                  US   U5      nU4USS  -   nU$ N)r?  rA  r_  r   r   )rE   r  )	rE   r#   r?  rA  r_  rM  self_outputsattention_outputrj  s	            r0   r   AlignTextAttention.forward  s]     yy
)/	

 
  ;;|AF#%QR(88r/   )r  r  rE   rl  )r%   r&   r'   r(   rz   r  r*   r   r   r+   r   r-   r   r.   r   r   s   @r0   r}  r}    sy    ";* 7;15,1|| !!2!23 E--.	
 $D> 
u||	 r/   r}  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextIntermediatei  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )ry   rz   r   r[  r"  intermediate_sizerq  ri   r   strr
   intermediate_act_fnr   s     r0   rz   AlignTextIntermediate.__init__  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$r/   r#   r>   c                 J    U R                  U5      nU R                  U5      nU$ r   rq  r  r   s     r0   r   AlignTextIntermediate.forward  s&    

=100?r/   r  rz  r   s   @r0   r  r    s(    9U\\ ell  r/   r  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )AlignTextOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g rp  )ry   rz   r   r[  r  r"  rq  r)  r*  r   r+  r   r   s     r0   rz   AlignTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r/   r#   rs  r>   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   ru  rv  s      r0   r   AlignTextOutput.forward  rx  r/   ry  rz  r   s   @r0   r  r    r{  r/   r  c                      ^  \ rS rSrU 4S jr   SS\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	 rS
rU =r$ )AlignTextLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g r   )
ry   rz   chunk_size_feed_forwardseq_len_dimr}  	attentionr  intermediater  r  r   s     r0   rz   AlignTextLayer.__init__  sI    '-'E'E$+F31&9%f-r/   r#   r?  rA  r_  r>   c                     U R                   " U4UUUS.UD6nUS   nUSS  n[        U R                  U R                  U R                  U5      n	U	4U-   nU$ r  )r  r   feed_forward_chunkr  r  )
rE   r#   r?  rA  r_  rM  self_attention_outputsr  rj  layer_outputs
             r0   r   AlignTextLayer.forward  s     "&"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r/   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r  )rE   r  intermediate_outputr  s       r0   r  !AlignTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir/   )r  r  r  r  r  rl  )r%   r&   r'   r(   rz   r*   r   r   r+   r   r-   r   r  r.   r   r   s   @r0   r  r    sy    . 7;15,1|| !!2!23 E--.	
 $D> 
u||	2 r/   r  c                      ^  \ rS rSrU 4S jr\     SS\R                  S\\R                     S\\R                     S\\
   S\\
   S\\
   S	\\\R                     \4   4S
 jj5       rSrU =r$ )AlignTextEncoderi  c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
ry   rz   r[   r   r  r   num_hidden_layersr  layergradient_checkpointing)rE   r[   r  r   s      r0   rz   AlignTextEncoder.__init__  sR    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A%r#   r?  rA  r_  r  r  r>   c           	         U(       a  SOS nU(       a  SOS n	[        U R                  5       H=  u  pU(       a  X4-   nUb  X:   OS nU" SUUUUS.UD6nUS   nU(       d  M5  XS   4-   n	M?     U(       a  X4-   n[        UUU	S9$ )Nr$   )r#   r?  rA  r_  r   r   )r"   r#   r5   )	enumerater  r   )rE   r#   r?  rA  r_  r  r  rM  r  all_self_attentionsr  layer_modulelayer_head_masklayer_outputss                 r0   r   AlignTextEncoder.forward  s     #7BD$5b4(4OA#$58H$H!.7.CilO( +-)"3	
 M *!,M  &91=M<O&O#!  5$   14D D++*
 	
r/   )r[   r  r  )NNFFT)r%   r&   r'   r(   rz   r   r*   r   r   r+   r   r   r-   r   r   r.   r   r   s   @r0   r  r    s    ,  7;15,1/4&*&
||&
 !!2!23&
 E--.	&

 $D>&
 'tn&
 d^&
 
uU\\"O3	4&
 &
r/   r  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )AlignTextPooleriH  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )ry   rz   r   r[  r"  rq  Tanhr   r   s     r0   rz   AlignTextPooler.__init__I  s9    YYv1163E3EF
'')r/   r#   r>   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )rq  r   )rE   r#   first_token_tensorpooled_outputs       r0   r   AlignTextPooler.forwardN  s6     +1a40

#566r/   )r   rq  rz  r   s   @r0   r  r  H  s(    $
U\\ ell  r/   r  c                   J    \ rS rSr% \\S'   SrSrS\R                  4S jr
Srg)	AlignPreTrainedModeliW  r[   alignTr;  c                    U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b$  UR                  R                  R                  5         GO1[        U[        5      (       a  [        R                  R                  UR                  R                  5        UR                  R                  R                  R                  5         UR                  R                  R!                  U R                   R"                  5        O[        U[        R$                  5      (       ab  UR                  R                  R                  SUS9  UR&                  b1  UR                  R                  UR&                     R                  5         [        U[        R(                  [        R*                  45      (       aJ  UR                  R                  R                  5         UR                  R                  R!                  S5        gg)zInitialize the weightsrb  )meanstdNg      ?)r[   initializer_rangeri   r   r[  r}   weightdatanormal_rv   zero_
AlignModelinitxavier_uniform_text_projectiontemperaturefill_temperature_init_valuer   r  r)  r   )rE   r;  r  s      r0   _init_weights"AlignPreTrainedModel._init_weights]  s~   kk++fryy"))455MM&&CS&9{{&  &&(
++GG##F$:$:$A$AB""'',,224##))$++*L*LM--MM&&CS&9!!-""6#5#56<<>fr||R^^<==KK""$MM$$S) >r/   r$   N)r%   r&   r'   r(   r   r,   base_model_prefixsupports_gradient_checkpointingr   Moduler  r.   r$   r/   r0   r  r  W  s$    &*#*BII *r/   r  zJ
    The text model from ALIGN without any head or projection on top.
    c                   j  ^  \ rS rSr% \\S'   S/rSS\S\4U 4S jjjrS r	S r
\\         SS\\R                     S	\\R                     S
\\R                     S\\R                     S\\R                      S\\R                     S\\   S\\   S\\   S\\\4   4S jj5       5       rSrU =r$ )AlignTextModeliq  r[   r  add_pooling_layerc                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U(       a  [        U5      OSU l        U R                  5         g)z^
add_pooling_layer (bool, *optional*, defaults to `True`):
    Whether to add a pooling layer
N)
ry   rz   r[   r  r   r  encoderr  pooler	post_init)rE   r[   r  r   s      r0   rz   AlignTextModel.__init__z  sK    
 	 -f5'/1Bof- 	r/   c                 .    U R                   R                  $ r   r   r$  rI   s    r0   get_input_embeddings#AlignTextModel.get_input_embeddings  s    ...r/   c                 $    XR                   l        g r   r  )rE   r>  s     r0   set_input_embeddings#AlignTextModel.set_input_embeddings  s    */'r/   r0  r?  r  r  rA  r1  r_  r  r  r>   c
           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	Ub  Ub  [	        S5      eUb"  U R                  X5        UR                  5       nO"Ub  UR                  5       SS nO[	        S5      eUu  pUb  UR                  OUR                  nUc  [        R                  " X4US9nUcr  [        U R                  S5      (       a3  U R                  R                  SS2SU24   nUR                  X5      nUnO$[        R                  " U[        R                  US9nU R!                  X+5      nU R#                  XPR                   R$                  5      nU R                  UUUUS9nU R&                  " U4UUUUS	S
.U
D6nUS   nU R(                  b  U R)                  U5      OSn[+        UUUR,                  UR.                  S9$ )a  
Examples:

```python
>>> from transformers import AutoTokenizer, AlignTextModel

>>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
```NzDYou cannot specify both input_ids and inputs_embeds at the same timer  z5You have to specify either input_ids or inputs_embedsrM   r  r3  )r0  r  r  r1  T)r?  rA  r_  r  r  r   )r"   pooler_outputr#   r5   )r[   r_  r  use_return_dictrX  %warn_if_padding_and_no_attention_maskr-  rN   r*   onesr4  r   r  r   r   r.  get_extended_attention_maskget_head_maskr  r  r  r   r#   r5   )rE   r0  r?  r  r  rA  r1  r_  r  r  rM  r5  
batch_sizer6  rN   r7  r8  extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                         r0   r   AlignTextModel.forward  s   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66yQ#..*K&',,.s3KTUU!,
%.%:!!@T@T!"ZZ*)A6RN!t(899*.//*H*HKZK*X'3J3Q3QR\3i0!A!&[

SY!Z 150P0PQ_0m &&y++2O2OP	??%)'	 + 
 ,,
2/!5
 
 *!,8<8OO4UY)-')77&11	
 	
r/   r[   r   r  r  T	NNNNNNNNN)r%   r&   r'   r(   r   r,   _no_split_modulesr   rz   r  r  r   r   r   r*   r   r+   r   r-   r   r   r.   r   r   s   @r0   r  r  q  s,    ./ 4   /0  -11515/31504,0/3&*\
ELL)\
 !.\
 !.	\

 u||,\
 E--.\
  -\
 $D>\
 'tn\
 d^\
 
u00	1\
  \
r/   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                      ^  \ rS rSr% \\S'   SrSrS\4U 4S jjrS\	R                  4S jr\\   SS\\R                      S\\   S	\\   S\\\4   4S
 jj5       5       rSrU =r$ )AlignVisionModeli  r[   r   Fc                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:X  a%  [        R                  " UR                  SS9U l        OMUR                  S:X  a%  [        R                  " UR                  SS9U l        O[        SUR                   35      eU R                  5         g )Nr  T)	ceil_modera   z2config.pooling must be one of ['mean', 'max'] got )ry   rz   r[   rm   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2drX  poolingr  r   s     r0   rz   AlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r/   r>   c                 B    U R                   R                  R                  $ r   )vision_modelr   r~   rI   s    r0   r  %AlignVisionModel.get_input_embeddings  s      ++777r/   r  r  c                 `   Ub  UOU R                   R                  nUb  UOU R                   R                  nUc  [        S5      eU R	                  U5      nU R                  UUSS9nUS   nU R                  U5      nUR                  UR                  SS 5      n[        UUUR                  S9$ )a\  
Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AlignVisionModel

>>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
>>> pooled_output = outputs.pooler_output  # pooled CLS states
```Nz You have to specify pixel_valuesT)r  r  r   r^   )r"   r  r#   )r[   r  r  rX  r   r  r  rd  rG  r   r#   )rE   r   r  r  r  r  r"   r  s           r0   r   AlignVisionModel.forward  s    : %9$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,!5 ' 
 ,A.$56%--m.A.A"1.EF7/')77
 	
r/   r  NNN)r%   r&   r'   r(   r   r,   main_input_namer  rz   r   r  r  r   r   r   r*   r+   r   r   r-   r   r   r.   r   r   s   @r0   r  r    s     $O&+#0 "8bii 8  59/3&*	2
u0012
 'tn2
 d^	2

 
u>>	?2
  2
r/   r  c                     ^  \ rS rSr% \\S'   S\4U 4S jjr\         SS\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S	\\	R                     S
\\   S\\   S\\   S\	R                  4S jj5       r\   SS\\	R                     S\\   S\\   S\	R                  4S jj5       r\\           SS\\	R                      S\\	R                     S\\	R                     S\\	R                     S\\	R                     S\\	R                     S	\\	R                     S\\   S
\\   S\\   S\\   S\\\4   4S jj5       5       rSrU =r$ )r  iF  r[   c                   > [         TU ]  U5        [        UR                  [        5      (       d"  [        S[        UR                  5       S35      e[        UR                  [        5      (       d"  [        S[        UR                  5       S35      eUR                  nUR                  nUR                  U l	        UR                  U l        [        U5      U l        [        U5      U l        [         R"                  " U R                  U R                  5      U l        [         R&                  " [(        R*                  " U R,                  R.                  5      5      U l        U R3                  5         g )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )ry   rz   ri   text_configr   	TypeErrortypevision_configr   projection_dimr"  text_embed_dimr  
text_modelr  r  r   r[  r  	Parameterr*   tensorr[   r  r  r  )rE   r[   r  r  r   s       r0   rz   AlignModel.__init__J  s)    &,,o>>++,-Q0 
 &..0ABB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r/   r0  r?  r  r  rA  r1  r_  r  r  r>   c
                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU	b  U	OU R                   R                  n	U R	                  UUUUUUUUU	S9	n
U
S   SS2SSS24   nU R                  U5      nU$ )a7  
Returns:
    text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
    applying the projection layer to the pooled output of [`AlignTextModel`].

Examples:

```python
>>> from transformers import AutoTokenizer, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

>>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
>>> text_features = model.get_text_features(**inputs)
```N	r0  r?  r  r  rA  r1  r_  r  r  r   )r[   r_  r  r  r  r  )rE   r0  r?  r  r  rA  r1  r_  r  r  text_outputsr"   text_featuress                r0   get_text_featuresAlignModel.get_text_featuresh  s    < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]))%'/!5# ' 

 )OAq!G4,,->?r/   r   c                     Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  UUUS9nUS   nU$ )a  
Returns:
    image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
    applying the projection layer to the pooled output of [`AlignVisionModel`].

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, return_tensors="pt")

>>> image_features = model.get_image_features(**inputs)
```r   r  r  r   )r[   r  r  r  )rE   r   r  r  vision_outputsimage_featuress         r0   get_image_featuresAlignModel.get_image_features  sf    > %9$D $++JjJj 	 &1%<k$++B]B]**%!5# + 
 (*r/   return_lossc                 R   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R	                  UU
SS9nU R                  UUUUUUU	U
SS9	nUS   nUS   SS2SSS24   nU R                  U5      nXR                  SSSS	9-  nXR                  SSSS	9-  n[        R                  " XR                  5       5      U R                  -  nUR                  5       nSnU(       a  [        U5      n[        UUUUUUUS
9$ )aA  
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AlignModel

>>> model = AlignModel.from_pretrained("kakaobrain/align-base")
>>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(
...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
... )

>>> outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
```NTr!  r  r   r   r^   r  )r   r   keepdim)r9   r:   r;   r4   r!   r<   r=   )r[   r_  r  r  r  r  r  normr*   rE  rW   r  rZ   r7   )rE   r0  r   r?  r  r  rA  r1  r&  r_  r  r  r"  r  r!   r4   r;   r:   r9   s                      r0   r   AlignModel.forward  sl   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%!5 + 
 ))%'/!5 ' 

 &a("1oaAg.**;7 $&7&7!T&7&RR!$4$4qb$$4$OO  ,,{NN4DEHXHXX*,,.o.D-+#%* .
 	
r/   )r  r  r  r  r  r  r  r  )NNNNNNNNNNN)r%   r&   r'   r(   r   r,   rz   r   r   r*   r   r   r+   r  r$  r   r:  r   r-   r7   r   r.   r   r   s   @r0   r  r  F  sp   { <  -11515/3,004,0/3&*2ELL)2 !.2 !.	2
 u||,2 ELL)2  -2 $D>2 'tn2 d^2 
		2 2h  59/3&*	*u001* 'tn* d^	*
 
		* *X  15481515/3,004&*,0/3&*X
E,,-X
 u001X
 !.	X

 !.X
 u||,X
 ELL)X
  -X
 d^X
 $D>X
 'tnX
 d^X
 
uk!	"X
  X
r/   r  )r  r  r  r  r  )rb  N)Mr)   r   dataclassesr   typingr   r   r   r   r*   torch.utils.checkpointr   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_alignr   r   r   
get_loggerr%   loggerr   r2   r7   r   rT   rZ   rb   re   r-   r   rk   r  rm   r}   r   r   r   r   r   r   r   r  r   rQ  rS  rn  r}  r  r  r  r  r  r  r  r  r  __all__r$   r/   r0   <module>r8     sp     ! 1 1    ! 9  G l l K K P P 
		H	% 
=[ = = 
	:; 	: 	:  
+  
   
JuU\\ uell u-5<< -ELL -+ 3  @U3:. @ @*BII 4
 
6		 6$		 $P$BII $N BNryy NbG
 G
T<")) <L (,%II%<<% 
% <<	%
 U\\*% % % %%67RYY 7v")) * *\BII  bii %/ %P.
ryy .
dbii  *? * *2 
x
) x

x
v 
M
+ M

M
` ]
% ]
 ]
@ Wr/   