
    Ph	                     2   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZmZ ddlmZmZmZ ddlmZmZmZmZm Z  ddl!m"Z"m#Z#m$Z$  e jJ                  e&      Z'e ed       G d de                    Z(e ed       G d de                    Z)ee G d de                    Z*de	jV                  de	jV                  fdZ,de	jV                  de	jV                  fdZ-de$de.fdZ/dYd ee.e0f   d!e1fd"Z2 G d# d$e
jf                        Z4 G d% d&e
jj                        Z6 G d' d(e
jf                        Z7 G d) d*e
jf                        Z8 G d+ d,e
jf                        Z9 G d- d.e
jf                        Z: G d/ d0e
jf                        Z; G d1 d2e
jf                        Z< G d3 d4e
jf                        Z=	 	 dZd5e
jf                  d6e	jV                  d7e	jV                  d8e	jV                  d9ee	jV                     d:e>d;e>d<ee	jV                     fd=Z? G d> d?e
jf                        Z@ G d@ dAe
jf                        ZA G dB dCe
jf                        ZB G dD dEe
jf                        ZC G dF dGe
jf                        ZD G dH dIe      ZE G dJ dKe
jf                        ZF G dL dMe
jf                        ZGe G dN dOe             ZH edP       G dQ dReH             ZI edS       G dT dUeH             ZJe G dV dWeH             ZKg dXZLy)[zPyTorch ALIGN model.    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithNoAttentionBaseModelOutputWithPooling(BaseModelOutputWithPoolingAndNoAttention)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringcan_return_tuplefilter_out_non_signature_kwargslogging   )AlignConfigAlignTextConfigAlignVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   y)AlignVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r!   r   torchFloatTensor__annotations__r"   r#   tuple     b/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/models/align/modeling_align.pyr    r    )   sN    
 15L(5,,-459x 1 1298<M8E%"3"345<r-   r    ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)AlignTextModelOutputz
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    Ntext_embedsr"   r#   
attentions)r$   r%   r&   r'   r1   r   r(   r)   r*   r"   r#   r+   r2   r,   r-   r.   r0   r0   :   sh    
 04K%++,359x 1 1298<M8E%"3"345<59Ju00129r-   r0   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed<   dZeed	<   d
ee   fdZy)AlignOutputar  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The output of [`AlignVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`AlignTextModel`].
    vision_model_output (`BaseModelOutputWithPoolingAndNoAttention`):
        The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr1   r!   text_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r8   r9   N)getattrto_tuple).0kselfs     r.   	<genexpr>z'AlignOutput.to_tuple.<locals>.<genexpr>k   s=      
   LLDGRYZ^`aRbRkRkRmm s   -0)r+   keysrA   s   `r.   r>   zAlignOutput.to_tuplej   s#     
YY[
 
 	
r-   )r$   r%   r&   r'   r5   r   r(   r)   r*   r6   r7   r1   r!   r8   r   r9   r   r+   r   r>   r,   r-   r.   r4   r4   L   s    & )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-44818DHAH
%* 
r-   r4   logitsr:   c                     t         j                  j                  | t        j                  t        |       | j                        d      S )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr(   arangelenrH   )rE   s    r.   contrastive_lossrN   s   s5    ==&&vu||CKPVP]P]/^ps&ttr-   
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)rN   t)rO   caption_loss
image_losss      r.   
align_lossrT   w   s,    #J/L!*,,.1J:%,,r-   confignum_channelsc                     | j                   }|| j                  z  }t        |t        ||dz  z         |z  |z        }|d|z  k  r||z  }t        |      S )z<
    Round number of filters based on depth multiplier.
       g?)depth_divisorwidth_coefficientmaxint)rU   rV   divisornew_dims       r.   round_filtersr_   ~   sf     ""GF,,,L'3|gk9:gEOPG |##7w<r-   kernel_sizeadjustc                     t        | t              r| | f} | d   dz  | d   dz  f}|r|d   dz
  |d   |d   dz
  |d   fS |d   |d   |d   |d   fS )aJ  
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    r   rX   r   )
isinstancer\   )r`   ra   corrects      r.   correct_padre      s}     +s#"K01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r-   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )AlignVisionEmbeddingszL
    A module that corresponds to the stem module of the original work.
    rU   c                    t         |           t        |d      | _        t	        j
                  d      | _        t	        j                  |j                  | j                  dddd      | _	        t	        j                  | j                  |j                  |j                  	      | _        t        |j                     | _        y )
N    )r   r   r   r   paddingr	   rX   validFr`   striderk   bias)epsmomentum)super__init__r_   out_dimr   	ZeroPad2drk   Conv2drV   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr
   
hidden_act
activationrA   rU   	__class__s     r.   rs   zAlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r-   pixel_valuesr:   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)rk   rw   r{   r}   )rA   r   featuress      r.   forwardzAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r-   )
r$   r%   r&   r'   r   rs   r(   Tensorr   __classcell__r   s   @r.   rg   rg      s0    	40 	4ELL U\\ r-   rg   c                   .     e Zd Z	 	 	 	 	 	 	 d fd	Z xZS )AlignVisionDepthwiseConv2dc	                 @    ||z  }	t         
|   ||	|||||||	       y )N)	in_channelsout_channelsr`   rn   rk   dilationgroupsro   padding_mode)rr   rs   )rA   r   depth_multiplierr`   rn   rk   r   ro   r   r   r   s             r.   rs   z#AlignVisionDepthwiseConv2d.__init__   s=     #%55#%#% 	 
	
r-   )r   r	   r   r   r   Tzeros)r$   r%   r&   rs   r   r   s   @r.   r   r      s$     
 
r-   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z
 xZS )
AlignVisionExpansionLayerz_
    This corresponds to the expansion phase of each block in the original implementation.
    rU   in_dimrt   rn   c                     t         |           t        j                  ||ddd      | _        t        j
                  ||j                        | _        t        |j                     | _
        y )Nr   sameFr   r   r`   rk   ro   )num_featuresrp   )rr   rs   r   rv   expand_convrx   ry   	expand_bnr
   r|   
expand_act)rA   rU   r   rt   rn   r   s        r.   rs   z"AlignVisionExpansionLayer.__init__   sZ    99 
 W&BWBWX !2!23r-   r#   r:   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   rA   r#   s     r.   r   z!AlignVisionExpansionLayer.forward   s4    ((7}56r-   )r$   r%   r&   r'   r   r\   rs   r(   r)   r   r   r   r   s   @r.   r   r      sH    
40 
4# 
4 
4UX 
4U%6%6 5<< r-   r   c            
       p     e Zd ZdZdededededef
 fdZdej                  d	ej                  fd
Z xZS )AlignVisionDepthwiseLayerzk
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    rU   r   rn   r`   adjust_paddingc                 b   t         |           || _        | j                  dk(  rdnd}t        ||      }t	        j
                  |      | _        t        ||||d      | _        t	        j                  ||j                  |j                        | _        t        |j                     | _        y )	NrX   rl   r   )ra   rj   Frm   r   rp   rq   )rr   rs   rn   re   r   ru   depthwise_conv_padr   depthwise_convrx   ry   rz   depthwise_normr
   r|   depthwise_act)	rA   rU   r   rn   r`   r   conv_padrk   r   s	           r.   rs   z"AlignVisionDepthwiseLayer.__init__   s     	"kkQ.7Fk.A"$,,w"?8FHSX
 !nnV%:%:VE_E_
 $F$5$56r-   r#   r:   c                     | j                   dk(  r| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S )NrX   )rn   r   r   r   r   r   s     r.   r   z!AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r-   r$   r%   r&   r'   r   r\   boolrs   r(   r)   r   r   r   r   s   @r.   r   r      sZ    7!7 7 	7
 7 7,	U%6%6 	5<< 	r-   r   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZS )AlignVisionSqueezeExciteLayerzl
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    rU   r   
expand_dimexpandc                    t         |           |r|n|| _        t        dt	        ||j
                  z              | _        t        j                  d      | _	        t        j                  | j                  | j                  dd      | _        t        j                  | j                  | j                  dd      | _        t        |j                     | _        t        j                          | _        y )Nr   )output_sizer   )r   r   r`   rk   )rr   rs   dimr[   r\   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezerv   reducer   r
   r|   
act_reduceSigmoid
act_expand)rA   rU   r   r   r   r   s        r.   rs   z&AlignVisionSqueezeExciteLayer.__init__   s    !':V!S&*H*H!HIJ++:ii	
 ii	
 !!2!23**,r-   r#   r:   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }t        j                  ||      }|S r   )r   r   r   r   r   r(   mul)rA   r#   inputss      r.   r   z%AlignVisionSqueezeExciteLayer.forward5  sc    ]3M26M26		&-8r-   )Fr   r   s   @r.   r   r     sH    '0 '# '3 'X\ '*
U%6%6 
5<< 
r-   r   c                        e Zd ZdZdedededededef fdZd	e	j                  d
e	j                  de	j                  fdZ xZS )AlignVisionFinalBlockLayerz[
    This corresponds to the final phase of each block in the original implementation.
    rU   r   rt   rn   	drop_rateid_skipc                    t         |           |dk(  xr | | _        t        j                  ||ddd      | _        t        j                  ||j                  |j                        | _	        t        j                  |      | _        y )Nr   r   Fr   r   )p)rr   rs   apply_dropoutr   rv   project_convrx   ry   rz   
project_bnDropoutdropout)rA   rU   r   rt   rn   r   r   r   s          r.   rs   z#AlignVisionFinalBlockLayer.__init__G  sz     	#q[8[II 
 .. f&;&;fF`F`
 zzI.r-   
embeddingsr#   r:   c                     | j                  |      }| j                  |      }| j                  r| j                  |      }||z   }|S r   )r   r   r   r   )rA   r   r#   s      r.   r   z"AlignVisionFinalBlockLayer.forwardX  sG    ))-86 LL7M)J6Mr-   r$   r%   r&   r'   r   r\   floatr   rs   r(   r)   r   r   r   r   s   @r.   r   r   B  sj    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf r-   r   c                        e Zd ZdZdededededededed	ed
ef fdZde	j                  de	j                  fdZ xZS )AlignVisionBlocka  
    This corresponds to the block module of original the EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    rU   r   rt   rn   expand_ratior`   r   r   r   c
                 l   t         |           || _        | j                  dk7  | _        ||z  }
| j                  rt	        |||
|      | _        t        || j                  r|
n||||	      | _        t        |||
| j                        | _	        t        || j                  r|
n|||||      | _        y )Nr   )rU   r   rt   rn   )rU   r   rn   r`   r   )rU   r   r   r   )rU   r   rt   rn   r   r   )rr   rs   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)rA   rU   r   rt   rn   r   r`   r   r   r   expand_in_dimr   s              r.   rs   zAlignVisionBlock.__init__~  s     	(''1,-;;6fmFDN 8$(KK=V#)
 <&]4;;
 5$(KK=V
r-   r#   r:   c                     |}| j                   dk7  r| j                  |      }| j                  |      }| j                  |      }| j	                  ||      }|S Nr   )r   r   r   r   r   )rA   r#   r   s      r.   r   zAlignVisionBlock.forward  s[    "
! NN=9M++M: ++M:
MBr-   r   r   s   @r.   r   r   c  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
r-   r   c            	       f     e Zd ZdZdef fdZ	 	 d	dej                  dee	   dee	   de
fdZ xZS )
AlignVisionEncoderz
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    rU   c                     t                    |j                   _         fdt        |j                        }t        fd|j                  D              }d}g }t        |      D ]  }t        ||j                  |         }t        ||j                  |         }|j                  |   }	|j                  |   }
|j                  |   }t         |j                  |               D ]c  }|dk(  }|dkD  rdn|	}	|dkD  r|n|}||j                  v}|j                  |z  |z  }t        ||||	|
||||	      }|j!                  |       |dz  }e  t#        j$                  |       _        y )Nc                 Z    t        t        j                  j                  | z              S r   )r\   mathceildepth_coefficient)repeatsrA   s    r.   round_repeatsz2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr-   c              3   .   K   | ]  } |        y wr   r,   )r?   nr   s     r.   rB   z.AlignVisionEncoder.__init__.<locals>.<genexpr>  s     L3Kaq)3Ks   r   r   )	rU   r   rt   rn   r`   r   r   r   r   )rr   rs   r   rM   r   sumnum_block_repeatsranger_   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)rA   rU   num_base_blocks
num_blockscurr_block_numr   ir   rt   rn   r`   r   jr   r   r   blockr   r   s   `                @r.   rs   zAlignVisionEncoder.__init__  sv   !'!9!9	D f001L63K3KLL
'A"66+=+=a+@AF#FF,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEFq&!e$%Ev!/v7O7O!O"44~E
R	(!!#! +!-'##1
 e$!#' G (8 mmF+r-   r#   output_hidden_statesreturn_dictr:   c                     |r|fnd }| j                   D ]  } ||      }|s||fz  } |st        d ||fD              S t        ||      S )Nc              3   &   K   | ]	  }||  y wr   r,   )r?   vs     r.   rB   z-AlignVisionEncoder.forward.<locals>.<genexpr>  s     X$Fq!-$Fs   )r"   r#   )r   r+   r   )rA   r#   r   r   all_hidden_statesr   s         r.   r   zAlignVisionEncoder.forward  sj     1E],$[[E!-0M#!m%55! !
 X]4E$FXXX-++
 	
r-   )FT)r$   r%   r&   r'   r   rs   r(   r)   r   r   r   r   r   r   s   @r.   r   r     sW    ),0 ),\ 05&*	
((
 'tn
 d^	

 
2
r-   r   c                        e Zd ZdZ fdZ	 	 	 	 d	deej                     deej                     deej                     deej                     dej                  f
dZ
 xZS )
AlignTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 >   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       y )N)padding_idxrp   position_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_ids)dtype)rr   rs   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   r=   r   register_bufferr(   rL   r   r   r   sizelongr~   s     r.   rs   zAlignTextEmbeddings.__init__  s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r-   	input_idsr  r   inputs_embedsr:   c                 T   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  dk(  r| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr   r   r  r   r  rH   r   )r  r   hasattrr  r   r(   r   r  rH   r  r  r   r
  r  r   )rA   r  r  r   r  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  r   r
  s               r.   r   zAlignTextEmbeddings.forward  s?     #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r-   )NNNN)r$   r%   r&   r'   rs   r   r(   
LongTensorr)   r   r   r   r   s   @r.   r   r     s~    Q
* 15593759&E,,-& !!1!12& u//0	&
   1 12& 
&r-   r   modulequerykeyvalueattention_maskscalingr   	head_maskc                 .   t        j                  ||j                  dd            |z  }	|#|d d d d d d d |j                  d   f   }
|	|
z   }	t        j
                  j                  |	dt         j                        j                  |j                        }	t        j
                  j                  |	|| j                        }	||	|j                  dddd      z  }	t        j                  |	|      }|j                  dd      j                         }||	fS )NrX   r	   r   )r   r  )r   trainingr   )r(   matmul	transposeshaper   rJ   softmaxfloat32tor  r   r&  view
contiguous)r  r  r  r   r!  r"  r   r#  kwargsattn_weightscausal_maskattn_outputs               r.   eager_attention_forwardr3  =  s     <<s}}Q':;gEL!$Q1o		"o%=>#k1==((2U]](SVVW\WbWbcL==((6??([L#innQAq&AA,,|U3K''1-88:K$$r-   c                        e Zd Z fdZ	 	 	 ddej
                  deej                     deej                     dee   de	ej
                     f
dZ
 xZS )	AlignTextSelfAttentionc                 $   t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                         | _        |j                   | _        | j                  dz  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()g      )rr   rs   r  num_attention_headsr  
ValueErrorrU   r\   attention_head_sizeall_head_sizer   Linearr  r  r   r   attention_probs_dropout_probr   attention_dropoutr"  r~   s     r.   rs   zAlignTextSelfAttention.__init__Y  sC    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r-   r#   r!  r#  output_attentionsr:   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
t        }| j                  j                  dk7  rt        | j                  j                     } || ||	|
|f| j                  sdn| j                  | j                  |d|\  }} |j                  g |d j                         }|r||f}|S |f}|S )Nr   r   rX   eager        )r   r"  r#  )r)  r;  r  r-  r(  r  r   r3  rU   _attn_implementationr   r&  r?  r"  reshaper.  )rA   r#   r!  r#  r@  r/  r  hidden_shapequery_states
key_statesvalue_statesattention_interfacer2  r0  outputss                  r.   r   zAlignTextSelfAttention.forwardn  sa    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL
%
 
%
!\ *k));;;;FFH1B;- JUr-   NNF)r$   r%   r&   rs   r(   r   r   r)   r   r+   r   r   r   s   @r.   r5  r5  X  so    60 7;15,1!||! !!2!23! E--.	!
 $D>! 
u||	!r-   r5  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )rr   rs   r   r=  r  denser  r  r   r  r   r~   s     r.   rs   zAlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r-   r#   input_tensorr:   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rQ  r   r  rA   r#   rR  s      r.   r   zAlignTextSelfOutput.forward  7    

=1]3}|'CDr-   r$   r%   r&   rs   r(   r   r   r   r   s   @r.   rN  rN    1    >U\\  RWR^R^ r-   rN  c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
AlignTextAttentionc                     t         |           t        |      | _        t	        |      | _        t               | _        y r   )rr   rs   r5  rA   rN  outputsetpruned_headsr~   s     r.   rs   zAlignTextAttention.__init__  s0    *62	)&1Er-   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   )r   )rM   r   rA   r9  r;  r^  r   r  r  r   r\  rQ  r<  union)rA   headsindexs      r.   prune_headszAlignTextAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r-   r#   r!  r#  r@  r:   c                 p     | j                   |f|||d|}| j                  |d   |      }|f|dd  z   }|S N)r!  r#  r@  r   r   )rA   r\  )	rA   r#   r!  r#  r@  r/  self_outputsattention_outputrK  s	            r.   r   zAlignTextAttention.forward  s_     !tyy
)/	

 
  ;;|AF#%QR(88r-   rL  )r$   r%   r&   rs   rc  r(   r   r   r)   r   r+   r   r   r   s   @r.   rZ  rZ    st    ";* 7;15,1|| !!2!23 E--.	
 $D> 
u||	r-   rZ  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rr   rs   r   r=  r  intermediate_sizerQ  rc   r|   strr
   intermediate_act_fnr~   s     r.   rs   zAlignTextIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r-   r#   r:   c                 J    | j                  |      }| j                  |      }|S r   )rQ  rm  r   s     r.   r   zAlignTextIntermediate.forward  s&    

=100?r-   rW  r   s   @r.   ri  ri    s#    9U\\ ell r-   ri  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y rP  )rr   rs   r   r=  rk  r  rQ  r  r  r   r  r   r~   s     r.   rs   zAlignTextOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r-   r#   rR  r:   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rT  rU  s      r.   r   zAlignTextOutput.forward  rV  r-   rW  r   s   @r.   rp  rp    rX  r-   rp  c                        e Zd Z fdZ	 	 	 d	dej
                  deej                     deej                     dee   de	ej
                     f
dZ
d Z xZS )
AlignTextLayerc                     t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |      | _	        y r   )
rr   rs   chunk_size_feed_forwardseq_len_dimrZ  	attentionri  intermediaterp  r\  r~   s     r.   rs   zAlignTextLayer.__init__  sI    '-'E'E$+F31&9%f-r-   r#   r!  r#  r@  r:   c                      | j                   |f|||d|}|d   }|dd  }t        | j                  | j                  | j                  |      }	|	f|z   }|S re  )rx  r   feed_forward_chunkrv  rw  )
rA   r#   r!  r#  r@  r/  self_attention_outputsrg  rK  layer_outputs
             r.   r   zAlignTextLayer.forward  s     "0"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r-   c                 L    | j                  |      }| j                  ||      }|S r   )ry  r\  )rA   rg  intermediate_outputr}  s       r.   r{  z!AlignTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr-   rL  )r$   r%   r&   rs   r(   r   r   r)   r   r+   r   r{  r   r   s   @r.   rt  rt    st    . 7;15,1|| !!2!23 E--.	
 $D> 
u||	2r-   rt  c                        e Zd Z fdZe	 	 	 	 	 d
dej                  deej                     deej                     dee	   dee	   dee	   de
eej                     ef   fd	       Z xZS )AlignTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rr   rs   rU   r   r   r   num_hidden_layersrt  layergradient_checkpointing)rA   rU   r   r   s      r.   rs   zAlignTextEncoder.__init__  sT    ]]E&JbJbDc#dDcqN6$:Dc#de
&+# $es   A#r#   r!  r#  r@  r   r   r:   c           	          |rdnd }|rdnd }	t        | j                        D ]4  \  }
}|r||fz   }|||
   nd } |d||||d|}|d   }|s,|	|d   fz   }	6 |r||fz   }t        |||	      S )Nr,   )r#   r!  r#  r@  r   r   )r"   r#   r2   )	enumerater  r   )rA   r#   r!  r#  r@  r   r   r/  r   all_self_attentionsr   layer_modulelayer_head_masklayer_outputss                 r.   r   zAlignTextEncoder.forward  s     #7BD$5b4(4OA|#$58H$H!.7.CilO( +-)"3	
 M *!,M &9]1=M<O&O#!  5$   1]4D D++*
 	
r-   )NNFFT)r$   r%   r&   rs   r   r(   r   r   r)   r   r   r+   r   r   r   r   s   @r.   r  r    s    ,  7;15,1/4&*&
||&
 !!2!23&
 E--.	&

 $D>&
 'tn&
 d^&
 
uU\\"O3	4&
 &
r-   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )rr   rs   r   r=  r  rQ  Tanhr}   r~   s     r.   rs   zAlignTextPooler.__init__H  s9    YYv1163E3EF
'')r-   r#   r:   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )rQ  r}   )rA   r#   first_token_tensorpooled_outputs       r.   r   zAlignTextPooler.forwardM  s6     +1a40

#566r-   rW  r   s   @r.   r  r  G  s#    $
U\\ ell r-   r  c                   @    e Zd ZU eed<   dZdZdej                  fdZ	y)AlignPreTrainedModelrU   alignTr  c                 r   | j                   j                  }t        |t        j                  t        j
                  f      rZ|j                  j                  j                  d|       |j                  O|j                  j                  j                          n)t        |t              rt        j                  j                  |j                  j                         |j                  j                  j                  j                          |j                  j                  j!                  | j                   j"                         n~t        |t        j$                        rd|j                  j                  j                  d|       |j&                  1|j                  j                  |j&                     j                          t        |t        j(                  t        j*                  f      rJ|j                  j                  j                          |j                  j                  j!                  d       yy)zInitialize the weightsrC  )meanstdNg      ?)rU   initializer_rangerc   r   r=  rv   weightdatanormal_ro   zero_
AlignModelinitxavier_uniform_text_projectiontemperaturefill_temperature_init_valuer  r   r  rx   )rA   r  r  s      r.   _init_weightsz"AlignPreTrainedModel._init_weights\  sq   kk++fryy"))45MM&&CS&9{{&  &&(
+GG##F$:$:$A$AB""'',,224##))$++*L*LM-MM&&CS&9!!-""6#5#56<<>fr||R^^<=KK""$MM$$S) >r-   N)
r$   r%   r&   r   r*   base_model_prefixsupports_gradient_checkpointingr   Moduler  r,   r-   r.   r  r  V  s$    &*#*BII *r-   r  zJ
    The text model from ALIGN without any head or projection on top.
    c                   X    e Zd ZU eed<   dgZddedef fdZd Zd Z	e
e	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     deej                     deej                     dee   dee   dee   deeef   fd              Z xZS )AlignTextModelrU   r   add_pooling_layerc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rr   rs   rU   r   r   r  encoderr  pooler	post_init)rA   rU   r  r   s      r.   rs   zAlignTextModel.__init__y  sM    
 	 -f5'/1Bof- 	r-   c                 .    | j                   j                  S r   r   r  rD   s    r.   get_input_embeddingsz#AlignTextModel.get_input_embeddings  s    ...r-   c                 &    || j                   _        y r   r  )rA   r   s     r.   set_input_embeddingsz#AlignTextModel.set_input_embeddings  s    */'r-   r  r!  r  r   r#  r  r@  r   r   r:   c
           	         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }n!||j                         dd }nt	        d      |\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  ddd|f   }|j                  ||      }|}n&t        j                  |t        j                  |      }| j!                  ||      }| j#                  || j                   j$                        }| j                  ||||      } | j&                  |f||||d	d
|
}|d   }| j(                  | j)                  |      nd}t+        |||j,                  |j.                        S )a-  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsrG   r  r  )r  r   r  r  T)r!  r#  r@  r   r   r   )r"   pooler_outputr#   r2   )rU   r@  r   use_return_dictr:  %warn_if_padding_and_no_attention_maskr  rH   r(   onesr  r   r  r   r   r  get_extended_attention_maskget_head_maskr  r  r  r   r#   r2   )rA   r  r!  r  r   r#  r  r@  r   r   r/  r  
batch_sizer  rH   r  r  extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                         r.   r   zAlignTextModel.forward  s%   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m &&y$++2O2OP	??%)'	 + 
 '$,,
2/!5
 
 *!,8<8OO4UY)-')77&11	
 	
r-   T)	NNNNNNNNN)r$   r%   r&   r   r*   _no_split_modulesr   rs   r  r  r   r   r   r(   r   r)   r   r+   r   r   r   r   s   @r.   r  r  p  s'    ./ 4  /0  -11515/31504,0/3&*\
ELL)\
 !.\
 !.	\

 u||,\
 E--.\
  -\
 $D>\
 'tn\
 d^\
 
u00	1\
  \
r-   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
ee	 	 	 d
deej                     dee   dee   deeef   fd	              Z xZS )AlignVisionModelrU   r   Fc                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dk(  r't        j                  |j                  d      | _        nN|j                  dk(  r't        j                  |j                  d      | _        nt        d|j                         | j                          y )Nr  T)	ceil_moder[   z2config.pooling must be one of ['mean', 'max'] got )rr   rs   rU   rg   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2dr:  poolingr  r~   s     r.   rs   zAlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r-   r:   c                 B    | j                   j                  j                  S r   )vision_modelr   rw   rD   s    r.   r  z%AlignVisionModel.get_input_embeddings  s      ++777r-   r   r   c                 f   ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  ||d      }|d   }| j                  |      }|j                  |j                  dd       }t        |||j                        S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nz You have to specify pixel_valuesT)r   r   r   rX   )r"   r  r#   )rU   r   r  r:  r   r  r  rE  r)  r   r#   )rA   r   r   r   r  r  r"   r  s           r.   r   zAlignVisionModel.forward  s    : %9$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,!5 ' 
 ,A.$56%--m.A.A"1.EF7/')77
 	
r-   )NNN)r$   r%   r&   r   r*   main_input_namer  rs   r   r  r  r   r   r   r(   r)   r   r   r+   r   r   r   r   s   @r.   r  r    s     $O&+#0 "8bii 8  59/3&*	2
u0012
 'tn2
 d^	2

 
u>>	?2
  2
r-   r  c                       e Zd ZU eed<   def fdZ e       e	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     d	e	j                  fd
              Z e       ede	j                  d	e	j                  fd              Zee	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee	j                     dee   dee   dee   dee   d	eeef   fd              Z xZS )r  rU   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        t        |      | _        t        |      | _        t!        j"                  | j                  | j                        | _        t!        j&                  t)        j*                  | j,                  j.                              | _        | j3                          y )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )rr   rs   rc   text_configr   	TypeErrortypevision_configr   projection_dimr  text_embed_dimr  
text_modelr  r  r   r=  r  	Parameterr(   tensorrU   r  r  r  )rA   rU   r  r  r   s       r.   rs   zAlignModel.__init__I  s#    &,,o>++,-Q0 
 &..0AB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r-   r  r!  r  r   r#  r  r:   c                 t    | j                  ||||||      }|d   dddddf   }| j                  |      }	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AlignTextModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```)r  r!  r  r   r#  r  r   N)r  r  )
rA   r  r!  r  r   r#  r  text_outputsr"   text_featuress
             r.   get_text_featureszAlignModel.get_text_featuresg  sW    : ))%' ' 
 )OAq!G4,,->?r-   r   c                 B    | j                  |      }|j                  }|S )a]  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AlignVisionModel`].

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```)r   )r  r  )rA   r   vision_outputsimage_featuress       r.   get_image_featureszAlignModel.get_image_features  s(    2 ***E'55r-   return_lossr@  r   r   c                 d   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }| j	                  ||
d      }| j                  |||||||	|
d	      }|d   }|d   dddddf   }| j                  |      }||j                  ddd	      z  }||j                  ddd	      z  }t        j                  ||j                               | j                  z  }|j                         }d}|rt        |      }t        |||||||
      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AlignModel
        >>> from transformers.image_utils import load_image

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NT)r   r   r   )	r  r!  r  r   r#  r  r@  r   r   r   r   rX   r   )r   r   keepdim)r5   r6   r7   r1   r!   r8   r9   )rU   r@  r   r  r  r  r  normr(   r'  rQ   r  rT   r4   )rA   r  r   r!  r  r   r#  r  r  r@  r   r   r  r  r!   r1   r7   r6   r5   s                      r.   r   zAlignModel.forward  sr   V 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%!5 + 
 ))%'/!5 ' 

 &a("1oaAg.**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  ,,{LNN4DEHXHXX*,,.o.D-+#%* .
 	
r-   )NNNNNN)NNNNNNNNNNN)r$   r%   r&   r   r*   rs   r   r   r   r(   r   r)   r  r  r   r  r   r   r+   r4   r   r   r   s   @r.   r  r  E  s"   { < %& -11515/3,004&ELL)& !.& !.	&
 u||,& ELL)&  -& 
		&  '&P %&u/@/@ UEVEV   '6  15481515/3,004&*,0/3&*Y
E,,-Y
 u001Y
 !.	Y

 !.Y
 u||,Y
 ELL)Y
  -Y
 d^Y
 $D>Y
 'tnY
 d^Y
 
uk!	"Y
  Y
r-   r  )r  r  r  r  r  )rC  N)Mr'   r   dataclassesr   typingr   r   r   r   r(   r   activationsr
   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_alignr   r   r   
get_loggerr$   loggerr    r0   r4   r   rN   rT   r\   r_   r+   r   re   r  rg   rv   r   r   r   r   r   r   r   r   r   r3  r5  rN  rZ  ri  rp  rt  r  r  r  r  r  r  __all__r,   r-   r.   <module>r     so     ! 1 1   ! 9  G l l l l P P 
		H	% 
=[ = = 
	:; 	: 	:  
+  
   
JuU\\ uell u-5<< -ELL -+ 3  @U3:. @ @*BII 4
 
6		 6$		 $P$BII $N BNryy NbG
 G
T<")) <L (,%II%<<% 
% <<	%
 U\\*% % % %%67RYY 7v")) * *\BII  bii %/ %P.
ryy .
dbii  *? * *2 
x
) x

x
v 
M
+ M

M
` C
% C
 C
L Wr-   