
    <hK                       S r SSKrSSKJr  SSKJr  SSKJrJr  SSK	r	SSK
r	SSK	Jr  SSKJr  SS	KJrJr  SS
KJrJr  SSKJr  SSKJrJrJrJrJr  SSKJrJr  SSKJ r J!r!  SSK"J#r#J$r$J%r%  SSK&J'r'J(r(J)r)  \$RT                  " \+5      r,Sr-\\#" SS9 " S S\5      5       5       r.\\#" SS9 " S S\5      5       5       r/ " S S\R`                  5      r1 " S S\R`                  5      r2 " S S\R`                  5      r3 " S  S!\R`                  5      r4 " S" S#\R`                  5      r5 " S$ S%\R`                  5      r6 " S& S'\R`                  5      r7 " S( S)\R`                  5      r8 " S* S+\R`                  5      r9 " S, S-\R`                  5      r:S.\:0r; " S/ S0\R`                  5      r< " S1 S2\R`                  5      r= " S3 S4\5      r> " S5 S6\R`                  5      r? " S7 S8\R`                  5      r@SVS9 jrA\# " S: S;\5      5       rB " S< S=\B5      rC\#" S>S9 " S? S@\B5      5       rD\#" SAS9 " SB SC\B5      5       rE " SD SE\R`                  5      rF " SF SG\R`                  5      rG " SH SI\R`                  5      rH\#" SJS9 " SK SL\B5      5       rI\#" SMS9 " SN SO\B5      5       rJ " SP SQ\R`                  5      rK\#" SRS9 " SS ST\B5      5       rL/ SUQrMg)WzPyTorch BridgeTower Model    N)OrderedDict)	dataclass)OptionalUnion)nn)CrossEntropyLoss   )ACT2FNQuickGELUActivation)CacheEncoderDecoderCache)GradientCheckpointingLayer))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputModelOutputSequenceClassifierOutput)PreTrainedModelapply_chunking_to_forward) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )BridgeTowerConfigBridgeTowerTextConfigBridgeTowerVisionConfigRobertaTokenizerz.
    Output type of [`BridgeTowerModel`].
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
BridgeTowerModelOutput0   a  
text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
    Sequence of hidden-states at the text output of the last layer of the model.
image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
    Sequence of hidden-states at the image output of the last layer of the model.
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
    Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
    token), respectively, after further processing through layers used for auxiliary pretraining tasks.
Ntext_featuresimage_featurespooler_outputhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r$   r   torchFloatTensor__annotations__r%   r&   r'   tupler(   __static_attributes__r)       l/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/bridgetower/modeling_bridgetower.pyr"   r"   0   s|     26M8E--.526NHU../615M8E--.58<M8E%"3"345<59Ju00129r4   r"   z>
    Output type of ['BridgeTowerForContrastiveLearning']
    """
)
class BridgeTowerContrastiveOutput(ModelOutput):
    r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
    Image-text contrastive loss.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The text embeddings obtained by applying the projection layer to the pooler_output.
image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The image embeddings obtained by applying the projection layer to the pooler_output.
cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
    The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
Nlosslogitstext_embedsimage_embedscross_embedsr'   r(   r)   )r*   r+   r,   r-   r.   r9   r   r/   r0   r1   r:   r;   r2   r<   r=   r'   r(   r3   r)   r4   r5   r7   r7   H   s      )-D(5$$
%,*.FHU&&'.6:K% 1 123:7;L(5!2!234;7;L(5!2!234;8<M8E%"3"345<59Ju00129r4   r7   c                      ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSS\R                  S\\R                     4S jjr	Sr
U =r$ )	BridgeTowerResidualAttentionh   c                 h  > [         TU ]  5         [        R                  " UR                  UR                  S-  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " [        S[        R                  " UR                  UR                  S-  5      4S[        5       4S[        R                  " UR                  S-  UR                  5      4/5      5      U l        [        R                  " UR                  UR                  S9U l        S U l        g )N@   epsc_fc   geluc_proj)super__init__r   MultiheadAttentionhidden_sizeattn	LayerNormlayer_norm_epsln_1
ModuleDictr   Linearr   mlpln_2	attn_maskselfconfig	__class__s     r5   rJ   %BridgeTowerResidualAttention.__init__i   s    ))&*<*<f>P>PTV>VW	LL!3!39N9NO	==RYYv'9'96;M;MPQ;QRS023ryy););a)?ASASTU
 LL!3!39N9NO	r4   hidden_stateattention_maskc           	         Ub(  UR                  [        R                  UR                  S9nU R                  b.  U R                  R                  UR
                  UR                  S9OS U l        U R                  UUUSU R                  US9S   $ )NdtypedeviceF)need_weightsrU   key_padding_maskr   )tor/   boolr`   rU   r_   rM   )rW   r[   r\   s      r5   	attention&BridgeTowerResidualAttention.attentionz   s    %+..UZZH[H[.\N ~~) NNL$6$6|?R?RS 	
 yynn+  
  	r4   c                     XR                  U R                  U5      U5      -   nU R                  U5      nU R                  R	                  5        H  nU" U5      nM     X1-   nU$ N)re   rP   rT   rS   values)rW   r[   r\   residual_statelayers        r5   forward$BridgeTowerResidualAttention.forward   sZ    %tyy7NP^(__yy0XX__&E .L '%4r4   )rM   rU   rP   rT   rS   rh   )r*   r+   r,   r-   rJ   r/   Tensorre   r   rl   r3   __classcell__rY   s   @r5   r?   r?   h   sH    "ell ELL "ELL (5<<BX  r4   r?   c                   l   ^  \ rS rSrU 4S jrSS\R                  S\\R                     4S jjrSr	U =r
$ )BridgeTowerTransformer   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  (       aK  [
        R                  " [        U R                  S-
  5       Vs/ sH  n[        U5      PM     sn5      U l	        OG[
        R                  " [        U R                  5       Vs/ sH  n[        U5      PM     sn5      U l	        UR                  U l
        g s  snf s  snf )Nr   )rI   rJ   rL   num_hidden_layersremove_last_layerr   
ModuleListranger?   	resblocksstop_gradientrW   rX   _rY   s      r5   rJ   BridgeTowerTransformer.__init__   s    !--!'!9!9##]]?DTE[E[^_E_?`a?`!-f5?`aDN  ]]?DTE[E[?\]?\!-f5?\]DN $11 b ^s   -C'5C,r[   r\   c                     / nU R                    HN  nU" X5      nU R                  (       a!  UR                  UR                  5       5        M=  UR                  U5        MP     U$ rh   )ry   rz   appenddetach)rW   r[   r\   r'   blocks        r5   rl   BridgeTowerTransformer.forward   sU    ^^E >L!!$$\%8%8%:;$$\2 $ r4   )rL   ru   ry   rz   rh   )r*   r+   r,   r-   rJ   r/   rn   r   rl   r3   ro   rp   s   @r5   rr   rr      s-    2ELL (5<<BX  r4   rr   c                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S\R                  4S
 jjrSrU =r$ )BridgeTowerVisionEmbeddings   rX   c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " U R                  5      5      U l        [        R                  " UR                  U R                  U R                  U R                  SS9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R"                  " U R                   U R                  5      U l        U R'                  S[        R(                  " U R                   5      R+                  S5      SS9  g )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)rI   rJ   rX   rL   	embed_dim
image_size
patch_sizer   	Parameterr/   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandrV   s     r5   rJ   $BridgeTowerVisionEmbeddings.__init__   s   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr4   
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                  R                  S5      nUR                   S   S-
  n[        R
                  R                  5       (       d%  XF:X  a   X#:X  a  U R                  U R                  5      $ USS2SS24   nUSS2SS24   nUR                   S   n	X R                  -  n
X0R                  -  n[        US-  5      nUR                  SXU	5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU	5      n[        R                   " Xx4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   r   Nr         ?r	   r   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer/   jit
is_tracingr   r   r   reshapepermuter   
functionalinterpolateviewcat)rW   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r5   interpolate_pos_encoding4BridgeTowerVisionEmbeddings.interpolate_pos_encoding   si    !&&q)A-!44;;EEaH*003a7 yy##%%+*F6?**4+<+<==,QU3,QU3r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr4   pixel_valuesc                 ^   UR                   u  p4pVU(       dJ  XPR                  :w  d  X`R                  :w  a,  [        SU SU SU R                   SU R                   S3	5      eU R                  R                  R
                  nU R                  UR                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      n	[        R                  " X/SS	9n
U(       a  XR                  XU5      -   n
U
$ XR                  U R                  5      -   n
U
$ )
NzInput image size (*z) doesn't match model (z).r_   r   r   r   r   )r   r   
ValueErrorr   r   r_   rc   flatten	transposer   r   r/   r   r   r   r   )rW   r   r   
batch_sizer|   r   r   target_dtypepatch_embedsclass_embedsr   s              r5   rl   #BridgeTowerVisionEmbeddings.forward   s$   '3'9'9$
v'V-F%SbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYY;C
##&C&CJX]&^^J  $&=&=d>O>O&PPJr4   )	r   rX   r   r   r   r   r   r   r   F)r*   r+   r,   r-   r   rJ   r/   rn   intr   r0   rl   r3   ro   rp   s   @r5   r   r      sj    q6 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf  r4   r   c                      ^  \ rS rSrU 4S jr S
S\R                  S\4S jjr S
S\R                  S\4S jjr	S\R                  4S jr
S	rU =r$ )BridgeTowerVisionTransformeri  c           
      4  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  UR                  S9U l        [        U5      U l
        [        R
                  " UR                  UR                  S9U l        UR                  U l        UR                  (       df  [        R                  " [        UR                  5       Vs/ sH,  n[        R
                  " UR                  UR                  S9PM.     sn5      U l        g g s  snf NrC   )rI   rJ   r   r   r   rN   rL   rO   ln_prerr   transformerln_postshare_layernormrw   rx   ru   ln_separater{   s      r5   rJ   %BridgeTowerVisionTransformer.__init__  s    5f=ll6#5#56;P;PQ1&9||F$6$6F<Q<QR%55%%!}}V[\b\t\tVuvVuQRf00f6K6KLVuv D &vs   2Dr   r   c                    U R                  X5      nU R                  U5      nUR                  SSS5      nU R                  XB5      n[        R
                  " USS9nUR                  SSSS5      nU R                  (       a  U R                  U5      nU$ / n[        X@R                  5       H  u  pFU" U5      nUR                  U5        M      [        R
                  " USS9nU$ )Nr   r   r   r   r	   )r   r   r   r   r/   stackr   r   zipr   r   )rW   r   r\   r   r'   hidden_states_stacklns          r5   rl   $BridgeTowerVisionTransformer.forward  s     OM2%--aA6((GMq9%--aAq9 LL7M  #%%(8H8H%I! "= 1#**=9 &J "KK(;CMr4   c                 l    U R                  XS9nU R                  U5      nUR                  SSS5      nU$ )Nr   r   r   r   )r   r   r   )rW   r   r   r'   s       r5   forward_pre(BridgeTowerVisionTransformer.forward_pre+  s<    
 hM2%--aA6r4   r[   c                 N    UR                  SSS5      nU R                  U5      nU$ )Nr   r   r   )r   r   )rW   r[   visual_output_posts      r5   forward_post)BridgeTowerVisionTransformer.forward_post6  s-    )11!Q:!\\*<=!!r4   )r   r   r   r   r   r   r   )r*   r+   r,   r-   rJ   r/   rn   rd   rl   r   r   r3   ro   rp   s   @r5   r   r     s]    " */	ll #'	< */	ll	 #'	" " "r4   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerLinkToweri<  c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  S;   a  UR                  S:X  a0  [        R
                  " [        R                  " S5      5      U l        O?UR                  S:X  a/  [        R
                  " [        R                  " S5      5      U l	        [        R                  " U R                  UR                  S9U l
        g [        SUR                   S35      e)	N)add
scaled_addr   r         ?r   r   rC   link_tower_type  is not implemented)rI   rJ   link_tower_typerL   r   r   r/   tensorscaled_factorbetarN   rO   NotImplementedErrorrV   s     r5   rJ   BridgeTowerLinkTower.__init__=  s    %55!--!!%II%%5%'\\%,,s2C%D"''=8LLc):;	\\$*:*:@U@UVDN%(89O9O8PPc&deer4   c                 Z   U R                   S:X  a  U R                  X-   5      $ U R                   S:X  a   U R                  XR                  -  U-   5      $ U R                   S:X  a0  U R                  USU R                  -
  -  X R                  -  -   5      $ [	        SU R                    S35      e)Nr   r   r   r   r   r   )r   rN   r   r   r   )rW   r'   cross_modal_hidden_statesr\   s       r5   rl   BridgeTowerLinkTower.forwardJ  s    5(>>-"KLL!!\1>>-2D2D"DG`"`aa!!]2>>-1tyy="AD]`i`iDi"ijj%(89M9M8NNa&bccr4   )rN   r   rL   r   r   r*   r+   r,   r-   rJ   rl   r3   ro   rp   s   @r5   r   r   <  s    fd dr4   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )BridgeTowerSelfOutputiV  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
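# A brief restatement of the three fusion rules implemented by BridgeTowerLinkTower above
# (comment only; the numeric value is made up for illustration and is not taken from a checkpoint):
#   "add":         LayerNorm(hidden_states + cross_modal_hidden_states)
#   "scaled_add":  LayerNorm(hidden_states * scaled_factor + cross_modal_hidden_states)
#   "interpolate": LayerNorm(hidden_states * (1 - beta) + cross_modal_hidden_states * beta)
# With the initial beta = 0.5 the "interpolate" tower is a plain average of the uni-modal and
# cross-modal states; a learned beta of, e.g., 0.9 would weight the cross-modal branch nine times
# as strongly as the uni-modal one.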
class BridgeTowerSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BridgeTowerIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class BridgeTowerOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class BridgeTowerPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output
class BridgeTowerSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
                f"heads ({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)

        self.is_decoder = config.is_decoder
        self.layer_idx = layer_idx

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        batch_size, seq_length, _ = hidden_states.shape
        query_layer = self.query(hidden_states)
        query_layer = query_layer.view(
            batch_size, -1, self.num_attention_heads, self.attention_head_size
        ).transpose(1, 2)

        is_cross_attention = encoder_hidden_states is not None

        if past_key_value is not None:
            if isinstance(past_key_value, EncoderDecoderCache):
                is_updated = past_key_value.is_updated.get(self.layer_idx)
                if is_cross_attention:
                    curr_past_key_value = past_key_value.cross_attention_cache
                else:
                    curr_past_key_value = past_key_value.self_attention_cache
            else:
                curr_past_key_value = past_key_value

        current_states = encoder_hidden_states if is_cross_attention else hidden_states
        if is_cross_attention and past_key_value is not None and is_updated:
            # reuse key/value layers from the cache
            key_layer = curr_past_key_value.layers[self.layer_idx].keys
            value_layer = curr_past_key_value.layers[self.layer_idx].values
        else:
            key_layer = self.key(current_states)
            key_layer = key_layer.view(
                batch_size, -1, self.num_attention_heads, self.attention_head_size
            ).transpose(1, 2)
            value_layer = self.value(current_states)
            value_layer = value_layer.view(
                batch_size, -1, self.num_attention_heads, self.attention_head_size
            ).transpose(1, 2)

            if past_key_value is not None:
                cache_position = cache_position if not is_cross_attention else None
                key_layer, value_layer = curr_past_key_value.update(
                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
                )
                # mark the cross-attention cache of this layer as filled so it can be re-used
                if is_cross_attention:
                    past_key_value.is_updated[self.layer_idx] = True

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
            if past_key_value is not None:
                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            else:
                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r

            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model's forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        return context_layer, attention_probs


BRIDGE_TOWER_SELF_ATTENTION_CLASSES = {
    "eager": BridgeTowerSelfAttention,
}


class BridgeTowerAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.self = BRIDGE_TOWER_SELF_ATTENTION_CLASSES[config._attn_implementation](
            config,
            position_embedding_type=position_embedding_type,
            layer_idx=layer_idx,
        )
        self.output = BridgeTowerSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        past_key_value: Optional[Cache] = None,
        output_attentions: Optional[bool] = False,
        cache_position: Optional[torch.Tensor] = None,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            cache_position=cache_position,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs
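# A brief note on the "relative_key" scoring used by BridgeTowerSelfAttention above (a restatement
# of the code for readability, not an additional mechanism): for a query at position l and a key at
# position r, the learned embedding for the clipped offset (l - r + max_position_embeddings - 1) is
# dotted with the query vector via einsum("bhld,lrd->bhlr") and added to the raw q.k scores; the
# "relative_key_query" variant adds a second, analogous term computed against the key vectors. Both
# happen before the 1/sqrt(head_size) scaling and the softmax.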
        [        U5      U l        g )Nr   r<  )rI   rJ   chunk_size_feed_forwardseq_len_dimrm  re   r;  add_cross_attentioncrossattentionr  intermediater  rr  rW   rX   r<  rY   s      r5   rJ   "BridgeTowerBertCrossLayer.__init__S  sq    '-'E'E$-fJ ++#)#=#= 26O3F;'/r4   c	           
          U R                  UUS US S9n	U	S   n
U	SS  nU R                  U
UUUUUUS9nUS   n
XSS  -   n[        U R                  U R                  U R
                  U
5      nU4U-   nU$ )N)r\   r?  rB  rA  r   r   r~  )re   r  r   feed_forward_chunkr  r  )rW   r'   r@  r\   r?  encoder_attention_maskrA  rB  rC  self_attention_outputsr  r  cross_attention_outputslayer_outputs                 r5   rl   !BridgeTowerBertCrossLayer.forward^  s     "&)/ "0 "
 2!4 ),"&"5"51"7)/) #6 #
 315AB770##T%A%A4CSCSUe
  /G+r4   c                 J    U R                  U5      nU R                  X!5      nU$ rh   r  rr  rW   r  intermediate_outputr  s       r5   r  ,BridgeTowerBertCrossLayer.feed_forward_chunk  )    "//0@A{{#6Ir4   r  re   r  r  r  r;  rr  r  rh   rj  )	r*   r+   r,   r-   rJ   rl   r  r3   ro   rp   s   @r5   r  r  R  s,    	0 #+Z r4   r  c                   .  ^  \ rS rSrSU 4S jjr       SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	   S	\\
   S
\\R                     S\\R                     4S jjrS rSrU =r$ )BridgeTowerTextLayeri  c                 r  > [         TU ]  5         UR                  U l        SU l        [	        XS9U l        UR                  U l        UR                  U l        U R                  (       a/  U R                  (       d  [        U  S35      e[	        USUS9U l	        [        U5      U l        [        U5      U l        g )Nr   r  z> should be used as a decoder model if cross attention is addedr-  ro  )rI   rJ   r  r  rm  re   r;  r  r   r  r  r  r  rr  r  s      r5   rJ   BridgeTowerTextLayer.__init__  s    '-'E'E$-fJ ++#)#=#= ##?? D6)g!hii"6vWamv"wD3F;'/r4   r'   r\   r?  r@  r  rA  rB  rC  r   c	           
      z   U R                  UUUUUUS9n	U	S   n
U R                  (       a  U	SS nOU	SS  nU R                  (       aD  UbA  [        U S5      (       d  [        SU  S35      eU R	                  U
UUUUUUS9nUS   n
XSS -   n[        U R                  U R                  U R                  U
5      nU4U-   $ )	N)r\   r?  rB  rA  rC  r   r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r~  )	re   r;  r1  r   r  r   r  r  r  )rW   r'   r\   r?  r@  r  rA  rB  rC  r  r  r  r  r  s                 r5   rl   BridgeTowerTextLayer.forward  s    "&)/)) "0 "
 2!4 ??,Qr2G,QR0G??4@4!122 =dV DD D 
 '+&9&9 5#&;-"3- ': '#  7q9" ==G0##T%A%A4CSCSUe
 ((r4   c                 J    U R                  U5      nU R                  X!5      nU$ rh   r  r  s       r5   r  'BridgeTowerTextLayer.feed_forward_chunk  r  r4   r  rh   )NNNNNFN)r*   r+   r,   r-   rJ   r/   rn   r   r0   r   rd   r2   rl   r  r3   ro   rp   s   @r5   r  r    s    0" 7;15=A>B*.,1152)||2) !!2!232) E--.	2)
  ((9(9:2) !)):): ;2) !2) $D>2) !.2) 
u||	2)h r4   r  c                   v  ^  \ rS rSrSU 4S jjr          SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	\	\R                           S	\\
   S
\\
   S\\
   S\\
   S\\R                     S\\	\R                     \4   4S jjrSrU =r$ )BridgeTowerTextEncoderi  c           
         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        XS9PM     sn5      U l        SU l	        g s  snf )Nr  F)
rI   rJ   rX   r   rw   rx   ru   r  rk   gradient_checkpointing)rW   rX   r<  irY   s       r5   rJ   BridgeTowerTextEncoder.__init__  sX    ]]@EfF^F^@_`@_1!&6@_`

 ',# as   A#r'   r\   r?  r@  r  past_key_values	use_cacherB  output_hidden_statesreturn_dictrC  r   c                 `   U	(       a  SOS nU(       a  SOS nU(       a  U R                   R                  (       a  SOS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnSnU(       a]  U R                   R                  (       aB  [        U[        5      (       d-  [        R                  S5        Sn[        R                  " U5      n[        U R                  5       Hf  u  nnU	(       a  X4-   nUb  UU   OS nU" UUUUUUUUS9nUS   nU(       d  M7  UUS   4-   nU R                   R                  (       d  M]  UUS	   4-   nMh     U	(       a  X4-   nU(       a  UR                  5       nU
(       d  [        S
 UUUUU4 5       5      $ [        UUUUUS9$ )Nr)   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.T)r  rA  rB  rC  r   r   r   c              3   .   #    U H  nUc  M  Uv   M     g 7frh   r)   .0vs     r5   	<genexpr>1BridgeTowerTextEncoder.forward.<locals>.<genexpr>%  s"      
A     	)last_hidden_stater  r'   r(   cross_attentions)rX   r  r  trainingloggerwarning_oncer;  r  r   r   from_legacy_cache	enumeraterk   to_legacy_cacher2   r   )rW   r'   r\   r?  r@  r  r  r  rB  r  r  rC  all_hidden_statesall_self_attentionsall_cross_attentionsreturn_legacy_cacher  layer_modulelayer_head_masklayer_outputss                       r5   rl   BridgeTowerTextEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#//
?TY8Z8Z\
 #'1CCOTO(4OA|#$58H$H!.7.CilO(%'=."3-	M *!,M  &9]1=M<O&O#;;222+?=QRCSBU+U(+  5.   14D D-==?O 
 "#%'(
 
 
 9+++*1
 	
r4   )rX   r  rk   rh   )
NNNNNNFFTN)r*   r+   r,   r-   rJ   r/   rn   r   r0   r2   rd   r   r   rl   r3   ro   rp   s   @r5   r  r    s(   , 7;15=A>BEI$(,1/4&*15R
||R
 !!2!23R
 E--.	R

  ((9(9:R
 !)):): ;R
 "%e.?.?(@"ABR
 D>R
 $D>R
 'tnR
 d^R
 !.R
 
uU\\"$MM	NR
 R
r4   r  c                   >   ^  \ rS rSrSrU 4S jr SS jrS rSrU =r	$ )BridgeTowerTextEmbeddingsi:  zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        [#        USS5      U l        U R'                  S[(        R*                  " UR                  5      R-                  S5      SS9  U R'                  S	[(        R.                  " U R0                  R3                  5       [(        R4                  S
9SS9  UR                  U l        [        R                  " UR                  UR
                  U R6                  S9U l	        g )N)padding_idxrC   r,  r-  r   r   Fr   token_type_idsr   )rI   rJ   r   r   
vocab_sizerL   pad_token_idword_embeddingsr9  position_embeddingstype_vocab_sizetoken_type_embeddingsrN   rO   r   r   r   r8  r,  r   r/   r   r   zerosr   r   rN  r  rV   s     r5   rJ   "BridgeTowerTextEmbeddings.__init__@  si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
 r4   c                    Uc+  Ub  [        XR                  U5      nOU R                  U5      nUb  UR                  5       nOUR                  5       S S nUS   nUcv  [	        U S5      (       a-  U R
                  S S 2S U24   nUR                  US   U5      n	U	nO8[        R                  " U[        R                  U R                  R                  S9nUc  U R                  U5      nU R                  U5      n
XJ-   nU R                  S:X  a  U R                  U5      nX-  nU R!                  U5      nU R#                  U5      nU$ )Nr   r   r  r   r^   r-  )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   r1  r  r   r/   r  rN  r   r`   r  r  r,  r  rN   r   )rW   	input_idsr  r   inputs_embedspast_key_values_lengthinput_shaperT  buffered_token_type_ids buffered_token_type_ids_expandedr  r   r  s                r5   rl   !BridgeTowerTextEmbeddings.forwardY  sM    $A)M]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-..*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J":
'':5"&":":<"H-J^^J/
\\*-
r4   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr   r   r^   r   )r   r/   r   r  rN  r`   r   r   )rW   r  r  sequence_lengthr   s        r5   r  @BridgeTowerTextEmbeddings.create_position_ids_from_inputs_embeds  s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r4   )rN   r   r  r,  r  r  r  )NNNNr   )
r*   r+   r,   r-   r.   rJ   rl   r  r3   ro   rp   s   @r5   r  r  :  s$    

4 rs&P= =r4   r  c                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r   r   )ner   r/   cumsumtype_asrN  )r  r  r  maskincremental_indicess        r5   r  r    sW     <<$((*D <<!4<<TBE[[_cc##%33r4   c                   V    \ rS rSr% \\S'   SrSrSS/rSr	S\
R                  4S	 jrS
rg)BridgeTowerPreTrainedModeli  rX   bridgetowerFr(  r?   r  modulec                 z   U R                   R                  n[        U[        5      (       Ga  U R                   R                  S-  SU R                   R
                  -  S-  -  nU R                   R                  S-  nSU R                   R                  -  S-  nUR                  R                   GH   n[        R                  R                  UR                  R                  XB-  S9  UR                  R                  R                  R                  5         [        R                  R                  UR                  R                   R"                  X2-  S9  [        R                  R                  UR$                  R&                  R"                  XR-  S9  [        R                  R                  UR$                  R(                  R"                  X2-  S9  GM#     [        R                  R                  UR*                  R,                  XB-  S9  [        R                  R                  UR*                  R.                  R"                  XB-  S9  GO[        U[        R0                  [        R2                  [        R4                  45      (       a(  UR"                  R                  R                  SSU-  S9  O[        U[        R6                  5      (       aJ  UR8                  R                  R                  5         UR"                  R                  R;                  S5        ON[        U[<        5      (       a9  UR>                  R                  R;                  U R                   R@                  5        [        U[        R0                  [B        45      (       a3  UR8                  b%  UR8                  R                  R                  5         g g g )Ng      r   )stdg        g?)meanr  r   )"rX   initializer_factorr  r   rL   ru   r   ry   r   initnormal_rM   in_proj_weightin_proj_biasdatazero_out_projr   rS   rE   rH   r   r   r   rR   r   r   rN   r   fill_!BridgeTowerForContrastiveLearninglogit_scalelogit_scale_init_valueBridgeTowerMLMHead)rW   r  r  proj_stdattn_stdfc_stdr   s          r5   _init_weights(BridgeTowerPreTrainedModel._init_weights  sz   kk,,f:;;//51t{{?\?\;\ae:efH{{..4H$++111d:F++55

 9 9x~N

'',,224

 3 3 : :O		 5 56<H		 0 0 7 7X^L 6 GGOOF--==8>ORGGOOF--@@GGX^O\BIIr|| DEEMM&&CTCZ&@--KK""$MM$$S) ABB##))$++*L*LMfryy*<=>>6;;CZKK""$ D[>r4   r)   N)r*   r+   r,   r-   r   r1   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placementr   Moduler  r3   r)   r4   r5   r  r    s6    %&+#35ST"3%BII %r4   r  c                   N   ^  \ rS rSr% \\S'   U 4S jr\S 5       rSS jr	Sr
U =r$ )BridgeTowerVisionModeli  rX   c                 D   > [         TU ]  U5        [        U5      U l        g rh   )rI   rJ   r   visualrV   s     r5   rJ   BridgeTowerVisionModel.__init__  s     26:r4   c                 j    U R                   R                  R                  R                  R                  $ rh   )r  r   r   r   r_   rW   s    r5   r_   BridgeTowerVisionModel.dtype  s$    {{%%55<<BBBr4   c                 X    U R                  UR                  U R                  5      X#5      $ rh   )r  typer_   )rW   image
image_maskr   s       r5   rl   BridgeTowerVisionModel.forward  s     {{5::djj1:XXr4   )r  )NF)r*   r+   r,   r-   r   r1   rJ   propertyr_   rl   r3   ro   rp   s   @r5   r  r    s0    ##; C CY Yr4   r  a0  
@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    """
)
class BridgeTowerTextModel(BridgeTowerPreTrainedModel):
    config: BridgeTowerTextConfig

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = BridgeTowerTextEmbeddings(config)
        self.encoder = BridgeTowerTextEncoder(config)

        self.pooler = BridgeTowerPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.Tensor] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.config.is_decoder:
            use_cache = use_cache if use_cache is not None else self.config.use_cache
        else:
            use_cache = False

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        # past_key_values_length
        past_key_values_length = 0
        if past_key_values is not None:
            past_key_values_length = (
                past_key_values[0][0].shape[2]
                if not isinstance(past_key_values, Cache)
                else past_key_values.get_seq_length()
            )

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        # Make the self-attention mask broadcastable to all heads.
        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length].
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed: 1.0 in head_mask indicates we keep the head.
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
            past_key_values_length=past_key_values_length,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        if not return_dict:
            return (sequence_output, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            past_key_values=encoder_outputs.past_key_values,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )
@auto_docstring(
    custom_intro="""
    The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on
    top.
    """
)
class BridgeTowerModel(BridgeTowerPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        vision_config = config.vision_config
        text_config = config.text_config

        if config.share_cross_modal_transformer_layers:
            self.cross_modal_text_transform = nn.Linear(text_config.hidden_size, config.hidden_size)
            self.cross_modal_image_transform = nn.Linear(vision_config.hidden_size, config.hidden_size)
        else:
            self.cross_modal_text_transform = nn.ModuleList(
                [nn.Linear(text_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)]
            )
            self.cross_modal_image_transform = nn.ModuleList(
                [nn.Linear(vision_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)]
            )

        self.token_type_embeddings = nn.Embedding(2, config.hidden_size)

        self.vision_model = BridgeTowerVisionModel(vision_config)

        self.text_model = BridgeTowerTextModel(text_config)

        if not vision_config.share_layernorm and config.init_layernorm_from_vision_encoder:
            for ln in self.vision_model.visual.cross_modal_ln_separate:
                ln.weight.data = self.vision_model.visual.ln_post.weight.data
                ln.bias.data = self.vision_model.visual.ln_post.bias.data

        self.cross_modal_image_layers = nn.ModuleList(
            [BridgeTowerBertCrossLayer(text_config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )
        self.cross_modal_text_layers = nn.ModuleList(
            [BridgeTowerBertCrossLayer(text_config, layer_idx=i) for i in range(config.num_hidden_layers)]
        )

        # Class token => Linear => Tanh
        self.cross_modal_image_pooler = BridgeTowerPooler(config)
        self.cross_modal_text_pooler = BridgeTowerPooler(config)

        # Initialize BridgeTower Components
        self.cross_modal_text_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.cross_modal_image_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        if config.share_link_tower_layers:
            self.cross_modal_text_link_tower = BridgeTowerLinkTower(config)
            self.cross_modal_image_link_tower = BridgeTowerLinkTower(config)
        else:
            self.cross_modal_text_link_tower = nn.ModuleList(
                [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)]
            )
            self.cross_modal_image_link_tower = nn.ModuleList(
                [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)]
            )

        self.post_init()

    def get_input_embeddings(self):
        return self.text_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.text_model.set_input_embeddings(value)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        image_embeds: Optional[torch.FloatTensor] = None,
        image_token_type_idx: Optional[int] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Union[tuple[torch.Tensor], BridgeTowerModelOutput]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        all_hidden_states_text = () if output_hidden_states else None
        all_hidden_states_image = () if output_hidden_states else None
        all_hidden_states_cross = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        image_token_type_idx = image_token_type_idx if image_token_type_idx else 1
        input_shape = input_ids.size()
        text_embeds = self.text_model.embeddings(input_ids=input_ids)

        if output_hidden_states:
            all_hidden_states_text += (text_embeds,)

        if attention_mask is None:
            attention_mask = torch.ones(input_shape, dtype=torch.long, device=input_ids.device)
        extend_text_masks = self.text_model.get_extended_attention_mask(attention_mask, input_shape).to(
            input_ids.device
        )

        # The split_index determines how many layers of the uni-modal encoders run before the cross-modal encoder
        split_index = len(self.text_model.encoder.layer) - self.config.num_hidden_layers + 1

        # Run the first 'split_index' layers of the textual encoder
        for layer in self.text_model.encoder.layer[:split_index]:
            text_embeds = layer(text_embeds, extend_text_masks)[0]

            if output_hidden_states:
                all_hidden_states_text += (text_embeds,)

        if image_embeds is None:
            image_embeds = self.vision_model.visual.forward_pre(
                pixel_values.type(self.vision_model.dtype),
                interpolate_pos_encoding=interpolate_pos_encoding,
            )
        else:
            # Permute as BridgeTowerResidualAttention expects (sequence, batch, hidden) inputs
            image_embeds = image_embeds.permute(1, 0, 2)

        if output_hidden_states:
            all_hidden_states_image += (image_embeds,)

        # Run the first 'split_index' layers of the visual encoder
        for block in self.vision_model.visual.transformer.resblocks[:split_index]:
            image_embeds = block(image_embeds)
            if output_hidden_states:
                all_hidden_states_image += (image_embeds,)

        image_embeds_with_ln = self.vision_model.visual.forward_post(image_embeds.type(self.vision_model.dtype))

        # first layer is a special case because we don't have the output from the cross-encoder yet
        cross_modal_text = self.cross_modal_text_transform(text_embeds)

        text_token_type_embeddings = self.token_type_embeddings(
            torch.zeros(1, dtype=torch.long, device=input_ids.device)
        ).expand_as(cross_modal_text)

        cross_modal_text = self.cross_modal_text_layernorm(cross_modal_text + text_token_type_embeddings)

        image_embeds_with_ln = self.cross_modal_image_transform(image_embeds_with_ln)
        image_token_type_embeddings = self.token_type_embeddings(
            torch.full((1,), image_token_type_idx, dtype=torch.long, device=input_ids.device)
        ).expand_as(image_embeds_with_ln)
U	[        R                  UR                  S95      R=                  U5      nUU-   nU RE                  U5      n[        R                  " UR                  S5      UR                  S5      4[        R                  UR                  S9nU R                  R                  XUR                  5       5      R                  UR                  5      nU RF                  S   " UUUUU
S9n U S   n!U RH                  S   " UUUUU
S9n"U"S   n#U(       a  UU!U#44-  nU
(       a  UU S   U"S   44-  nSn$[K        U[        U R                  R                  R                   5      5       GHx  n%U R                  R                  R                   U%   " UU5      S   nU R$                  R&                  R0                  R2                  U%   " U5      R+                  U R$                  R,                  5      nU RA                  U R$                  R&                  R5                  U5      5      U-   nU RL                  U$   n&U RN                  U$   n'U&" U R7                  U5      U-   U!U5      n(U'" UU#U5      n)U RF                  U$S-      " U(U)UUU
S9n U S   n!U RH                  U$S-      " U)U(UUU
S9n"U"S   n#U$S-  n$U(       a  UU4-  nUU4-  nUU!U#44-  nU
(       d  GMj  UU S   U"S   44-  nGM{     U!U#n+n*U RQ                  U*U+5      n,U(       a  UUU4nU(       d  [S        S U*U+U,UU4 5       5      $ [U        U*U+U,UUS9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
image_token_type_idx (`int`, *optional*):
    - The token type id used for the image tokens (defaults to `1` when not passed).
output_hidden_states (`bool`, *optional*):
    If set to `True`, hidden states are returned as `(hidden_states_text, hidden_states_image,
    hidden_states_cross_modal)`, i.e. the hidden states of the text, image, and cross-modal components
    respectively, where each element is a list of the hidden states of the corresponding modality.
    `hidden_states_text` and `hidden_states_image` are lists of tensors holding the unimodal hidden states,
    and `hidden_states_cross_modal` is a list of tuples containing the `cross_modal_text_hidden_states` and
    `cross_modal_image_hidden_states` of each bridge layer. The end of the example below shows how to unpack
    this structure.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels are currently not supported.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerModel
>>> from PIL import Image
>>> import requests

>>> # prepare image and text
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> text = "hello world"
>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
>>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

>>> inputs = processor(image, text, return_tensors="pt")
>>> outputs = model(**inputs)
>>> outputs.keys()
odict_keys(['text_features', 'image_features', 'pooler_output'])
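>>> # Hedged sketch, not part of the original example: with `output_hidden_states=True` the
>>> # `hidden_states` field nests the per-modality states described above as
>>> # (hidden_states_text, hidden_states_image, hidden_states_cross_modal).
>>> outputs = model(**inputs, output_hidden_states=True)
>>> text_states, image_states, cross_modal_states = outputs.hidden_states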
```Nr)   zYBridgeTowerModel does not use `inputs_embeds`.  Make sure to pass in `input_ids` instead.r   )r  r^   r   r   r   r   )r\   r  rB  c              3   .   #    U H  nUc  M  Uv   M     g 7frh   r)   r  s     r5   r  +BridgeTowerModel.forward.<locals>.<genexpr>  s      nA nr  )r$   r%   r&   r'   r(   )+rX   rB  r  r   r)  r   rC  r   r/   r,  rN  r`   r-  rc   rw  r  rk   ru   rB  r  r   r  r_   r   r   ry   r   r@  r  r  	expand_asrJ  rA  fullrK  rG  rF  rx   rM  rN  get_cls_featuresr2   r"   )-rW   r  r\   r  r   rT  r?  r  r<   rU  rB  r  r  rV  r   all_hidden_states_textall_hidden_states_imageall_hidden_states_crossr  r  r  r;   extend_text_maskssplit_indexrk   r   image_embeds_with_lncross_modal_texttext_token_type_embeddingsimage_token_type_embeddingscross_modal_imageextend_image_maskslayer_outputs_textcross_text_featureslayer_outputs_imagecross_image_featureslink_layer_indexr  text_link_towerimage_link_towercross_text_features_cross_image_features_r$   r%   cls_featuress-                                                r5   rl   BridgeTowerModel.forward  s   j 2C1N-TXT_T_TqTq$8$D $++JjJj 	 (<(<"$(<"$"6BD$5b4$):%k  &1%<k$++B]B]7K3QRnn&oo0090E"{n4"!"ZZ5::iN^N^_N OOGGXcdgg

 $//117784;;;X;XX[\\ __,,22<K@E->?BK##&;.8&	 A ,,33??!!$"3"3"9"9:Um @ L
 (//1a8L#6# &&--99CCL[QE .L##'L?:' R
  $0077DD\EVEVW[WhWhWnWnEop  ::;G%)%?%?KKI4D4DE&

)$
% 	#  ::;KNh;hi#??@TU&*&@&@JJt1IL\L\]'

)(
) 	$  46QQ <<=QRZZ##A&(9(>(>q(AB**##


 "__HHUdUdUfgjj
 "99!<,#5/
 13";;A>-#4/
  315#)<>R(S'UU#%7%:<OPQ<R$S#UU {C(?(?(E(E$FGA//1177:;HYZ[\]K,,33??II!L\Z__!!''L 001B1B1I1I1V1VWc1de-. !
 #>>?OPO#@@AQR $3//<?YY#!$ 
 %55IK_as$t! "&!=!=>NQR>R!S$%0'9"3" #5Q"7"&"?"?@PST@T"U%$1'8"3# $7q#9 !#&;.8&'L?:''-@BV,W+YY'  #);A)>@STU@V(W'YY#a Hf )<=Q~,,]NK!79PRi j 'GXZmn   &')&+*
 	
r4   c                 r    U R                  U5      nU R                  U5      n[        R                  " X4/SS9$ )Nr   r   )rI  rH  r/   r   )rW   r$   r%   cls_features_textcls_features_images        r5   r]  !BridgeTowerModel.get_cls_features  s9     88G!::>Jyy+@bIIr4   )rX   rK  rF  rN  rH  rA  rJ  rG  rM  rI  r@  rC  r  rB  )NNNNNNNNNNNNNF)r*   r+   r,   r-   rJ   r  r   r   r   r/   
LongTensorr0   r   rd   r   r2   rn   r"   rl   r]  r3   ro   rp   s   @r5   r;  r;  |  s   6p64  156:594815155948.2,0/3&*-1).j
E,,-j
 !!2!23j
 !!1!12	j

 u001j
 U--.j
 E--.j
   1 12j
 u001j
 'smj
 $D>j
 'tnj
 d^j
 ))*j
 #'j
  
uU\\"$::	;!j
 j
XJ Jr4   r;  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )"BridgeTowerPredictionHeadTransformi  c                 p  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l
        OUR                  U l
        [        R                  " UR                  UR                  S9U l        g r   )rI   rJ   r   rR   rL   r   r  r  r  r
   transform_act_fnrN   rO   rV   s     r5   rJ   +BridgeTowerPredictionHeadTransform.__init__  s~    YYv1163E3EF
f''--$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr4   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ rh   )r   r|  rN   r  s     r5   rl   *BridgeTowerPredictionHeadTransform.forward  s4    

=1--m<}5r4   )rN   r   r|  r   rp   s   @r5   rz  rz    s    U r4   rz  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )r  i  c                 n  > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  R                  SS9U l
        [
        R                  " [        R                  " UR                  R                  5      5      U l        Ub  X R                  l        g g )NF)r   )rI   rJ   rX   rz  	transformr   rR   rL   r>  r  decoderr   r/   r  r   r   )rW   rX   r   rY   s      r5   rJ   BridgeTowerMLMHead.__init__  s    ;FCyy!3!3V5G5G5R5RY^_LLV-?-?-J-J!KL	"(LL r4   c                 d    U R                  U5      nU R                  U5      U R                  -   nU$ rh   )r  r  r   )rW   x	mlm_scores      r5   rl   BridgeTowerMLMHead.forward  s-    NN1%	LL+dii7	r4   )r   rX   r  r  rh   r   rp   s   @r5   r  r    s    ) r4   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerITMHeadi  c                 Z   > [         TU ]  5         [        R                  " US5      U l        g Nr   rI   rJ   r   rR   fc)rW   rL   rY   s     r5   rJ   BridgeTowerITMHead.__init__  s     ))K+r4   c                 (    U R                  U5      nU$ rh   r  )rW   r  	itm_scores      r5   rl   BridgeTowerITMHead.forward  s    GGAJ	r4   r  r   rp   s   @r5   r  r    s    , r4   r  z\
    BridgeTower Model with a language modeling head on top as done during pretraining.
    c                     ^  \ rS rSrS/rU 4S jrS rS r\            SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\
R                     S\\\\
R                     4   4S jj5       rSrU =r$ )BridgeTowerForMaskedLMi  zmlm_score.decoder.weightc                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g rh   )rI   rJ   r;  r  r  r  r  rV   s     r5   rJ   BridgeTowerForMaskedLM.__init__  s5     +F3+F3 	r4   c                 .    U R                   R                  $ rh   r  r  r
  s    r5   get_output_embeddings,BridgeTowerForMaskedLM.get_output_embeddings  s    ~~%%%r4   c                 $    XR                   l        g rh   r  )rW   new_embeddingss     r5   set_output_embeddings,BridgeTowerForMaskedLM.set_output_embeddings  s    !/r4   r  r\   r  r   rT  r?  r  r<   rB  r  r  rV  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
US9nU R                  U(       a  UR                  OUS   5      nSnUbk  [        5       nUR                  UR                  5      nU" UR                  SU R                   R                  R                  5      UR                  S5      5      nU(       d  [        U5      nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
    config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
    loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. The end of the example
    below shows a forward pass with `labels`.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
>>> from PIL import Image
>>> import requests

>>> url = "http://images.cocodataset.org/val2017/000000360943.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
>>> text = "a <mask> looking out of the window"

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
>>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

>>> # prepare inputs
>>> encoding = processor(image, text, return_tensors="pt")

>>> # forward pass
>>> outputs = model(**encoding)

>>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())

>>> print(results)
.a cat looking out of the window.
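>>> # Hedged sketch, not part of the original example: passing `labels` (here simply the input
>>> # ids, purely for illustration) makes the model also return the masked language modeling loss.
>>> outputs = model(**encoding, labels=encoding.input_ids)
>>> loss = outputs.loss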
```N
r\   r  r   rT  r?  r  r<   rB  r  r  r   r   r9   r:   r'   r(   )rX   r)  r  r  r$   r   rc   r`   r   r>  r  r2   r   r'   r(   )rW   r  r\   r  r   rT  r?  r  r<   rB  r  r  rV  r  
mlm_logitsmasked_lm_lossloss_fctrr  s                     r5   rl   BridgeTowerForMaskedLM.forward  s   d &1%<k$++B]B]""))%!'%/!5# # 
 ^^[G$9$9gVWjY
')HYYz001F%joob$++:Q:Q:\:\&]_e_j_jkm_noN:&F3A3M^%.YSYY!//))	
 	
r4   )r  r  NNNNNNNNNNNN)r*   r+   r,   r-   _tied_weights_keysrJ   r  r  r   r   r/   rx  r0   rd   r   r   r2   rl   r3   ro   rp   s   @r5   r  r    sj    55&0  156:594815155948,0/3&*-1Q
E,,-Q
 !!2!23Q
 !!1!12	Q

 u001Q
 U--.Q
 E--.Q
   1 12Q
 u001Q
 $D>Q
 'tnQ
 d^Q
 ))*Q
 
~uU%6%677	8Q
 Q
r4   r  z
    BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the
    [CLS] token) for image-to-text matching.
    c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\R                     S\\\\R                     4   4S jj5       rSrU =r$ )#BridgeTowerForImageAndTextRetrievaliK  c                    > [         TU ]  U5        [        U5      U l        [	        UR
                  S-  5      U l        U R                  5         g r  )rI   rJ   r;  r  r  rL   r  r  rV   s     r5   rJ   ,BridgeTowerForImageAndTextRetrieval.__init__R  s@     +F3+F,>,>,BC 	r4   r  r\   r  r   rT  r?  r  r<   rB  r  r  rV  r   c                    Ub  UOU R                   R                  nU R                  UUUUUUUUU	U
US9nU(       a  UR                  OUS   nU R	                  U5      nSnUb-  [        5       nUR                  UR                  5      nU" X5      nU(       d  [        U5      nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )aM  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
    Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
    The pairs with 0 will be skipped for calculation. The end of the example below shows a forward pass with
    `labels`.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
>>> import requests
>>> from PIL import Image

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
>>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

>>> # forward pass
>>> scores = dict()
>>> for text in texts:
...     # prepare inputs
...     encoding = processor(image, text, return_tensors="pt")
...     outputs = model(**encoding)
...     scores[text] = outputs.logits[0, 1].item()
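>>> # Hedged sketch, not part of the original example: passing a flat `labels` tensor
>>> # (1 = the image-text pair matches) makes the model also return the matching loss.
>>> import torch
>>> outputs = model(**encoding, labels=torch.ones(1, dtype=torch.long))
>>> loss = outputs.loss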
```Nr  r   r  )rX   r)  r  r&   r  r   rc   r`   r2   r   r'   r(   )rW   r  r\   r  r   rT  r?  r  r<   rB  r  r  rV  r  r&   r:   itm_lossr  rr  s                      r5   rl   +BridgeTowerForImageAndTextRetrieval.forward\  s    \ &1%<k$++B]B]""))%!'%/!5# # 
 2=--'!*.')HYYv}}-F/H6]F-5-AXK&(MvM'!//))	
 	
r4   )r  r  r  )r*   r+   r,   r-   rJ   r   r   r/   rx  r0   rd   r   r   r2   rl   r3   ro   rp   s   @r5   r  r  K  sV     156:594815155948,0/3&*-1Q
E,,-Q
 !!2!23Q
 !!1!12	Q

 u001Q
 U--.Q
 E--.Q
   1 12Q
 u001Q
 $D>Q
 'tnQ
 d^Q
 ))*Q
 
'u/@/@)AA	BQ
 Q
r4   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )BridgeTowerContrastiveHeadi  c                 X   > [         TU ]  5         [        R                  " X5      U l        g rh   r  )rW   rL   
embed_sizerY   s      r5   rJ   #BridgeTowerContrastiveHead.__init__  s    ))K4r4   c                 (    U R                  U5      nU$ rh   r  )rW   r  s     r5   rl   "BridgeTowerContrastiveHead.forward  s    GGAJr4   r  r   rp   s   @r5   r  r    s    5 r4   r  zl
    BridgeTower Model with an image-text contrastive head on top computing image-text contrastive loss.
    c                     ^  \ rS rSrU 4S jr\            SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\
   S\\
   S\\
   S\\
   S\\\\R                     4   4S jj5       rSrU =r$ )r  i  c                   > [         TU ]  U5        [        U5      U l        [	        UR
                  UR                  5      U l        [	        UR
                  UR                  5      U l        [	        UR
                  S-  UR                  5      U l	        [        R                  " [        R                  " U R                  R                  5      5      U l        U R#                  5         g r  )rI   rJ   r;  r  r  rL   contrastive_hidden_sizeitc_text_headitc_image_headitc_cross_modal_headr   r   r/   r   rX   r  r  r  rV   s     r5   rJ   *BridgeTowerForContrastiveLearning.__init__  s     +F378J8JFLjLjk89K9KVMkMkl$>v?Q?QTU?UW]WuWu$v!<<T[[5W5W(XYr4   r  r\   r  r   rT  r?  r  r<   rB  r  r  return_lossr   c                 >   Ub  UOU R                   R                  nU R                  UUUUUUUUU	SUS9nU(       a  UR                  OUS   nU(       a  UR                  OUS   u  nnnUS   nUS   nU R                  R
                  R                  R                  U5      nU R                  R                  [        R                  " SS[        R                  U R                  R                  R                  R                  S	95      R                  U5      nU R                  R                  U5      U-   n[         R"                  R%                  U R'                  USS2S
SS24   5      SSS9n[         R"                  R%                  U R)                  USS2S
SS24   5      SSS9R+                  UR                  S9n[         R"                  R%                  U R-                  U5      SSS9R+                  UR                  S9n[        R.                  " UUU/SS9nU R0                  R3                  5       R+                  UR                  S9n[        R4                  " UUR7                  5       5      U-  n[        R4                  " UUR7                  5       5      U-  n[        R4                  " UUR7                  5       5      U-  nSnU(       a  [        R8                  " [;        U5      UR                  S9n[         R"                  R=                  UU5      n[         R"                  R=                  UU5      n[         R"                  R=                  UU5      nUU-   U-   S-  nU(       d  UUUU4USS -   n Ub  U4U -   $ U $ [?        UUUUUUR                  UR@                  S9$ )a  
image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
    Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
    This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
return_loss (`bool`, *optional*):
    Whether or not to return the contrastive loss.

Examples:

```python
>>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
>>> import requests
>>> from PIL import Image
>>> import torch

>>> image_urls = [
...     "https://farm4.staticflickr.com/3395/3428278415_81c3e27f15_z.jpg",
...     "http://images.cocodataset.org/val2017/000000039769.jpg",
... ]
>>> texts = ["two dogs in a car", "two cats sleeping on a couch"]
>>> images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]

>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
>>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")

>>> inputs = processor(images, texts, padding=True, return_tensors="pt")
>>> loss = model(**inputs, return_loss=True).loss

>>> inputs = processor(images, texts[::-1], padding=True, return_tensors="pt")
>>> loss_swapped = model(**inputs, return_loss=True).loss

>>> print("Loss", round(loss.item(), 4))
Loss 0.0019

>>> print("Loss with swapped images", round(loss_swapped.item(), 4))
Loss with swapped images 2.126
```NTr  r   r	   r   rX  r   r^   r   )r   pr(  rE  r   g      @)r9   r:   r;   r<   r=   r'   r(   )!rX   r)  r  r&   r'   rB  r  r   r  r/   r\  rN  r   r`   r[  rA  r   r   	normalizer  r  rc   r  r   r  exprM  tr   rw  cross_entropyr7   r(   )!rW   r  r\   r  r   rT  r?  r  r<   rB  r  r  r  r  r&   hidden_states_txthidden_states_imghidden_states_cross_modalr;   rc  rf  r=   r:   r  logits_text_to_imagelogits_text_to_crosslogits_image_to_crossitc_lossrV  text_to_image_losstext_to_cross_lossimage_to_cross_lossrr  s!                                    r5   rl   )BridgeTowerForContrastiveLearning.forward  sz   j &1%<k$++B]B]""))%!'%/!%# # 
 2=--'!*%0G!!gaj 	H,.G (+(,#//<<CCPPQ]^&*&6&6&L&LJJtQejj9I9I9_9_9f9f9m9mn'

)(
) 	$ ''CCDXY\ww mm--d.@.@QPQSTWAU.V\^bc-d}}..t/B/B<PQSTVWPWCX/Y_aef.gjj%% k 
 }}..t/H/H/W]_cd.ehh%% i 
 k<FBO&&**,//{7I7I/J$||K9IJ[X$||K9IJ[X %\<>>;K L{ Z\\#f+fmmDF!#!<!<=QSY!Z!#!<!<=QSY!Z"$--"="=>SU["\*-??BUUY\\Hk<FQRQSTF-5-AXK&(MvM+#%%!//))
 	
r4   )r  r  r  r  r  )NNNNNNNNNTNN)r*   r+   r,   r-   rJ   r   r   r/   rx  r0   rd   r   r7   r2   rl   r3   ro   rp   s   @r5   r  r    sO     156:594815155948,0/3&*&*x
E,,-x
 !!2!23x
 !!1!12	x

 u001x
 U--.x
 E--.x
   1 12x
 u001x
 $D>x
 'tnx
 d^x
 d^x
 
+U53D3D-EE	Fx
 x
r4   r  )r  r  r  r;  r  )r   )Nr.   rP  collectionsr   dataclassesr   typingr   r   r/   torch.utils.checkpointr   torch.nnr   activationsr
   r   cache_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   configuration_bridgetowerr   r   r   
get_loggerr*   r  _TOKENIZER_FOR_DOCr"   r7   r  r?   rr   r   r   r   r   r  r  r  r(  rp  rm  r  r  r  r  r  r  r  r  r;  rz  r  r  r  r  r  r  __all__r)   r4   r5   <module>r     s      # ! "    % 6 5 9  I Q 7 7 h h 
		H	%'  
:[ : :$ 
:; : :4)299 )XRYY 6P")) Pf7"299 7"td299 d4BII bii  		 		  @.ryy @.H %' #2299 2j<		 <~F5 FT[
RYY [
~V=		 V=t4  % % %DY7 Y U
5 U
U
p 
oJ1 oJ
oJf	 "    
d
7 d

d
N ]
*D ]
]
@  
G
(B G

G
Tr4   