"""PyTorch TVP Model"""

import math
from dataclasses import dataclass
from typing import Optional

import torch
import torch.utils.checkpoint
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...pytorch_utils import prune_linear_layer
from ...utils import auto_docstring, logging
from ...utils.backbone_utils import load_backbone
from .configuration_tvp import TvpConfig


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring
class TvpVideoGroundingOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Temporal-Distance IoU loss for video grounding.
    logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
        Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
        input texts.
    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
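

# Decoding sketch (added commentary, not part of the upstream file): `logits` holds
# (start_time / duration, end_time / duration), so for a clip whose length in seconds
# is known, a hypothetical conversion back to seconds would be:
#
#     start_sec = outputs.logits[0, 0].item() * video_duration_sec
#     end_sec = outputs.logits[0, 1].item() * video_duration_sec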


class TvpLoss(nn.Module):
    """
    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervise class and box).

    Args:
        losses (`list[str]`):
            List of all the losses to be applied.
    """
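
    # Illustrative note (assumption, mirrors `TvpForVideoGrounding.forward` below): each
    # configured loss returns a per-sample tensor keyed by its name, so a combined loss
    # can be built with hypothetical weights `w_dist` and `w_dur`, e.g.
    #
    #     criterion = TvpLoss(["iou", "distance", "duration"])
    #     loss_dict = criterion(logits, labels)
    #     loss = loss_dict["iou"] + w_dist * loss_dict["distance"] + w_dur * loss_dict["duration"]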
c                    > [         TU ]  5         U R                  U R                  U R                  S.U l        U H!  nX R
                  ;  d  M  [        SU S35      e   Xl        g )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr6   r   	__class__s      r&   r0   TvpLoss.__init__E   sa    ==****

 D==( 5n!=>>  r%   c                     [         R                  " XB5      [         R                  " X15      -
  n[         R                  " XB5      [         R                  " X15      -
  nSUR                  SS9U-  -
  nU$ )z&
Measure the intersection over union.
r   r   min)r    r<   maxclamp)	r7   
start_timeend_timecandidates_start_timecandidates_end_timer.   interunionr,   s	            r&   r1   TvpLoss.loss_iouR   s_     		-8599EZ;gg		-8599EZ;gg%++!+$u,,
r%   c                 P   [         R                  " [         R                  " X45      S5      n[         R                  " [         R                  " X5      S5      n[         R                  " [         R                  " Xg5      [         R                  " Xg5      -
  U5      R                  SS9nU$ )z%
Measure the distance of mid points.
g       @g?r;   )r    divaddr=   r<   r>   )	r7   r?   r@   rA   rB   r.   mid_candidatesmid_groundtruthdistance_diffs	            r&   r2   TvpLoss.loss_distance\   sy     599-B#XZ]^))EIIj$CSI		IIn6>9ccem

%C%. 	 r%   c                     [         R                  " XC5      n[         R                  " X!5      n[         R                  " [         R                  " [         R                  " Xg5      U5      5      nUR	                  SS9nU$ )z%
Measure the difference of duration.
g?r;   )r    subsquarerG   r>   )	r7   r?   r@   rA   rB   r.   duration_candidatesduration_groundtruthduration_diffs	            r&   r3   TvpLoss.loss_durationh   s`     $ii(;S$yy>UYYuyy9L/cem%no%+++4r%   c                    Uu  p4n[         R                  " X5      nUSS2S4   R                  5       USS2S4   R                  5       p0 n	U R                   H*  n
U	R	                  XR
                  U
   " XEXxU5      05        M,     U	$ )a5  
        """
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`list[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
        """
        duration, start_time, end_time = labels
        candidates = torch.mul(logits, duration)
        candidates_start_time, candidates_end_time = candidates[:, 0].float(), candidates[:, 1].float()

        losses_dict = {}
        for loss in self.losses:
            losses_dict.update(
                {loss: self.loss_map[loss](start_time, end_time, candidates_start_time, candidates_end_time, duration)}
            )

        return losses_dict


class TvpVisionModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.backbone = load_backbone(config)

        if config.backbone_config is not None:
            in_channels = config.backbone_config.hidden_sizes[-1]
        elif hasattr(self.backbone, "config") and hasattr(self.backbone.config, "hidden_sizes"):
            in_channels = self.backbone.config.hidden_sizes[-1]
        elif hasattr(self.backbone, "config") and hasattr(self.backbone.config, "hidden_size"):
            in_channels = self.backbone.config.hidden_size
        else:
            raise ValueError("Backbone config not found")

        self.grid_encoder_conv = nn.Conv2d(
            in_channels,
            config.hidden_size,
            kernel_size=3,
            stride=1,
            padding=1,
            groups=1,
            bias=False,
        )

    def forward(self, pixel_values):
        batch_size, num_frames, num_channels, height, width = pixel_values.shape
        # (batch_size * num_frames, num_channels, height, width)
        pixel_values = pixel_values.view(batch_size * num_frames, num_channels, height, width)
        grid_feat_outputs = self.backbone(pixel_values)["feature_maps"][0]
        grid = self.grid_encoder_conv(grid_feat_outputs)
        grid = nn.functional.max_pool2d(grid, kernel_size=2, stride=2)
        grid = nn.functional.relu(grid, inplace=True)
        new_channel, new_height, new_width = grid.shape[-3:]
        # (batch_size, num_frames, new_channel, new_height, new_width)
        grid = grid.view(batch_size, num_frames, new_channel, new_height, new_width)
        # (batch_size, num_frames, new_height, new_width, new_channel)
        grid = grid.permute(0, 1, 3, 4, 2)
        return grid


class TvpVisualInputEmbedding(nn.Module):
    """
    Takes input of both image and video (multi-frame)
    """

    def __init__(self, config):
        super().__init__()
        # sequence embedding
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.row_position_embeddings = nn.Embedding(config.max_grid_row_position_embeddings, config.hidden_size)
        self.col_position_embeddings = nn.Embedding(config.max_grid_col_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(1, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.max_grid_row_position_embeddings = config.max_grid_row_position_embeddings
        self.max_grid_col_position_embeddings = config.max_grid_col_position_embeddings

    def interpolate_pos_encoding(self, embedding: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position embeddings, to be able to use the model on
        collections of high resolution images (high resolution videos).
        """
        h0 = w0 = 1
        # if the height dimension is to be interpolated
        if height > self.max_grid_row_position_embeddings:
            h0 = height / self.max_grid_row_position_embeddings
        # if the width dimension is to be interpolated
        if width > self.max_grid_col_position_embeddings:
            w0 = width / self.max_grid_col_position_embeddings
        embedding = embedding.permute(0, 3, 1, 2)  # (batch_size, hidden_dim, height, width)
        embedding = nn.functional.interpolate(
            embedding,
            scale_factor=(h0, w0),
            mode="bicubic",
            align_corners=False,
        )
        embedding = embedding.permute(0, 2, 3, 1)  # (batch_size, height, width, hidden_dim)
        return embedding

    def add_2d_positional_embeddings(self, grid, interpolate_pos_encoding: bool = False):
        """
        Args:
            grid: (batch_size, height, width, hidden_dim)
            interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        """
        batch_size, height, width, hidden_dim = grid.shape

        # add row-wise position embeddings, (row_height, hidden_dim)
        row_height = min(self.max_grid_row_position_embeddings, height)
        row_position_ids = torch.arange(row_height, dtype=torch.long, device=grid.device)
        row_position_embeddings = self.row_position_embeddings(row_position_ids)
        row_shape = (1,) * (len(grid.shape) - 3) + (row_height, 1, hidden_dim)
        # (1, row_height, 1, hidden_dim)
        row_position_embeddings = row_position_embeddings.view(*row_shape)

        # add column-wise position embeddings, (row_width, hidden_dim)
        row_width = min(self.max_grid_col_position_embeddings, width)
        col_position_ids = torch.arange(row_width, dtype=torch.long, device=grid.device)
        col_position_embeddings = self.col_position_embeddings(col_position_ids)
        col_shape = (batch_size, 1, row_width, hidden_dim)
        # (batch_size, 1, row_width, hidden_dim)
        col_position_embeddings = col_position_embeddings.view(*col_shape)
        positional_embeddings = row_position_embeddings + col_position_embeddings

        # interpolation is triggered only when the input grid is larger than the
        # pre-trained position embedding grid in either dimension
        if interpolate_pos_encoding and (
            height > self.max_grid_row_position_embeddings or width > self.max_grid_col_position_embeddings
        ):
            grid = grid + self.interpolate_pos_encoding(positional_embeddings, height, width)
        else:
            grid = grid + positional_embeddings
        return grid

    def forward(self, grid, interpolate_pos_encoding: bool = False):
        """
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1

            interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)
        """
        batch_size, num_frames, height, width, num_channels = grid.shape
        # temporal mean pooling, (batch_size, height, width, hidden_size)
        grid = grid.mean(1)
        grid = self.add_2d_positional_embeddings(grid, interpolate_pos_encoding=interpolate_pos_encoding)
        # image token sequence, (batch_size, height*width, num_channels)
        visual_tokens = grid.view(batch_size, -1, num_channels)
        visual_tokens_shape = visual_tokens.shape[:-1]
        device = visual_tokens.device

        # image token type embeddings
        token_type_ids = torch.zeros(visual_tokens_shape, dtype=torch.long, device=device)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = visual_tokens + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings


class TvpTextInputEmbeddings(nn.Module):
    """Construct the embeddings from word, position and token_type embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        if position_ids is None:
            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
            position_ids = position_ids.unsqueeze(0).expand(input_shape)
        if token_type_ids is None:
            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        position_embeddings = self.position_embeddings(position_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)
        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings


class TvpAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)
        self.attn_dropout = nn.Dropout(config.attention_probs_dropout_prob)

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
        # convert to set and remove already pruned heads
        heads = set(heads) - self.pruned_heads
        for head in heads:
            # compute how many pruned heads are before this head and move the index accordingly
            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
            mask[head] = 0
        mask = mask.view(-1).contiguous().eq(1)
        index = torch.arange(len(mask))[mask].long()

        # prune linear layers
        self.query = prune_linear_layer(self.query, index)
        self.key = prune_linear_layer(self.key, index)
        self.value = prune_linear_layer(self.value, index)
        self.dense = prune_linear_layer(self.dense, index, dim=1)

        # update hyper params and store pruned heads
        self.num_attention_heads = self.num_attention_heads - len(heads)
        self.all_head_size = self.attention_head_size * self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def _reshape(self, tensor: torch.Tensor, sequence_length: int, batch_size: int):
        return (
            tensor.view(batch_size, sequence_length, self.num_attention_heads, self.attention_head_size)
            .transpose(1, 2)
            .contiguous()
        )

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        batch_size, sequence_length = hidden_states.shape[:2]
        mixed_query_layer = self.query(hidden_states)

        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        query_layer = self._reshape(mixed_query_layer, sequence_length, batch_size)
        key_layer = self._reshape(mixed_key_layer, sequence_length, batch_size)
        value_layer = self._reshape(mixed_value_layer, sequence_length, batch_size)

        # take the dot product between "query" and "key" to get the raw attention scores
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        if attention_mask is not None:
            attention_scores = attention_scores + attention_mask

        # normalize the attention scores to probabilities
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # this is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper
        attention_probs = self.attn_dropout(attention_probs)

        # mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        attn_output = torch.matmul(attention_probs, value_layer)
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(batch_size, sequence_length, self.all_head_size)

        attn_output = self.dense(attn_output)
        attn_output = self.dropout(attn_output)
        attn_output = self.layer_norm(attn_output + hidden_states)
        # add attentions if we output them
        outputs = (attn_output, attention_probs) if output_attentions else (attn_output,)

        return outputs


class TvpIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class TvpOutputLayer(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.layer_norm(hidden_states + input_tensor)
        return hidden_states


class TvpEncodeLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.attention = TvpAttention(config)
        self.intermediate = TvpIntermediate(config)
        self.output = TvpOutputLayer(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        output_attentions: Optional[bool] = None,
    ):
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask,
            head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]
        # add self attentions if we output attention weights
        outputs = self_attention_outputs[1:]
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        outputs = (layer_output,) + outputs
        return outputs


class TvpEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([TvpEncodeLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        all_hidden_states = ()
        all_attentions = ()

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)
            layer_outputs = layer_module(
                hidden_states,
                attention_mask,
                head_mask[i] if head_mask is not None else None,
                output_attentions,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        # add the last layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            outputs = (hidden_states,)
            if output_hidden_states:
                outputs = outputs + (all_hidden_states,)
            if output_attentions:
                outputs = outputs + (all_attentions,)
            # last-layer hidden state, (all hidden states), (all attentions)
            return outputs

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states if output_hidden_states else None,
            attentions=all_attentions if output_attentions else None,
        )


class TvpPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # we "pool" the model by simply taking the hidden state corresponding to the first token
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class TvpPreTrainedModel(PreTrainedModel):
    config: TvpConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True

    def _init_weights(self, module: nn.Module):
        """Initialize the weights"""
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv2d):
            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
            if module.bias is not None:
                nn.init.constant_(module.bias, 0)
        elif isinstance(module, TvpModel):
            nn.init.normal_(module.text_prompt)

        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

        if hasattr(module, "pad_up"):
            nn.init.normal_(module.pad_up)
        if hasattr(module, "pad_down"):
            nn.init.normal_(module.pad_down)
        if hasattr(module, "pad_left"):
            nn.init.normal_(module.pad_left)
        if hasattr(module, "pad_right"):
            nn.init.normal_(module.pad_right)


class TvpFrameDownPadPrompter(nn.Module):
    """
    Pad frames extracted from videos only at the bottom.
    """

    def __init__(self, config):
        if config.visual_prompter_apply not in ("add", "replace", "remove"):
            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")

        super().__init__()
        self.visual_prompt_size = config.visual_prompt_size
        self.frame_num = config.frame_num
        self.max_img_size = config.max_img_size
        self.visual_prompter_apply = config.visual_prompter_apply

        self.pad_down = nn.Parameter(
            torch.randn([1, config.frame_num, 3, config.visual_prompt_size, config.max_img_size])
        )

    def forward(self, pixel_values):
        if self.visual_prompter_apply != "add":
            visual_prompt_mask = torch.ones(
                [self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
            )
            visual_prompt_mask[self.max_img_size - self.visual_prompt_size : self.max_img_size, :] = 0.0
            pixel_values *= visual_prompt_mask
        if self.visual_prompter_apply != "remove":
            prompt = torch.zeros(
                [pixel_values.shape[0], pixel_values.shape[1], 3, self.max_img_size, self.max_img_size],
                device=pixel_values.device,
            )
            start_point = self.max_img_size - self.visual_prompt_size
            prompt[:, :, :, start_point : self.max_img_size, :] = self.pad_down
            pixel_values += prompt.to(pixel_values.dtype)
        return pixel_values


class TvpFramePadPrompter(nn.Module):
    """
    Pad frames extracted from videos in the surroundings.
    """

    def __init__(self, config):
        if config.visual_prompter_apply not in ("add", "replace", "remove"):
            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")

        super().__init__()
        self.num_frames = config.num_frames
        self.max_img_size = config.max_img_size
        self.visual_prompter_apply = config.visual_prompter_apply
        self.base_size = config.max_img_size - config.visual_prompt_size * 2
        self.pad_up = nn.Parameter(
            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
        )
        self.pad_down = nn.Parameter(
            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
        )
        self.pad_left = nn.Parameter(
            torch.randn(
                [
                    1,
                    config.num_frames,
                    3,
                    config.max_img_size - config.visual_prompt_size * 2,
                    config.visual_prompt_size,
                ]
            )
        )
        self.pad_right = nn.Parameter(
            torch.randn(
                [
                    1,
                    config.num_frames,
                    3,
                    config.max_img_size - config.visual_prompt_size * 2,
                    config.visual_prompt_size,
                ]
            )
        )

    def interpolate_pad_encoding(self, prompt: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
        resolution images (high resolution videos).
        """
        # create the scale factors from the input height and width w.r.t. config.max_img_size
        h0, w0 = height / self.max_img_size, width / self.max_img_size

        batch, num_frames, channels, prompt_height, prompt_width = prompt.shape

        # fold the batch and frame dimensions into one, (batch, frames, c, h, w) -> (batch * frames, c, h, w),
        # so bicubic interpolation can be applied
        prompt = prompt.reshape(batch * num_frames, channels, prompt_height, prompt_width)
        prompt = nn.functional.interpolate(
            prompt,
            scale_factor=(h0, w0),
            mode="bicubic",
            align_corners=False,
        )
        # reverse back to (batch, frames, channels, height, width), with the new interpolated sizes
        prompt = prompt.reshape(batch, num_frames, channels, height, width)
        return prompt

    def forward(self, pixel_values, interpolate_pad_encoding: bool = False):
        height, width = (
            (pixel_values.shape[-2], pixel_values.shape[-1])
            if interpolate_pad_encoding
            else (self.max_img_size, self.max_img_size)
        )
        if self.visual_prompter_apply not in ("add", "remove", "replace"):
            raise ValueError(f"Invalid visual_prompter_apply value {self.visual_prompter_apply}")
        if self.visual_prompter_apply in ("replace", "remove"):
            visual_prompt_mask = torch.ones([height, width], dtype=pixel_values.dtype, device=pixel_values.device)
            pixel_values *= visual_prompt_mask
        if self.visual_prompter_apply in ("replace", "add"):
            base = torch.zeros(1, self.num_frames, 3, self.base_size, self.base_size, device=pixel_values.device)
            prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
            prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
            prompt = torch.cat(pixel_values.size(0) * [prompt])
            if interpolate_pad_encoding:
                prompt = self.interpolate_pad_encoding(prompt, height, width)
            pixel_values = pixel_values + prompt.to(pixel_values.dtype)
        return pixel_values


TVP_PROMPTER_CLASSES_MAPPING = {
    "framedownpad": TvpFrameDownPadPrompter,
    "framepad": TvpFramePadPrompter,
}


@auto_docstring(
    custom_intro="""
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    """
)
class TvpModel(TvpPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.vision_model = TvpVisionModel(config)
        self.embeddings = TvpTextInputEmbeddings(config)
        self.visual_embeddings = TvpVisualInputEmbedding(config)
        self.encoder = TvpEncoder(config)
        self.pooler = TvpPooler(config)
        self.text_prompt = nn.Parameter(torch.randn([1, 10, config.hidden_size]))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        if config.visual_prompter_type not in TVP_PROMPTER_CLASSES_MAPPING:
            raise ValueError("`visual_prompter_type` must be in (framedownpad, framepad)")
        self.visual_prompter = TVP_PROMPTER_CLASSES_MAPPING[config.visual_prompter_type](config)

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """Prunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel

        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```"""
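        # Note (added commentary, not in the upstream file): the encoder input built below is the
        # concatenation [text_prompt (10 learned tokens); text token embeddings; visual grid tokens],
        # and the attention mask is extended in the same order before being broadcast across heads.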
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        # add the visual prompt; it compensates for the spatiotemporal information loss in 2D visual features
        pixel_values = self.vision_model(
            self.visual_prompter(pixel_values, interpolate_pad_encoding=interpolate_pos_encoding)
        )
        # (batch_size, sequence_length, hidden_size)
        text_embedding_output = self.embeddings(input_ids=input_ids)
        # (batch_size, visual_sequence_length, hidden_size)
        visual_embedding_output = self.visual_embeddings(
            pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
        )
        if attention_mask is not None:
            # (batch_size, visual_sequence_length)
            visual_attention_mask = attention_mask.new_ones(visual_embedding_output.shape[:2])
            pt_mask = torch.ones(attention_mask.shape[0], 10).to(
                device=attention_mask.device, dtype=attention_mask.dtype
            )
            attention_mask = torch.cat([pt_mask, attention_mask, visual_attention_mask], dim=-1)
            attention_mask = self.get_extended_attention_mask(attention_mask, attention_mask.size()).to(
                input_ids.device
            )
        text_prompt = self.text_prompt.expand(text_embedding_output.shape[0], -1, -1)
        # (batch_size, sequence_length + visual_sequence_length, hidden_size)
        embedding_output = torch.cat([text_prompt, text_embedding_output, visual_embedding_output], dim=1)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=self.get_head_mask(head_mask, self.config.num_hidden_layers),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        last_hidden_state = encoder_outputs.last_hidden_state if return_dict else encoder_outputs[0]
        pooled_output = self.pooler(last_hidden_state)
        last_hidden_state = self.dropout(last_hidden_state)
        pooled_output = self.dropout(pooled_output)
        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class TvpVideoGroundingHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_0 = nn.Linear(config.hidden_size, config.hidden_size * 2)
        self.layer_1 = nn.Linear(config.hidden_size * 2, 2)
        self.activation_0 = nn.ReLU()
        self.activation_1 = nn.Sigmoid()

    def forward(self, pooler_output):
        logits = self.activation_0(self.layer_0(pooler_output))
        logits = self.activation_1(self.layer_1(logits))
        return logits


@auto_docstring(
    custom_intro="""
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    """
)
class TvpForVideoGrounding(TvpPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config
        self.model = TvpModel(config)
        self.video_grounding_head = TvpVideoGroundingHead(config)

        self.post_init()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        labels: Optional[tuple[torch.Tensor]] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> TvpVideoGroundingOutput:
        r"""
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contain duration, start time, and end time of the video corresponding to the text.

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```"""
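        # Note (added commentary, not in the upstream file): when labels are provided, the loss
        # computed below is loss_iou + distance_loss_weight * loss_distance
        # + duration_loss_weight * loss_duration, with both weights read from the model config.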
        return_dict = return_dict if return_dict is not None else self.config.return_dict
        outputs = self.model(
            input_ids,
            pixel_values,
            attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        pooler_output = outputs[1]
        logits = self.video_grounding_head(pooler_output)

        loss = None
        if labels is not None:
            criterion = TvpLoss(["iou", "distance", "duration"])
            criterion.to(self.device)
            loss_dict = criterion(logits, labels)
            loss = (
                loss_dict["iou"]
                + self.config.distance_loss_weight * loss_dict["distance"]
                + self.config.duration_loss_weight * loss_dict["duration"]
            )
        if not return_dict:
            outputs = (logits,) + outputs[2:]
            if loss is not None:
                outputs = (loss,) + outputs
            return outputs

        return TvpVideoGroundingOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = ["TvpModel", "TvpPreTrainedModel", "TvpForVideoGrounding"]