
    <hC5                       S SK r S SKJr  S SKJrJrJrJr  S SKrS SKJ	r	  SSK
Jr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJrJr  SSKJrJr  SSKJr  SSKJrJrJr  SSK J!r!J"r"J#r#J$r$J%r%J&r&  SSK'J(r(J)r)J*r*  SSK+J,r,J-r-J.r.  \%R^                  " \05      r1 " S S\	Rd                  5      r3 SFS\	Rd                  S\Rh                  S\Rh                  S\Rh                  S\\Rh                     S\5S\54S jjr6 " S S\	Rd                  5      r7 " S S \	Rd                  5      r8 " S! S"\5      r9 " S# S$\	Rd                  5      r: " S% S&\	Rd                  5      r; " S' S(\	Rd                  5      r< " S) S*\	Rd                  5      r= " S+ S,\	Rd                  5      r> " S- S.\	Rd                  5      r? " S/ S0\5      r@ " S1 S2\	Rd                  5      rA " S3 S4\	Rd                  5      rB\# " S5 S6\5      5       rC " S7 S8\C5      rD " S9 S:\C5      rE\\#" S;S<9 " S= S>\!5      5       5       rF\#" S?S<9 " S@ SA\C5      5       rG\#" SBS<9 " SC SD\C\5      5       rH/ SEQrIg)G    N)	dataclass)AnyCallableOptionalUnion)nn   )ACT2FN)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int   )	AutoModelAutoModelForCausalLMAutoModelForSeq2SeqLM   )InstructBlipVideoConfigInstructBlipVideoQFormerConfigInstructBlipVideoVisionConfigc                      ^  \ rS rSrS\4U 4S jjrS\R                  S\S\S\R                  4S jr	SS	\R                  S
\S\R                  4S jjrSrU =r$ )!InstructBlipVideoVisionEmbeddings6   configc                 r  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        UR                  U l        [        R                  " [        R                  " SSU R                  5      5      U l        [        R                  " SU R                  U R                  U R                  S9U l        U R
                  U R                  -  S-  U l        U R                  S-   U l        [        R                  " [        R                  " SU R                  U R                  5      5      U l        g )Nr"   r	   )in_channelsout_channelskernel_sizestrider   )super__init__r)   hidden_size	embed_dim
image_size
patch_sizer   	Parametertorchrandnclass_embeddingConv2dpatch_embeddingnum_patchesnum_positionsposition_embeddingselfr)   	__class__s     x/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/instructblipvideo/modeling_instructblipvideo.pyr0   *InstructBlipVideoVisionEmbeddings.__init__7   s    ++ ++ ++!||EKK1dnn,MN!yyDOO\`\k\k 
 !OOt>1D!--1"$,,u{{1d>P>PRVR`R`/a"b    
embeddingsheightwidthreturnc                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  -  n	X0R
                  -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows interpolating the pre-trained position encodings so that the model can be used on higher-resolution
images. It is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r"   Ng      ?r   r	   r   bicubicF)sizemodealign_cornersdim)shaper=   r6   jit
is_tracingr4   r   reshapepermuter   
functionalinterpolateviewcat)r?   rD   rE   rF   r;   r<   class_pos_embedpatch_pos_embedrO   
new_height	new_widthsqrt_num_positionss               rA   interpolate_pos_encoding:InstructBlipVideoVisionEmbeddings.interpolate_pos_encodingI   sS    !&&q)A-//55a81< yy##%%+*F6?***11!RaR%811!QR%8r".
__,	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCrC   pixel_valuesr^   c                    UR                   u  p4pVU R                  R                  R                  nU R                  UR	                  US95      nUR                  S5      R                  SS5      nU R                  R                  USS5      R	                  U5      n	[        R                  " X/SS9n
U(       a  U R                  XU5      nOU R                  nXS S 2S U
R                  S5      2S S 24   R	                  U5      -   n
U
$ )Ndtyper   r"   rI   rN   )rP   r:   weightrc   toflatten	transposer8   expandr6   rX   r^   r=   rK   )r?   r`   r^   
batch_size_rE   rF   target_dtypepatch_embedsclass_embedsrD   r=   s               rA   forward)InstructBlipVideoVisionEmbeddings.forwardq   s    '3'9'9$
v++2288++LOO,O,OP#++A.88A>++22:q"EHHVYY;C
#!%!>!>zSX!Y!%!8!8Q8L*//!:L8La5O"P"S"ST`"aa
rC   )	r8   r)   r2   r3   r;   r<   r:   r4   r=   F)__name__
__module____qualname____firstlineno__r%   r0   r6   Tensorintr^   FloatTensorboolrn   __static_attributes____classcell__r@   s   @rA   r'   r'   6   sr    c< c$&D5<< &D &DUX &D]b]i]i &DPE$5$5 QU bgbnbn  rC   r'   modulequerykeyvalueattention_maskscalingdropoutc                 `   [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  USS9n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )NrI   rN   )ptrainingr"   r   )	r6   matmulrg   r   rU   softmaxr   r   
contiguous)
r|   r}   r~   r   r   r   r   kwargsattn_weightsattn_outputs
             rA   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2(>L==((6??([L,,|3K''1-88:K$$rC   c                     ^  \ rS rSrSrU 4S jrS\R                  S\S\4S jr	  SS\R                  S	\
\R                     S
\
\   S\\R                  \
\R                     \
\\R                        4   4S jjrSrU =r$ )InstructBlipVideoAttention   z=Multi-headed attention from 'Attention Is All You Need' paperc                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        SU l
        UR                  U l        [        R                  " U R                  SU R                  -  SS9U l        UR                  (       ai  [        R                   " ["        R$                  " U R                  5      5      n[        R                   " ["        R$                  " U R                  5      5      nOS nS nUbQ  ["        R&                  " U["        R(                  " USS9U45      n[        R                   " U5      U R                  l        [        R                  " U R                  U R                  5      U l        g )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr	   )bias)requires_grad)r/   r0   r)   r1   r2   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   Linearqkvqkv_biasr5   r6   zerosrX   
zeros_liker   
projection)r?   r)   q_biasv_biasr   r@   s        rA   r0   #InstructBlipVideoAttention.__init__   ss   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!9 99T^^Q-?eL??\\%++dnn"=>F\\%++dnn"=>FFFyy&%*:*:6QV*WY_!`aHLL2DHHM))DNNDNNCrC   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ )Nr"   r   )rW   r   r   rg   r   )r?   r   r   r   s       rA   _shape!InstructBlipVideoAttention._shape   s5    {{3GQQRSUVWbbddrC   hidden_states	head_maskoutput_attentionsrG   c                    UR                  5       u  pVnU R                  U5      nUR                  XVSU R                  XpR                  -  5      R	                  SSSSS5      nUS   US   US   pn	[
        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU" U U	U
U4S	U R                  (       d  S
OU R                  U R                  S.UD6u  pUR                  XVS5      R                  5       nU R                  U5      nU(       a  X4nU$ US	4nU$ )z#Input shape: Batch x Time x Channelr	   r   r   r"      eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.N        )r   r   r   rI   )rK   r   rS   r   rT   r   r)   _attn_implementationloggerwarning_oncer   r   r   r   r   r   )r?   r   r   r   r   r   tgt_lenr2   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   outputss                   rA   rn   "InstructBlipVideoAttention.forward   sc    #0"4"4"6iHH]+	%%cAt~~yTbTbGbckkq!Q
	 2;1y|YWX\,(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7		%

  #}}C$2H2HJJ	%
 	%
! "))#;FFHook21B;- JUVZH[rC   )	r   r)   r2   r   r   r   r   r   r   NF)rq   rr   rs   rt   __doc__r0   r6   ru   rv   r   r   rx   tuplern   ry   rz   r{   s   @rA   r   r      s    GD>eU\\ eC ec e -1,1	,||, ELL), $D>	, 
u||Xell3XeELL>Q5RR	S, ,rC   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InstructBlipVideoMLP   c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g N)r/   r0   r)   r
   
hidden_actactivation_fnr   r   r1   intermediate_sizefc1fc2r>   s     rA   r0   InstructBlipVideoMLP.__init__   sb    #F$5$5699V//1I1IJ99V55v7I7IJrC   r   rG   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r?   r   s     rA   rn   InstructBlipVideoMLP.forward   s4    /**=9/rC   )r   r)   r   r   
rq   rr   rs   rt   r0   r6   ru   rn   ry   rz   r{   s   @rA   r   r      s)    KU\\ ell  rC   r   c            
          ^  \ rS rSrS\4U 4S jjr S
S\R                  S\R                  S\\	   S\
\R                     4S jjrS	rU =r$ )InstructBlipVideoEncoderLayer   r)   c                 <  > [         TU ]  5         UR                  U l        [	        U5      U l        [        R                  " U R                  UR                  S9U l	        [        U5      U l        [        R                  " U R                  UR                  S9U l        g Neps)r/   r0   r1   r2   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r>   s     rA   r0   &InstructBlipVideoEncoderLayer.__init__   sm    ++3F;<<F<Q<QR'/<<F<Q<QRrC   r   r   r   rG   c                     UnU R                  U5      nU R                  UUUS9u  pX-   nUnU R                  U5      nU R                  U5      nX-   nU4nU(       a  Xe4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r   r   )r   r   r   r   )r?   r   r   r   residualr   r   s          rA   rn   %InstructBlipVideoEncoderLayer.forward  s      !((7&*nn'$/ '5 '
#
 &0 ((7/%0 "&GrC   )r2   r   r   r   r   rp   )rq   rr   rs   rt   r#   r0   r6   ru   r   rx   r   rw   rn   ry   rz   r{   s   @rA   r   r      s^    S6 S -2	$||$ $ $D>	$
 
u  	!$ $rC   r   c                      ^  \ rS rSrSrS\4U 4S jjr    SS\\R                     S\\
   S\\
   S\\
   S	\\\4   4
S
 jjrSrU =r$ )InstructBlipVideoEncoderi+  a
  
Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is an
[`InstructBlipVideoEncoderLayer`].

Args:
    config (`InstructBlipVideoConfig`):
        The corresponding vision configuration for the `InstructBlipVideoEncoder`.
r)   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        g s  snf r   )
r/   r0   r)   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)r?   r)   rj   r@   s      rA   r0   !InstructBlipVideoEncoder.__init__5  sU    mmTYZ`ZrZrTs$tTsq%B6%JTs$tu&+# %u   A%r   r   output_hidden_statesreturn_dictrG   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSnUn[	        U R
                  5       H/  u  pU(       a  Xh4-   nU
" UUUS9nUS   nU(       d  M'  X{S   4-   nM1     U(       a  Xh4-   nU(       d  [        S XU4 5       5      $ [        XUS9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embedded representation of the inputs. Should be float, not int tokens.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
N )r   r   r   r"   c              3   ,   #    U H  oc  M  Uv   M     g 7fr   r   .0vs     rA   	<genexpr>3InstructBlipVideoEncoder.forward.<locals>.<genexpr>t  s     e$Sq$Ss   	)last_hidden_stater   
attentions)r)   r   r   use_return_dict	enumerater   r   r   )r?   inputs_embedsr   r   r   r   encoder_statesall_attentionsr   idxencoder_layerlayer_outputss               rA   rn    InstructBlipVideoEncoder.forward;  s    8 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8C#!/2B!B)-"3M *!,M  !/3C2E!E #9  +.>>Ne]N$Seee+Vd
 	
rC   )r)   r   r   )NNNN)rq   rr   rs   rt   r   r#   r0   r   r6   ru   rx   r   r   r   rn   ry   rz   r{   s   @rA   r   r   +  s~    ,6 , 26,0/3&*<
 !.<
 $D>	<

 'tn<
 d^<
 
uo%	&<
 <
rC   r   c                   ^   ^  \ rS rSrS
U 4S jjrS rS rS rS rS r	     SS jr
S	rU =r$ )*InstructBlipVideoQFormerMultiHeadAttentioniz  c                   > [         TU ]  5         Xl        UR                  UR                  -  S:w  a5  [        US5      (       d$  [        SUR                  UR                  4-  5      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        U(       aa  [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        O`[        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                   " UR"                  5      U l        ['        USS5      U l        U R(                  S:X  d  U R(                  S:X  aG  UR*                  U l        [        R,                  " SUR*                  -  S	-
  U R                  5      U l        S
U l        g )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)position_embedding_typeabsoluterelative_keyrelative_key_queryr   r"   F)r/   r0   r)   r1   r   hasattrr   rv   attention_head_sizeall_head_sizer   r   r}   encoder_hidden_sizer~   r   Dropoutattention_probs_dropout_probr   getattrr  max_position_embeddings	Embeddingdistance_embeddingsave_attentionr?   r)   is_cross_attentionr@   s      rA   r0   3InstructBlipVideoQFormerMultiHeadAttention.__init__{  s    : ::a?PVXhHiHi^%%v'A'ABC 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
yy!;!;T=O=OPDH6#=#=t?Q?QRDJyy!3!3T5G5GHDH6#5#5t7I7IJDJzz&"E"EF'.v7PR\']$''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD##rC   c                     Xl         g r   attn_gradients)r?   r  s     rA   save_attn_gradients>InstructBlipVideoQFormerMultiHeadAttention.save_attn_gradients  s    ,rC   c                     U R                   $ r   r  r?   s    rA   get_attn_gradients=InstructBlipVideoQFormerMultiHeadAttention.get_attn_gradients  s    """rC   c                     Xl         g r   attention_map)r?   r   s     rA   save_attention_map=InstructBlipVideoQFormerMultiHeadAttention.save_attention_map  s    *rC   c                     U R                   $ r   r  r  s    rA   get_attention_map<InstructBlipVideoQFormerMultiHeadAttention.get_attention_map  s    !!!rC   c                     UR                  5       S S U R                  U R                  4-   nUR                  " U6 nUR	                  SSSS5      $ )NrI   r   r   r"   r	   )rK   r   r  rW   rT   )r?   xnew_x_shapes      rA   transpose_for_scores?InstructBlipVideoQFormerMultiHeadAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFFK yyAq!$$rC   c                    US LnU(       aC  U R                  U R                  U5      5      nU R                  U R                  U5      5      n	UnO@U R                  U R                  U5      5      nU R                  U R                  U5      5      n	U R                  U5      n
U R                  U
5      n[        R
                  " XR                  SS5      5      nU R                  S:X  d  U R                  S:X  GaC  UR                  5       S   n[        R                  " U[        R                  UR                  S9R                  SS5      n[        R                  " U[        R                  UR                  S9R                  SS5      nX-
  nU R                  UU R                  -   S-
  5      nUR                  UR                   S9nU R                  S:X  a  [        R"                  " SUU5      nUU-   nOHU R                  S:X  a8  [        R"                  " SUU5      n[        R"                  " S	UU5      nUU-   U-   nU[$        R&                  " U R(                  5      -  nUR                   nUb  X-   n[*        R,                  " SS
9" U5      R                  U5      nU(       a=  U R.                  (       a,  U R1                  U5        UR3                  U R4                  5        U R7                  U5      nUb  UU-  n[        R
                  " UU	5      nUR9                  SSSS5      R;                  5       nUR                  5       S S U R<                  4-   nUR                  " U6 nU(       a  UU4nU$ U4nU$ )NrI   r   r  r  r"   rc   devicerb   zbhld,lrd->bhlrzbhrd,lrd->bhlrrN   r   r   r	   )r)  r~   r   r}   r6   r   rg   r  rK   arangelongr-  rW   r  r  re   rc   einsummathsqrtr  r   Softmaxr  r!  register_hookr  r   rT   r   r	  )r?   r   r   r   encoder_hidden_statesencoder_attention_maskr   r  	key_layervalue_layermixed_query_layerquery_layerattention_scores
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shaper   s                              rA   rn   2InstructBlipVideoQFormerMultiHeadAttention.forward  s.    3$>11$((;P2QRI33DJJ?T4UVK3N11$((=2IJI33DJJ}4MNK JJ}5//0AB !<<5H5HR5PQ''>9T=Y=Y]q=q&++-a0J"\\*EJJ}OcOcdiijlnopN"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ!1!7!7%/@ **,-=>AABXY$"5"5##O4))$*B*BC #',,"?  &=	&I#%<kJ%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CD6G=/2 O\M]rC   )r	  r  r   r  r)   r  r   r~   r  r   r  r}   r  r   rp   NNNNF)rq   rr   rs   rt   r0   r  r  r!  r$  r)  rn   ry   rz   r{   s   @rA   r   r   z  s>    $8-#+"% "#K KrC   r   c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )"InstructBlipVideoQFormerSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g r   )r/   r0   r   r   r1   denser   r   r  hidden_dropout_probr   r>   s     rA   r0   +InstructBlipVideoQFormerSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rC   r   input_tensorrG   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rN  r   r   r?   r   rQ  s      rA   rn   *InstructBlipVideoQFormerSelfOutput.forward  5    

=1]3}'CDrC   r   rN  r   r   r{   s   @rA   rL  rL    6    >U\\  RWR^R^  rC   rL  c                     ^  \ rS rSrSU 4S jjrS r     SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
   S
\\R                     4S jjrSrU =r$ )!InstructBlipVideoQFormerAttentioni  c                    > [         TU ]  5         [        X5      U l        [	        U5      U l        [        5       U l        g r   )r/   r0   r   	attentionrL  outputsetpruned_headsr  s      rA   r0   *InstructBlipVideoQFormerAttention.__init__  s0    CF_8@ErC   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r"   rN   )lenr   r\  r   r  r_  r   r}   r~   r   r]  rN  r	  union)r?   headsindexs      rA   prune_heads-InstructBlipVideoQFormerAttention.prune_heads  s   u:?7>>55t~~7Y7Y[_[l[l

  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:rC   r   r   r   r5  r6  r   rG   c           	      j    U R                  UUUUUUS9nU R                  US   U5      nU4USS  -   n	U	$ )N)r   r   r   r5  r6  r   r   r"   )r\  r]  )
r?   r   r   r   r5  r6  r   self_outputsattention_outputr   s
             rA   rn   )InstructBlipVideoQFormerAttention.forward  sY     ~~')"7#9/ & 
  ;;|AF#%QR(88rC   )r\  r]  r_  rp   rJ  )rq   rr   rs   rt   r0   rf  r6   ru   r   rw   rx   r   rn   ry   rz   r{   s   @rA   rZ  rZ    s    ";* 7;15=A>B,1|| !!2!23 E--.	
  ((9(9: !)):): ; $D> 
u||	 rC   rZ  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )$InstructBlipVideoQFormerIntermediatei3  c                   > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        g UR                  U l        g r   )r/   r0   r   r   r1   r   rN  
isinstancer   strr
   intermediate_act_fnr>   s     rA   r0   -InstructBlipVideoQFormerIntermediate.__init__4  s`    YYv1163K3KL
f''--'-f.?.?'@D$'-'8'8D$rC   r   rG   c                 J    U R                  U5      nU R                  U5      nU$ r   rN  rq  r   s     rA   rn   ,InstructBlipVideoQFormerIntermediate.forward<  s&    

=100?rC   rt  r   r{   s   @rA   rm  rm  3  s(    9U\\ ell  rC   rm  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )InstructBlipVideoQFormerOutputiB  c                 (  > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        g r   )r/   r0   r   r   r   r1   rN  r   r   r  rO  r   r>   s     rA   r0   'InstructBlipVideoQFormerOutput.__init__C  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rC   r   rQ  rG   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   rS  rT  s      rA   rn   &InstructBlipVideoQFormerOutput.forwardI  rV  rC   rW  r   r{   s   @rA   rw  rw  B  rX  rC   rw  c                   J   ^  \ rS rSrU 4S jr      SS jrS rS rSrU =r	$ )InstructBlipVideoQFormerLayeriP  c                 ^  > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        X l        X!R                  -  S:X  a  [	        USS9U l        SU l	        OSU l	        [        U5      U l        [        U5      U l        [        U5      U l        [        U5      U l        g )Nr"   r   T)r  F)r/   r0   chunk_size_feed_forwardseq_len_dimrZ  r\  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionrm  intermediaterw  r]  intermediate_queryoutput_queryr?   r)   r  r@   s      rA   r0   &InstructBlipVideoQFormerLayer.__init__Q  s    '-'E'E$:6B"7771<"CF_c"dD'+D$',D$@H4V<"Fv"N:6BrC   c           
         U R                  UUUUS9nUS   n	USS  n
US:  a  U	S S 2S U2S S 24   nU R                  (       a.  Uc  [        S5      eU R                  UUUUUUS9nUS   nXSS  -   n
[	        U R
                  U R                  U R                  U5      nU	R                  S   U:  ag  [	        U R                  U R                  U R                  U	S S 2US 2S S 24   5      R                  UR                  5      n[        R                  " X/SS9nO,[	        U R                  U R                  U R                  U	5      nU4U
-   n
U
$ )N)r   r   r   r   r"   z>encoder_hidden_states must be given for cross-attention layers)r   r   r5  r6  r   rN   )r\  r  r   r  r   feed_forward_chunk_queryr  r  rP   feed_forward_chunkre   r-  r6   rX   )r?   r   r   r   r5  r6  r   query_lengthself_attention_outputsrj  r   query_attention_outputcross_attention_outputslayer_outputlayer_output_texts                  rA   rn   %InstructBlipVideoQFormerLayer.forwarde  s    "&)/	 "0 "
 2!4(,!%5a,6I%J"''(0$%eff*.*=*=*#1'*?+A&7 +> +' *A)C&!AB$??4--,,  &	L  %%a(<7$=++00$$$Qq%89	%
 "\(() "  %yy,)JPQR4'',,   	L  /G+rC   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r]  r?   rj  intermediate_outputr  s       rA   r  0InstructBlipVideoQFormerLayer.feed_forward_chunk  s)    "//0@A{{#6IrC   c                 J    U R                  U5      nU R                  X!5      nU$ r   )r  r  r  s       rA   r  6InstructBlipVideoQFormerLayer.feed_forward_chunk_query  s+    "556FG(()<OrC   )
r\  r  r  r  r  r  r  r]  r  r  )NNNNFr   )
rq   rr   rs   rt   r0   rn   r  r  ry   rz   r{   s   @rA   r}  r}  P  s2    C. "#=~
 rC   r}  c                   B   ^  \ rS rSrU 4S jr        SS jrSrU =r$ )InstructBlipVideoQFormerEncoderi  c           	         > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        SU l	        g s  snf r   )
r/   r0   r)   r   r   r   r   r}  layerr   r  s      rA   r0   (InstructBlipVideoQFormerEncoder.__init__  sY    ]]OTU[UmUmOnoOn)*6=Ono

 ',# pr   c
                    U(       a  SOS n
U(       a  SOS nU(       a  SOS n[        U R                  R                  5       Hn  nU R                  U   nU(       a  X4-   n
Ub  X=   OS nU" UUUUUUU	S9nUS   nU(       d  MA  UUS   4-   nU	S:  d  MR  UR                  (       d  Me  UUS   4-   nMp     U(       a  X4-   n
U(       d  [        S UU
UU4 5       5      $ [        UU
UUS9$ )Nr   )r6  r   r  r   r"   r   c              3   .   #    U H  nUc  M  Uv   M     g 7fr   r   r   s     rA   r   :InstructBlipVideoQFormerEncoder.forward.<locals>.<genexpr>  s"      	A  s   	)r   r   r   cross_attentions)r   r)   r   r  r  r   r   )r?   r   r   r   r5  r6  r   r   r   r  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_maskr   s                    rA   rn   'InstructBlipVideoQFormerEncoder.forward  s)    #7BD$5b4%6rDt{{445A::a=L#$58H$H!.7.CilO(%'="3)M *!,M  &9]1=M<O&O#!#(H(H(H+?=QRCSBU+U(+ 6.   14D D 	 "%'(		 	 	 9++*1	
 	
rC   )r)   r   r  )NNNNFFTr   )rq   rr   rs   rt   r0   rn   ry   rz   r{   s   @rA   r  r    s,    , "#":
 :
rC   r  c                   >   ^  \ rS rSrSrU 4S jr    SS jrSrU =r$ )"InstructBlipVideoQFormerEmbeddingsi  z;Construct the embeddings from word and position embeddings.c                 F  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        [        R                  " UR                  UR
                  5      U l	        [        R                  " UR
                  UR                  S9U l        [        R                  " UR                  5      U l        U R!                  S["        R$                  " UR                  5      R'                  S5      SS9  [)        USS5      U l        Xl        g )	N)padding_idxr   position_ids)r"   rI   F)
persistentr  r  )r/   r0   r   r  
vocab_sizer1   pad_token_idword_embeddingsr  position_embeddingsr   r   	layernormr  rO  r   register_bufferr6   r.  rh   r  r  r)   r>   s     rA   r0   +InstructBlipVideoQFormerEmbeddings.__init__  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$rC   c                    Ub  UR                  5       S   nOSnUc%  U R                  S S 2XEU-   24   R                  5       nUbi  U R                  U5      nU R                  S:X  a.  U R                  UR                  UR                  5      5      nXg-   nUb  [        R                  " X64SS9nOUnUR                  U R                  R                  R                  5      nU R                  U5      nU R                  U5      nU$ )Nr"   r   r  rN   )rK   r  cloner  r  r  re   r-  r6   rX   r  rd   rc   r   )r?   	input_idsr  query_embedspast_key_values_lengthr<  rD   r  s           rA   rn   *InstructBlipVideoQFormerEmbeddings.forward  s      ")!,JJ,,Q0FVlIl0l-lmssuL --i8J++z9&*&>&>|zO`O`?a&b#'=
'"YY'AqI
%J]]4>>#8#8#>#>?
^^J/
\\*-
rC   )r)   r   r  r  r  r  )NNNr   )	rq   rr   rs   rt   r   r0   rn   ry   rz   r{   s   @rA   r  r    s#    E$   rC   r  c                   J    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSr/ SQrS rSrg)	 InstructBlipVideoPreTrainedModeli(  r)   blipT)r  r   r   rL  c                    U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       a%  UR                  R                  R                  SUS9  g[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR                  R                  R                  S5        g[        U[        5      (       aS  [        R                  R!                  UR"                  SUS9  [        R                  R!                  UR$                  SUS9  g[        U[&        [(        45      (       a%  UR*                  R                  R                  5         gg)zInitialize the weightsr   )meanstdN      ?)r)   initializer_rangero  r   r   r9   rd   datanormal_r   zero_r  r   fill_r'   inittrunc_normal_r=   r8   )InstructBlipVideoForConditionalGenerationInstructBlipVideoModelquery_tokens)r?   r|   factors      rA   _init_weights.InstructBlipVideoPreTrainedModel._init_weights;  sX   ..fryy"))455MM&&CV&<{{&  &&( '--MM&&CV&<--KK""$MM$$S) ABBGG!!&";";#6!RGG!!&"8"8s!O!JLb cdd$$**, erC   r   N)rq   rr   rs   rt   r#   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesr  ry   r   rC   rA   r  r  (  s>    ##&*#"&N!-rC   r  c                      ^  \ rS rSr% Sr\\S'   S\4U 4S jjr\     SS\	\
R                     S\	\   S\	\   S\	\   S\S	\\\4   4S
 jj5       rS rSrU =r$ )InstructBlipVideoVisionModeliO  r`   r)   c                    > [         TU ]  U5        Xl        UR                  n[	        U5      U l        [        U5      U l        [        R                  " X!R                  S9U l        U R                  5         g r   )r/   r0   r)   r1   r'   rD   r   encoderr   r   r   post_layernorm	post_init)r?   r)   r2   r@   s      rA   r0   %InstructBlipVideoVisionModel.__init__S  sY     &&	;FC/7 ll9:O:OPrC   r   r   r   r^   rG   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUc  [	        S5      eU R                  XS9nU R                  UUUUS9nUS   nU R                  U5      nUS S 2SS S 24   n	U R                  U	5      n	U(       d	  X4USS  -   $ [        UU	UR                  UR                  S9$ )Nz You have to specify pixel_values)r^   )r   r   r   r   r   r"   )r   pooler_outputr   r   )r)   r   r   r   r   rD   r  r  r   r   r   )
r?   r`   r   r   r   r^   r   encoder_outputsr   pooled_outputs
             rA   rn   $InstructBlipVideoVisionModel.forward^  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@h,,'/!5#	 ' 
 ,A. //0AB)!Q'2++M:%58KKK)/')77&11	
 	
rC   c                     U R                   $ r   )rD   r  s    rA   get_input_embeddings1InstructBlipVideoVisionModel.get_input_embeddings  s    rC   )r)   rD   r  r  rJ  )rq   rr   rs   rt   main_input_namer%   r  r0   r   r   r6   rw   rx   r   r   r   rn   r  ry   rz   r{   s   @rA   r  r  O  s    $O))	< 	  59,0/3&*).(
u001(
 $D>(
 'tn	(

 d^(
 #'(
 
u00	1(
 (
T rC   r  c                     ^  \ rS rSrSrSrSrSrSrS\	4U 4S jjr
S rS rS r SS	\R                  S
\\   S\R$                  S\S\R                  4
S jjr         SS\R*                  S	\\R.                     S\\R*                     S\\R                     S\\R.                     S\\R.                     S\\R.                     S\\   S\\   S\\   S\\\R.                     \4   4S jjrSrU =r$ )InstructBlipVideoQFormerModeli  z
Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
instruction as input.
Fr)   c                    > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        U R                  5         g r   )r/   r0   r)   r  rD   r  r  r  r>   s     rA   r0   &InstructBlipVideoQFormerModel.__init__  s7     <VD6v>rC   c                 .    U R                   R                  $ r   rD   r  r  s    rA   r  2InstructBlipVideoQFormerModel.get_input_embeddings  s    ...rC   c                 $    XR                   l        g r   r  r?   r   s     rA   set_input_embeddings2InstructBlipVideoQFormerModel.set_input_embeddings  s    */'rC   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the base
class `PreTrainedModel`.
N)itemsr  r  r\  rf  )r?   heads_to_pruner  rd  s       rA   _prune_heads*InstructBlipVideoQFormerModel._prune_heads  s<    
 +002LELLu%//;;EB 3rC   r   input_shaper-  	has_queryrG   c                    UR                  5       S:X  a  USS2SSS2SS24   nO>UR                  5       S:X  a  USS2SSSS24   nO[        SU SUR                   S35      eUR                  U R                  S9nSU-
  S	-  nU$ )
a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    attention_mask (`torch.Tensor`):
        Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
    input_shape (`tuple[int]`):
        The shape of the input to the model.
    device: (`torch.device`):
        The device of the input to the model.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
r	   Nr   z!Wrong shape for input_ids (shape z) or attention_mask (shape )rb   r  g     )rO   r   rP   re   rc   )r?   r   r  r-  r  extended_attention_masks         rA   get_extended_attention_mask9InstructBlipVideoQFormerModel.get_extended_attention_mask  s    . 1$&4Qa]&C#!Q& '5QdA5E&F#3K=@[\j\p\p[qqrs  #:"<"<4::"<"N#&)@#@H"L&&rC   r  r  r  r   r5  r6  r   r   r   c                 V   Ub  UOU R                   R                  nU	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Uc  Uc  [	        S5      eUb  UR
                  S   OSnU R                  UUUS9nUR                  5       SS nUu  pUR                  nUc  [        R                  " X4US9nU R                  X-U5      nUb  [        U[        5      (       a  US   R                  5       u  nnnOUR                  5       u  nnnUU4n[        U[        5      (       a!  U Vs/ sH  nU R                  U5      PM     nnO>Uc'  [        R                  " UUS9nU R                  U5      nOU R                  U5      nOSnU R                  XPR                   R                   5      nU R#                  UUUUUUU	U
US9	nUS   nUSS2SSS24   nU
(       d
  UU4USS -   $ [%        UUUR&                  UR(                  UR*                  UR,                  S	9$ s  snf )
an  
encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
    the model is configured as a decoder.
encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
    the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
past_key_values (`Cache` of length `config.n_layers` with each tuple having 4 tensors of:
    shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and
    value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are
    used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key
    value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape
    `(batch_size, sequence_length)`.
use_cache (`bool`, *optional*):
    If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
    `past_key_values`).
Nz7You have to specify query_embeds when input_ids is Noner"   r   )r  r  r  rI   )r-  )r   r   r5  r6  r   r   r   r  )r   r  past_key_valuesr   r   r  )r)   r   r   r   r   rP   rD   rK   r-  r6   onesr  ro  listinvert_attention_maskget_head_maskr   r  r   r  r   r   r  )r?   r  r   r  r  r   r5  r6  r   r   r   r  embedding_outputr  ri   r<  r-  r  encoder_batch_sizeencoder_sequence_lengthrj   encoder_hidden_shapemaskencoder_extended_attention_maskr  sequence_outputr  s                              rA   rn   %InstructBlipVideoQFormerModel.forward  s   @ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]!5VWW0<0H|))!,a??%% + 
 '++-cr2!,
!((!"ZZ*)A6RN #'"B"B>`f"g !,/66AVWXAYA^A^A`>"$;QAVA[A[A]>"$;Q$68O#P 0$77`v2w`vX\43M3Md3S`v/2w/'/).4HQW)X&262L2LMc2d/262L2LMc2d/.2+ &&y++2O2OP	,,2"7#B/!5#% ' 

 *!,'1a0#]3oab6III;-'+;;)77&11,==
 	
C 3xs   7H&)r)   rD   r  rp   )	NNNNNNNNN)rq   rr   rs   rt   r   r  r  r  r  r$   r0   r  r  r  r6   ru   r   rv   r-  rx   r  
LongTensorr   rw   r   r   rn   ry   rz   r{   s   @rA   r  r    s   
 #( N= /0C  )')' 3Z)' 	)'
 )' 
)'\ 7;37/315=A>B,0/3&*n
##n
 !!2!23n
 u//0	n

 u||,n
 E--.n
  ((9(9:n
 !)):): ;n
 $D>n
 'tnn
 d^n
 
uU&&')UU	Vn
 n
rC   r  zV
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    )custom_introc                      \ rS rSr% SrSr\\\R                        \
S'   Sr\\\R                        \
S'   Sr\\R                     \
S'   Sr\\\R                        \
S'   Sr\\\R                        \
S'   S	\\   4S
 jrSrg)4InstructBlipVideoForConditionalGenerationModelOutputiL  a~  
loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
    Language modeling loss from the language model.
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head of the language model.
vision_outputs (`BaseModelOutputWithPooling`):
    Outputs of the vision encoder.
qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
    Outputs of the Q-Former (Querying Transformer).
language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
    Outputs of the language model.
Nlosslogitsvision_outputsqformer_outputslanguage_model_outputsrG   c                 J   ^  [        U 4S jT R                  5        5       5      $ )Nc              3   l   >#    U H*  nUS ;  a  TU   O[        TU5      R                  5       v   M,     g7f)r  r  r  N)r  to_tuple)r   kr?   s     rA   r   PInstructBlipVideoForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>g  sC      
 ! WW Gq!**,- !s   14)r   keysr  s   `rA   r  =InstructBlipVideoForConditionalGenerationModelOutput.to_tuplef  s%     
 YY[	
 
 	
rC   r   )rq   rr   rs   rt   r   r  r   r   r6   rw   r  r  r  r  r  r   r  ry   r   rC   rA   r  r  L  s     04D(5**+
,315FHU5,,-.526NHU../6:>OXeE$5$567>AEHU5+<+<%=>E
%* 
rC   r  z`
    InstructBlipVideo base Model consisting of language model, qformer and vision encoder.
    c            #         ^  \ rS rSrSrS/rS\4U 4S jjrS rS r	S r
S	 rS
\R                  S\R                  4S jr\\           SS\R                  S\R                  S\\R                     S
\\R                     S\\R                     S\\R                     S\\R                     S\\R&                     S\\   S\\   S\\   S\S\\   S\\   S\\\4   4S jj5       5       rSrU =r$ )r  io  r`   r  r)   c                 4  > [         TU ]  U5        [        UR                  5      U l        [
        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        UR                  5      U l        [
        R                  " UR                  R                  UR                   R                  5      U l        [$        R&                  " UR                   5      U l        U R(                  R*                  b/  U R*                  R-                  U R(                  R*                  5        U R(                  R.                  b/  U R.                  R-                  U R(                  R.                  5        U R1                  5         g Nr"   )r/   r0   r  vision_configvision_modelr   r5   r6   r   num_query_tokensqformer_configr1   r  r  qformerr   text_configlanguage_projectionr   from_configlanguage_modelr  extend_keep_in_fp32_modulesr  r>   s     rA   r0   InstructBlipVideoModel.__init__x  s    89M9MNLLQ8O8OQWQfQfQrQr)st4V5J5JK#%99V-B-B-N-NPVPbPbPnPn#o '33F4F4FG00<""))$*=*=*O*OP44@&&--d.A.A.W.WX 	rC   c                 6    U R                   R                  5       $ r   r+  r  r  s    rA   r  +InstructBlipVideoModel.get_input_embeddings      ""7799rC   c                 :    U R                   R                  U5        g r   r+  r  r  s     rA   r  +InstructBlipVideoModel.set_input_embeddings      007rC   c                     U R                   R                  (       d_  U R                  R                  U R                  R                  l        U R                  R                  U R                  R                  l        g g r   r)   use_decoder_only_language_modelr+  sharedr  embed_tokensdecoderr  s    rA   _tie_weights#InstructBlipVideoModel._tie_weights  T    {{::7;7J7J7Q7QD''47;7J7J7Q7QD''4 ;rC   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggz
Some pre-processing hacks to make the model `accelerate` compatible. Check
https://github.com/huggingface/transformers/pull/21707 for more details.
r"   r+  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_maprb  r6   cudadevice_countr   warningr  r+  rB  io_same_devicer?   rD  s     rA   _preprocess_accelerate-InstructBlipVideoModel._preprocess_accelerate  |    
 **}!&6m&KPUPZPZPgPgPilmPmNNM 4&&
33:>D((7 4rC   r  r   c           	         Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  U5      R                  UR                  5      nU$ zI
Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
r,  rI   )r  r6   r   r)   image_token_idr/  r-  all	unsqueeze	expand_asre   r?   r  r   special_image_masks       rA   get_placeholder_mask+InstructBlipVideoModel.get_placeholder_mask       !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H/99"=GGVYYZgZnZno!!rC   qformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskr   r   r   r^   	use_cacher   rG   c                    Ub  UOU R                   R                  nUR                  u  nnnnnUR                  UU-  UUU5      nU R	                  UU	U
UUS9nUS   n[
        R                  " UR                  5       SS [
        R                  UR                  S9nU R                  R                  UR                  S   SS5      n[
        R                  " UR                  5       SS [
        R                  UR                  S9nUc  [
        R                  " U5      nUR                  USS9nUR                  USS9n[
        R                  " UU/SS9nU R                  UUUUUU	U
US9nUS   SS2SUR                  S5      2SS24   nU R!                  U5      nUR                  XR                   R"                  U-  S5      nUcR  U R$                  R'                  5       " U5      nX@R                   R(                  :H  nUc  [
        R                  " U5      nOiXR'                  5       " [
        R*                  " U R                   R(                  [
        R                  UR                  S95      :H  nUR-                  S5      nUR/                  S5      R1                  U5      R3                  UR                  5      nUR3                  UR                  UR4                  5      nUR7                  UU5      nU R                   R8                  (       a  U R$                  " SUUU	U
UUS	.UD6nOU R$                  " SUUUUU	U
UUS
.UD6n[;        UUUS9$ )a  
qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
    to serve as text prompt, which the Q-Former model will encode.

    Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.

    [What are attention masks?](../glossary#attention-mask)
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be
    provided to serve as text prompt, which the language model can continue.

    Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
    details.

    [What are input IDs?](../glossary#input-ids)
decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
    be used by default.

    Only relevant in case an encoder-decoder language model (like T5) is used.
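Example (an illustrative sketch rather than an official snippet: reusing this checkpoint for the base model class and the dummy 4-frame clip are assumptions):

```python
>>> import numpy as np
>>> import torch
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoModel

>>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
>>> model = InstructBlipVideoModel.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> clip = np.zeros((4, 224, 224, 3), dtype=np.uint8)  # dummy 4-frame video
>>> inputs = processor(text="What is happening in the video?", images=clip, return_tensors="pt")
>>> with torch.no_grad():
...     outputs = model(**inputs)
```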
N)r`   r   r   r   r^   r   rI   r,  rN   r"   )r  r   r  r5  r6  r   r   r   r   r   r   r   r   r\  )r   r   rZ  r[  r   r   r   r\  r  r   )r)   r   rP   rS   r$  r6   r  rK   r/  r-  r  rh   	ones_likerepeat_interleaverX   r'  r)  r%  r+  r  video_token_idr   rP  rQ  rR  re   rc   masked_scatterr9  r  )r?   r`   rX  rY  r  r   rZ  r[  r   r   r   r   r^   r\  r   ri   frameschannelrE   rF   r  image_embedsimage_attention_maskr  query_attention_maskquery_outputsquery_outputlanguage_model_inputsrT  r   s                                 rA   rn   InstructBlipVideoModel.forward  sw   b &1%<k$++B]B] 6B5G5G2
FGVU#++J,?&RWX**%/!5#%= + 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a'1%".#7/!5# % 	
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j++JfJfioJoqs t  //DDFyQM!*kk.H.H!H%!&!;!.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;/99"=GGVYYZgZnZno 5 8 89M9M}ObOb c%445GI^_;;66)) +-"3%9'# G )) 
+-"3'="3%9'#
 
G D))#*
 	
rC   r+  r)  r'  r  r$  )NNNNNNNNNFN)rq   rr   rs   rt   r  r-  r#   r0   r  r  r=  rJ  r6   r  rw   rU  r   r   r   ru   rx   r   r   r   r   r  rn   ry   rz   r{   s   @rA   r  r  o  s    %O+,6 &:8R
?("e.>.> "uO`O` " 
 >B15598<=A04,0/3&*).$(G
''G
 !,,G
 !))9)9 :	G

 E--.G
 !!1!12G
 $E$4$45G
 !))9)9 :G
  -G
 $D>G
 'tnG
 d^G
 #'G
 D>G
 -.G
  
uJJ	K!G
  G
rC   r  a  
    InstructBlipVideo Model for generating text given a video and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
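    The three stages compose roughly as sketched below; the hidden sizes are illustrative placeholders rather than values
    read from a real config, and this is not the module's actual forward code.

    ```python
    import torch

    batch, frames, num_query_tokens = 1, 4, 32
    vision_hidden, qformer_hidden, lm_hidden = 1408, 768, 4096  # illustrative sizes

    # 1) vision encoder: one sequence of patch features per frame
    frame_features = torch.randn(batch * frames, 257, vision_hidden)
    # 2) Q-Former: a fixed number of learned query tokens attend to each frame
    query_output = torch.randn(batch * frames, num_query_tokens, qformer_hidden)
    # 3) linear projection into the language model's embedding space
    language_inputs = torch.nn.Linear(qformer_hidden, lm_hidden)(query_output)
    # frames are concatenated along the sequence axis before being handed to the language model
    language_inputs = language_inputs.view(batch, frames * num_query_tokens, lm_hidden)
    print(language_inputs.shape)  # torch.Size([1, 128, 4096])
    ```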
    c            %       H  ^  \ rS rSr% \\S'   SrSrS/rS\4U 4S jjr	S r
S rS	 rS
\R                  4S jrS rS rS rS r   S$S\R*                  S\R,                  S\\R,                     S\\   S\\   4
S jjrS\R,                  S\R*                  4S jr\\            S%S\R*                  S\R*                  S\\R,                     S\\R*                     S\\R,                     S\\R,                     S\\R,                     S\\R*                     S\\   S\\   S\\R,                     S\\   S\S\\   S\\   S
\\ \!4   4 S  jj5       5       r"\RF                  " 5             S&S\R*                  S\\R,                     S\\R,                     S\\R,                     S\\R,                     S\\R*                     S\S
\R,                  4S! jj5       r$   S$S\R*                  S\R,                  S\\R,                     S\\   S\\   4
S" jjr%S#r&U =r'$ )'r  iE  r)   r`   Tr  c                 r  > [         TU ]  U5        [        R                  UR                  5      U l        [        R                  " [        R                  " SUR                  UR                  R                  5      5      U l        [        R                  UR                  5      U l        [        R                   " UR                  R                  UR"                  R                  5      U l        UR&                  (       a!  [(        R*                  " UR"                  5      nO [,        R*                  " UR"                  5      nUR.                  b%  U R.                  R1                  UR.                  5        UR2                  b%  U R2                  R1                  UR2                  5        X l        U R7                  5         g r"  )r/   r0   r  _from_configr#  r$  r   r5   r6   r   r%  r&  r1   r  r  r'  r   r(  r)  r9  r    r*  r!   r  r,  r-  r+  r  )r?   r)   r+  r@   s      rA   r0   2InstructBlipVideoForConditionalGeneration.__init__U  s1    8EEfFZFZ[LLQ8O8OQWQfQfQrQr)st4AA&BWBWX#%99V-B-B-N-NPVPbPbPnPn#o 111==f>P>PQN2>>v?Q?QRN++7"")).*J*JK//;&&--n.R.RS, 	rC   c                 6    U R                   R                  5       $ r   r0  r  s    rA   r  >InstructBlipVideoForConditionalGeneration.get_input_embeddingso  r2  rC   c                 :    U R                   R                  U5        g r   r4  r  s     rA   r  >InstructBlipVideoForConditionalGeneration.set_input_embeddingsr  r6  rC   c                 :    U R                   R                  U5        g r   )r+  set_output_embeddings)r?   new_embeddingss     rA   rv  ?InstructBlipVideoForConditionalGeneration.set_output_embeddingsu  s    11.ArC   rG   c                 6    U R                   R                  5       $ r   )r+  get_output_embeddingsr  s    rA   rz  ?InstructBlipVideoForConditionalGeneration.get_output_embeddingsx  s    ""88::rC   c                 6    U R                   R                  5       $ r   )r+  get_encoderr  s    rA   r}  5InstructBlipVideoForConditionalGeneration.get_encoder{      ""..00rC   c                 6    U R                   R                  5       $ r   )r+  get_decoderr  s    rA   r  5InstructBlipVideoForConditionalGeneration.get_decoder~  r  rC   c                     U R                   R                  (       d_  U R                  R                  U R                  R                  l        U R                  R                  U R                  R                  l        g g r   r8  r  s    rA   r=  6InstructBlipVideoForConditionalGeneration._tie_weights  r?  rC   c                 "   U R                   n[        U5      S:  a=  SU;  a7  [        R                  R	                  5       S:  a  [
        R                  S5        [        U R                  S5      (       a  SU R                  R                  l
        ggrA  rC  rI  s     rA   rJ  @InstructBlipVideoForConditionalGeneration._preprocess_accelerate  rL  rC   rX  rY  r^   r   c                     g)
Encodes images into continuous embeddings that can be forwarded to the language model.

Args:
    pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        The tensors corresponding to the input images.
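For video inputs, the 5-D pixel tensor documented for `generate` below is flattened so each frame passes through the vision encoder independently. A minimal sketch with made-up sizes:

```python
import torch

pixel_values = torch.randn(2, 4, 3, 224, 224)   # (batch, frames, channels, height, width)
b, f, c, h, w = pixel_values.shape
flat = pixel_values.reshape(b * f, c, h, w)     # (8, 3, 224, 224): per-frame input to the vision tower
print(flat.shape)
```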
Nr   )r?   r`   rX  rY  r^   r   s         rA   get_image_features<InstructBlipVideoForConditionalGeneration.get_image_features  s     	rC   r  r   c           	         Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  S5      R                  U5      R                  UR                  5      nU$ rN  )r  r6   r   r)   ra  r/  r-  rP  rQ  rR  re   rS  s       rA   rU  >InstructBlipVideoForConditionalGeneration.get_placeholder_mask  rW  rC   r   rZ  r[  r   r   labelsr\  r   c                 x   Ub  UOU R                   R                  nU R                  UUUUSS9u  nnnU(       d  UR                  5       OUnU(       d  UR                  5       OUnUc  U R	                  5       " U5      nUc  [
        R                  " U5      nUR                  UR                  UR                  5      nU R                  XHS9nUR                  UU5      nU R                   R                  (       aj  U R                  " SUUU	U
UUS.UD6nU(       a  UR                  OUS   nSnUb3  U R                  " SUXR                   R                   R"                  S.UD6nOLU R                  " SUUUUU	U
UUUS.	UD6nU(       a  UR$                  OUS   nU(       a  UR                  OUS	   n['        UUUUUS
9$ )a	  
qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
    The sequence used as a prompt to be fed to the Q-Former module.
qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
    Mask to avoid performing attention on padding token indices.

Examples:

```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
>>> from huggingface_hub import hf_hub_download
>>> import av
>>> import numpy as np

>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

>>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
>>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> file_path = hf_hub_download(
...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample uniformly 4 frames from the video
>>> total_frames = container.streams.video[0].frames
>>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
>>> clip = read_video_pyav(container, indices)

>>> prompt = "What is happening in the video?"
>>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

>>> outputs = model.generate(
...     **inputs,
...     do_sample=False,
...     num_beams=5,
...     max_length=256,
...     repetition_penalty=1.5,
...     length_penalty=1.0,
... )
>>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
>>> print(generated_text)
"A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
```NTrX  rY  r^   r   r   r^  r   )r  r  r  )	r   r   rZ  r[  r   r   r   r  r\  r"   )r  r  r  r  r  r   )r)   r   get_video_featuresr  r  r6   r_  re   r-  rc   rU  rb  r9  r+  r  loss_functionr(  r  r  r  )r?   r`   rX  rY  r  r   rZ  r[  r   r   r   r  r   r^   r\  r   rj  r  rh  rT  r   r  r  s                          rA   rn   1InstructBlipVideoForConditionalGeneration.forward  s   f &1%<k$++B]B]?C?V?V/#9%= @W @
<~} ;F002>8C..0  557	BM!"__Y7N 5 8 89M9M}ObOb c!66y6^%445GI^_;;66)) +-"3%9'# G (3W^^
FD!)) !&[[=T=T=_=_ci
 )) +-"3'="3%9'# G $/7<<GAJD'2W^^
FC))#*
 	
rC   c                 J   [        U S5      (       a  U R                  5         UR                  S   n	U R                  UUUUSS9u  pnUc  Uc  U R                  R
                  /U R                  R                  -  S-  nXR                  R                  R                  /-   n[        R                  " U/[        R                  UR                  S9nUR                  U	S5      nU R                  5       " U5      nUc  [        R                  " U5      nU
R!                  UR                  UR"                  5      n
U R%                  XFS9nUR'                  X5      nXeS	.nU R(                  R                  R*                  (       d  UUS
'   U R(                  R,                  " S0 UDUD6nU$ )aA  
Overrides `generate` function to be able to use the model as a conditional generator.

Args:
    pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
        (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
    qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt to be fed to the Q-Former module.
    qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt for the generation.
    attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embedded representation of the inputs. Should be float, not int tokens.
    interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the positional encoding of the image embeddings.

Returns:
    captions (list): A list of strings of length batch_size * num_captions.
rD  r   Tr  r   r,  r"   r  )r   r   r  r   )r  rJ  rP   r  r)   video_token_indexr%  r(  bos_token_idr6   r   r/  r-  repeatr  r_  re   rc   rU  rb  r+  is_encoder_decodergenerate)r?   r`   rX  rY  r  r   r   r^   generate_kwargsri   rj  r  rh  video_tokensstart_tokensrT  inputsr   s                     rA   r  2InstructBlipVideoForConditionalGeneration.generateL  s   D 4))'')!''*
?C?V?V/#9%= @W @
<}    $ = =>A]A]]`aa+{{/F/F/S/S.TT!LL,uzzR^ReRef	%,,Z;	 557	BM!"__Y7N 5 8 89M9M}ObOb c!66y6^%445G_#0S""))<<"+F;%%..KK?KrC   c           	      ^   UR                   u  pgpn
UR                  Xg-  XU
5      nU R                  UUSS9nUS   n[        R                  " UR                  5       SS [        R                  UR                  S9nU R                  R                  UR                   S   SS5      n[        R                  " UR                  5       SS [        R                  UR                  S9nUc  [        R                  " U5      nUR                  USS9nUR                  USS9n[        R                  " X/SS9nU R                  UUUUUSS	9nUS   SS2SUR                  S5      2SS24   nU R                  U5      nUR                  X`R                  R                   U-  S5      nU(       a  UUU4$ U$ )
r  T)r`   r^   r   r   NrI   r,  rN   r"   )r  r   r  r5  r6  r   )rP   rS   r$  r6   r  rK   r/  r-  r  rh   r_  r`  rX   r'  r)  r)   r%  )r?   r`   rX  rY  r^   r   ri   rc  rd  rE   rF   r  re  rf  r  rg  rh  ri  rj  s                      rA   r  <InstructBlipVideoForConditionalGeneration.get_video_features  s   " 6B5G5G2
GU#++J,?RWX**%%= + 

 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@+Y_`!a'1%".#7 % 
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j++JfJfioJoqs t(.-GG$$rC   rl  )NFF)NNNNNNNNNNFN)NNNNNF)(rq   rr   rs   rt   r#   r  r  r  r-  r0   r  r  rv  r   Modulerz  r}  r  r=  rJ  r6   rw   r  r   rx   r  rU  r   r   r   r   r   r   r  rn   no_gradr  r  ry   rz   r{   s   @rA   r  r  E  s}    $#$O!+,6 4:8B;ryy ;11R
?0 >B38&+'' !++ !))9)9 :	
 #+4. d^""e.>.> "uO`O` " 
 >B15598<=A59,0/3-1&*).$(N
''N
 !,,N
 !))9)9 :	N

 E--.N
 !!1!12N
 $E$4$45N
 !))9)9 :N
   1 12N
 $D>N
 'tnN
 ))*N
 d^N
 #'N
 D>N
  +,!N
" 
uJJ	K#N
  N
` ]]_ 9==A045959).C''C $E$4$45C !))9)9 :	C
 E,,-C !!1!12C   1 12C #'C 
		C CR >B38&+9%''9% !++9% !))9)9 :	9%
 #+4.9% d^9% 9%rC   r  )r  r  r  r  r  )r   )Jr1  dataclassesr   typingr   r   r   r   r6   r   activationsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   autor   r    r!   configuration_instructblipvideor#   r$   r%   
get_loggerrq   r   r  r'   ru   floatr   r   r   r   r   r   rL  rZ  rm  rw  r}  r  r  r  r  r  r  r  r  __all__r   rC   rA   <module>r     so  ,  ! 1 1   ! ) B 9  G & l l j j I I  
		H	%G		 Gd %II%<<% 
% <<	%
 U\\*% % %.Q Qh299 -$> -`L
ryy L
^y yx ,		 ,^299 RYY \$> \~C
bii C
L0 0f #- #- #-L;#C ;||
$D |
~ 

; 
 
: 
N
= N

N
b }%0PRa }%}%@rC   