
import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
from ...utils.generic import OutputRecorder, check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from .configuration_instructblipvideo import (
    InstructBlipVideoConfig,
    InstructBlipVideoQFormerConfig,
    InstructBlipVideoVisionConfig,
)


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionEmbeddings(nn.Module):
    def __init__(self, config: InstructBlipVideoVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows interpolating the pre-trained position encodings so that the model can be used on
        higher-resolution images. It is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embedding.shape[1] - 1

        # always interpolate when tracing so the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding

        class_pos_embed = self.position_embedding[:, :1]
        patch_pos_embed = self.position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, embed_dim, grid, grid]
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)

        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)

        if interpolate_pos_encoding:
            position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
        else:
            position_embedding = self.position_embedding

        embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        return embeddings
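# --- Illustrative usage sketch (not part of the original module) ---------------------------------------------------
# A minimal doctest-style example of the embeddings layer above, using a deliberately tiny, made-up configuration so
# it runs quickly on CPU. The config values below are assumptions chosen only for illustration, not real checkpoint
# settings.
#
#     >>> import torch
#     >>> from transformers import InstructBlipVideoVisionConfig
#     >>> config = InstructBlipVideoVisionConfig(hidden_size=32, num_attention_heads=4, image_size=56, patch_size=14)
#     >>> embeddings = InstructBlipVideoVisionEmbeddings(config)
#     >>> embeddings(torch.randn(1, 3, 56, 56)).shape  # 1 class token + (56 // 14) ** 2 patch tokens
#     torch.Size([1, 17, 32])
#     >>> # Larger inputs require interpolating the pre-trained position encodings:
#     >>> embeddings(torch.randn(1, 3, 112, 112), interpolate_pos_encoding=True).shape
#     torch.Size([1, 65, 32])
# --------------------------------------------------------------------------------------------------------------------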
__module____qualname__r'   r1   r7   Tensorintr^   FloatTensorboolrm   __classcell__rA   s   @rB   r)   r)   7   sm    c< c$&D5<< &D &DUX &D]b]i]i &DPE$5$5 QU bgbnbn rC   r)   c                   B    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZg dZd Zy) InstructBlipVideoPreTrainedModelr*   blipT)"InstructBlipVideoQFormerEmbeddingsInstructBlipVideoAttention*InstructBlipVideoQFormerMultiHeadAttention"InstructBlipVideoQFormerSelfOutputc                    | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          yyt        |t        j                        r(|j                  j                  j                  d|       yt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t              rYt        j                  j!                  |j"                  d|       t        j                  j!                  |j$                  d|       yt        |t&        t(        f      r%|j*                  j                  j                          yy)zInitialize the weights        )meanstdN      ?)r*   initializer_range
isinstancer   Linearr:   rc   datanormal_biaszero_	Embedding	LayerNormfill_r)   inittrunc_normal_r>   r9   )InstructBlipVideoForConditionalGenerationInstructBlipVideoModelquery_tokens)r@   modulefactors      rB   _init_weightsz.InstructBlipVideoPreTrainedModel._init_weights   sI   ..fryy"))45MM&&CV&<{{&  &&( '-MM&&CV&<-KK""$MM$$S) ABGG!!&";";#6!RGG!!&"8"8s!O!JLb cd$$**, erC   N)ro   rp   rq   r%   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraph_no_split_modulesr    rC   rB   ry   ry      s>    ##&*#"&N!-rC   ry   r   querykeyvalueattention_maskscalingdropoutc                 p   t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |d      }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrI   rN   )ptrainingr$   r    )	r7   matmulrf   r   rU   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             rB   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$rC   c                        e Zd ZdZ fdZdej                  dedefdZ	 ddej                  de	ej                     d	e
ej                  e	ej                     e	e
ej                        f   fd
Z xZS )r|   z=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        d| _
        |j                  | _        t        j                  | j                  d| j                  z  d      | _        |j                  ret        j                   t#        j$                  | j                              }t        j                   t#        j$                  | j                              }nd }d }|Qt#        j&                  |t#        j(                  |d      |f      }t        j                   |      | j                  _        t        j                  | j                  | j                        | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fr	   )r   )requires_grad)r0   r1   r*   r2   r3   num_attention_heads	num_headshead_dim
ValueErrorscale	is_causalattention_dropoutr   r   qkvqkv_biasr6   r7   zerosrX   
zeros_liker   
projection)r@   r*   q_biasv_biasr   rA   s        rB   r1   z#InstructBlipVideoAttention.__init__   su   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!9 99T^^Q-?eL??\\%++dnn"=>F\\%++dnn"=>FFFyy&%*:*:6QV*WY_!`aHLL2DHHM))DNNDNNCrC   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr$   r    )rW   r   r   rf   r   )r@   r   r   r   s       rB   _shapez!InstructBlipVideoAttention._shape   s7    {{3GQQRSUVWbbddrC   hidden_states	head_maskrG   c                 6   |j                         \  }}}| j                  |      }|j                  ||d| j                  || j                  z        j	                  ddddd      }|d   |d   |d   }
}	}t
        }| j                  j                  dk7  rt        | j                  j                     } || ||	|
fd| j                  sdn| j                  | j                  d	|\  }}|j                  ||d
      j                         }| j                  |      }||fS )z#Input shape: Batch x Time x Channelr	   r    r   r$      eagerNr   )r   r   r   rI   )rK   r   rS   r   rT   r   r*   _attn_implementationr   r   r   r   r   r   )r@   r   r   r   r   tgt_lenr3   	mixed_qkvquery_states
key_statesvalue_statesattention_interfacer   r   s                 rB   rm   z"InstructBlipVideoAttention.forward   s-    #0"4"4"6WiHH]+	%%c7At~~yTXTbTbGbckkq!Q
	 2;1y|YWX\,j(?;;++w6"9$++:Z:Z"[$7		%

  #}}C$2H2HJJ	%
 	%
!\ "))#w;FFHook2L((rC   N)ro   rp   rq   __doc__r1   r7   rr   rs   r   r   tuplerm   rv   rw   s   @rB   r|   r|      s    GD>eU\\ eC ec e -1$)||$) ELL)$)
 
u||Xell3XeELL>Q5RR	S$)rC   r|   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )InstructBlipVideoMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )r0   r1   r*   r
   
hidden_actactivation_fnr   r   r2   intermediate_sizefc1fc2r?   s     rB   r1   zInstructBlipVideoMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJrC   r   rG   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   r@   r   s     rB   rm   zInstructBlipVideoMLP.forward  s4    /**=9/rC   ro   rp   rq   r1   r7   rr   rm   rv   rw   s   @rB   r   r     s$    KU\\ ell rC   r   c            	            e Zd Zdef fdZedej                  dej                  dee	   dej                  fd       Z xZS )InstructBlipVideoEncoderLayerr*   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r0   r1   r2   r3   r|   	self_attnr   r   layer_norm_epslayer_norm1r   mlplayer_norm2r?   s     rB   r1   z&InstructBlipVideoEncoderLayer.__init__  sm    ++3F;<<F<Q<QR'/<<F<Q<QRrC   r   r   r   rG   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   r   )r   r   r   r   )r@   r   r   r   residualri   s         rB   rm   z%InstructBlipVideoEncoderLayer.forward$  s     !((7)4>> 
'$
 
q
 &0 ((7/%0rC   )ro   rp   rq   r%   r1   r   r7   rr   r   r   rt   rm   rv   rw   s   @rB   r   r     s`    S6 S ||  +,	
 
		 rC   r   c            
       t     e Zd ZdZdef fdZe	 ddeej                     de
e   deeef   fd       Z xZS )	InstructBlipVideoEncodera"  
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
    [`InstructBlipVideoEncoderLayer`].

    Args:
        config (`InstructBlipVideoConfig`):
            The corresponding vision configuration for the `InstructBlipVideoEncoder`.
    """

    def __init__(self, config: InstructBlipVideoConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([InstructBlipVideoEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutput]:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, attention_mask=attention_mask, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


class InstructBlipVideoVisionModel(InstructBlipVideoPreTrainedModel):
    main_input_name = "pixel_values"
    config: InstructBlipVideoVisionConfig
    _can_record_outputs = {
        "hidden_states": InstructBlipVideoEncoderLayer,
        "attentions": InstructBlipVideoAttention,
    }

    def __init__(self, config: InstructBlipVideoVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = InstructBlipVideoVisionEmbeddings(config)
        self.encoder = InstructBlipVideoEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(inputs_embeds=hidden_states, **kwargs)

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )

    def get_input_embeddings(self):
        return self.embeddings
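# --- Illustrative usage sketch (not part of the original module) ---------------------------------------------------
# The vision tower can be exercised standalone with randomly initialized weights. The tiny configuration below is an
# assumption made purely so the example is cheap to run; real checkpoints use much larger values.
#
#     >>> import torch
#     >>> from transformers import InstructBlipVideoVisionConfig
#     >>> config = InstructBlipVideoVisionConfig(
#     ...     hidden_size=32, intermediate_size=64, num_hidden_layers=2, num_attention_heads=4,
#     ...     image_size=56, patch_size=14,
#     ... )
#     >>> model = InstructBlipVideoVisionModel(config).eval()
#     >>> with torch.no_grad():
#     ...     outputs = model(pixel_values=torch.randn(2, 3, 56, 56))
#     >>> outputs.last_hidden_state.shape  # one class token + 16 patch tokens per image
#     torch.Size([2, 17, 32])
#     >>> outputs.pooler_output.shape  # post-layernorm class-token embedding
#     torch.Size([2, 32])
# --------------------------------------------------------------------------------------------------------------------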
de	e
   fdZ xZS )r}   c                    t         |           || _        |j                  |j                  z  dk7  r0t        |d      s$t        d|j                  |j                  fz        |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _	        t        j                  |j                  | j                        | _        |r_t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        n^t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                   |j"                        | _        t'        |dd      | _        | j(                  dk(  s| j(                  dk(  rF|j*                  | _        t        j,                  d|j*                  z  d	z
  | j                        | _        d
| _        y )Nr   embedding_sizezLThe hidden size (%d) is not a multiple of the number of attention heads (%d)position_embedding_typeabsoluterelative_keyrelative_key_queryr    r$   F)r0   r1   r*   r2   r   hasattrr   rs   attention_head_sizeall_head_sizer   r   r   encoder_hidden_sizer   r   Dropoutattention_probs_dropout_probr   getattrr  max_position_embeddingsr   distance_embeddingsave_attentionr@   r*   is_cross_attentionrA   s      rB   r1   z3InstructBlipVideoQFormerMultiHeadAttention.__init__  s    : ::a?PVXhHi^%%v'A'ABC 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
yy!;!;T=O=OPDH6#=#=t?Q?QRDJyy!3!3T5G5GHDH6#5#5t7I7IJDJzz&"E"EF'.v7PR\']$''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD##rC   c                     || _         y r   attn_gradients)r@   r"  s     rB   save_attn_gradientsz>InstructBlipVideoQFormerMultiHeadAttention.save_attn_gradients  s
    ,rC   c                     | j                   S r   r!  r	  s    rB   get_attn_gradientsz=InstructBlipVideoQFormerMultiHeadAttention.get_attn_gradients  s    """rC   c                     || _         y r   attention_map)r@   r(  s     rB   save_attention_mapz=InstructBlipVideoQFormerMultiHeadAttention.save_attention_map  s
    *rC   c                     | j                   S r   r'  r	  s    rB   get_attention_mapz<InstructBlipVideoQFormerMultiHeadAttention.get_attention_map  s    !!!rC   c                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )NrI   r   r    r$   r	   )rK   r   r  rW   rT   )r@   xnew_x_shapes      rB   transpose_for_scoresz?InstructBlipVideoQFormerMultiHeadAttention.transpose_for_scores  sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$rC   r   c                    |d u}|rC| j                  | j                  |            }| j                  | j                  |            }	|}n@| j                  | j                  |            }| j                  | j                  |            }	| j                  |      }
| j                  |
      }t	        j
                  ||j                  dd            }| j                  dk(  s| j                  dk(  rF|j                         d   }t	        j                  |t        j                  |j                        j                  dd      }t	        j                  |t        j                  |j                        j                  dd      }||z
  }| j                  || j                  z   dz
        }|j                  |j                         }| j                  dk(  rt	        j"                  d||      }||z   }nE| j                  dk(  r6t	        j"                  d||      }t	        j"                  d	||      }||z   |z   }|t%        j&                  | j(                        z  }|j                   }|||z   } t+        j,                  d
      |      j                  |      }|r8| j.                  r,| j1                  |       |j3                  | j4                         | j7                  |      }|||z  }t	        j
                  ||	      }|j9                  dddd      j;                         }|j                         d d | j<                  fz   } |j                  | }||fS )NrI   r   r  r  r$   rb   devicera   zbhld,lrd->bhlrzbhrd,lrd->bhlrrN   r   r    r	   )r/  r   r   r   r7   r   rf   r  rK   arangelongr2  rW   r  r  rd   rb   einsummathsqrtr  r   Softmaxr  r)  register_hookr#  r   rT   r   r  )r@   r   r   r   encoder_hidden_statesencoder_attention_maskr   r  	key_layervalue_layermixed_query_layerquery_layerattention_scores
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_scores_dtypeattention_probsattention_probs_droppedcontext_layernew_context_layer_shapes                             rB   rm   z2InstructBlipVideoQFormerMultiHeadAttention.forward  s!    3$>11$((;P2QRI33DJJ?T4UVK3N11$((=2IJI33DJJ}4MNK JJ}5//0AB !<<Y5H5HR5PQ''>9T=Y=Y]q=q&++-a0J"\\*EJJ}OcOcdiijlnopN"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ!1!7!7%/.@ -"**,-=>AABXY$"5"5##O4))$*B*BC #',,"?  &=	&I#%<kJ%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDo--rC   rn   NNNN)ro   rp   rq   r1   r#  r%  r)  r+  r/  r   r   rm   rv   rw   s   @rB   r}   r}     sF    $8-#+"% "#I. +,I.rC   r}   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )r~   c                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y r   )r0   r1   r   r   r2   denser   r   r  hidden_dropout_probr   r?   s     rB   r1   z+InstructBlipVideoQFormerSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=rC   r   input_tensorrG   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rQ  r   r   r@   r   rS  s      rB   rm   z*InstructBlipVideoQFormerSelfOutput.forward  7    

=1]3}|'CDrC   r   rw   s   @rB   r~   r~     1    >U\\  RWR^R^ rC   r~   c                        e Zd Zd fd	Zd Z	 	 	 	 ddej                  deej                     deej                     deej                     deej                     de	e
   d	ej                  fd
Z xZS )!InstructBlipVideoQFormerAttentionc                     t         |           t        ||      | _        t	        |      | _        t               | _        y r   )r0   r1   r}   	attentionr~   outputsetpruned_headsr  s      rB   r1   z*InstructBlipVideoQFormerAttention.__init__  s3    CFL^_8@ErC   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r$   rN   )lenr   r\  r   r  r_  r   r   r   r   r]  rQ  r  union)r@   headsindexs      rB   prune_headsz-InstructBlipVideoQFormerAttention.prune_heads"  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:rC   r   r   r   r:  r;  r   rG   c           	      `     | j                   d|||||d|\  }}| j                  ||      }	|	S )N)r   r   r   r:  r;  r   )r\  r]  )
r@   r   r   r   r:  r;  r   r   ri   attention_outputs
             rB   rm   z)InstructBlipVideoQFormerAttention.forward4  sO     ( 
')"7#9
 
Q  ;;{MBrC   rn   rN  )ro   rp   rq   r1   re  r7   rr   r   rt   r   r   rm   rv   rw   s   @rB   rZ  rZ    s    ";* 7;15=A>B ||  !!2!23  E--.	 
  ((9(9:  !)):): ;  +,  
 rC   rZ  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )$InstructBlipVideoQFormerIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r0   r1   r   r   r2   r   rQ  r   r   strr
   intermediate_act_fnr?   s     rB   r1   z-InstructBlipVideoQFormerIntermediate.__init__J  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$rC   r   rG   c                 J    | j                  |      }| j                  |      }|S r   )rQ  rl  r   s     rB   rm   z,InstructBlipVideoQFormerIntermediate.forwardR  s&    

=100?rC   r   rw   s   @rB   ri  ri  I  s#    9U\\ ell rC   ri  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )InstructBlipVideoQFormerOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r0   r1   r   r   r   r2   rQ  r   r   r  rR  r   r?   s     rB   r1   z'InstructBlipVideoQFormerOutput.__init__Y  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=rC   r   rS  rG   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   rU  rV  s      rB   rm   z&InstructBlipVideoQFormerOutput.forward_  rW  rC   r   rw   s   @rB   ro  ro  X  rX  rC   ro  c                   H     e Zd Z fdZ	 	 	 	 	 ddee   fdZd Zd Z xZ	S )InstructBlipVideoQFormerLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        || _        ||j                  z  dk(  rt	        |d      | _        d| _	        nd| _	        t        |      | _        t        |      | _        t        |      | _        t        |      | _        y )Nr$   r   T)r  F)r0   r1   chunk_size_feed_forwardseq_len_dimrZ  r\  	layer_idxcross_attention_frequencycrossattentionhas_cross_attentionri  intermediatero  r]  intermediate_queryoutput_queryr@   r*   rw  rA   s      rB   r1   z&InstructBlipVideoQFormerLayer.__init__g  s    '-'E'E$:6B"v7771<"CF_c"dD'+D$',D$@H4V<"Fv"N:6BrC   r   c           
      f    | j                   |f||d|}|dkD  r|d d d |d d f   }	| j                  r%|t        d       | j                  |	f||||d|}	t	        | j
                  | j                  | j                  |	      }
|j                  d   |kD  rjt	        | j                  | j                  | j                  |d d |d d d f         j                  |
j                        }t        j                  |
|gd      }
|
S t	        | j                  | j                  | j                  |      }
|
S )N)r   r   r   z>encoder_hidden_states must be given for cross-attention layers)r   r   r:  r;  r$   rN   )r\  rz  r   ry  r   feed_forward_chunk_queryru  rv  rP   feed_forward_chunkrd   r2  r7   rX   )r@   r   r   r   r:  r;  query_lengthr   rg  query_attention_outputlayer_outputlayer_output_texts               rB   rm   z%InstructBlipVideoQFormerLayer.forward{  sv    *4>>
)
 	
 !%5a,6I%J"''(0$%eff)<)<)<**#1'*?+A* *& 5--,,  &	L  %%a(<7$=++00$$$Qq%89	%
 "\(() "  %yy,8I)JPQR  5'',,   	L rC   c                 L    | j                  |      }| j                  ||      }|S r   )r{  r]  r@   rg  intermediate_outputr  s       rB   r  z0InstructBlipVideoQFormerLayer.feed_forward_chunk  s,    "//0@A{{#68HIrC   c                 L    | j                  |      }| j                  ||      }|S r   )r|  r}  r  s       rB   r  z6InstructBlipVideoQFormerLayer.feed_forward_chunk_query  s.    "556FG(()<>NOrC   NNNNr   )
ro   rp   rq   r1   r   r   rm   r  r  rv   rw   s   @rB   rs  rs  f  s:    C. "#6 +,6p
rC   rs  c                   F     e Zd Z fdZe	 	 	 	 	 ddee   fd       Z xZS )InstructBlipVideoQFormerEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _	        y c c}w r   )
r0   r1   r*   r   r   r   r   rs  layerr   r~  s      rB   r1   z(InstructBlipVideoQFormerEncoder.__init__  s]    ]]OTU[UmUmOnoOn)*69=Ono

 ',# ps   A$r   c                     t        | j                  j                        D ])  }| j                  |   }	|||   nd }
 |	|||
|f||d|}+ t	        |      S )N)r;  r  r   )r   r*   r   r  r   )r@   r   r   r   r:  r;  r  r   ilayer_modulelayer_head_masks              rB   rm   z'InstructBlipVideoQFormerEncoder.forward  sy     t{{445A::a=L.7.CilO(%	
 (>) M	 6 9+
 	
rC   r  )	ro   rp   rq   r1   r   r   r   rm   rv   rw   s   @rB   r  r    s=    ,  "#
 +,
 
rC   r  c                   2     e Zd ZdZ fdZ	 	 	 	 ddZ xZS )r{   z;Construct the embeddings from word and position embeddings.c                 P   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j!                  dt#        j$                  |j                        j'                  d      d       t)        |dd      | _        || _        y )	N)padding_idxr   position_ids)r$   rI   F)
persistentr  r  )r0   r1   r   r   
vocab_sizer2   pad_token_idword_embeddingsr  position_embeddingsr   r   	layernormr  rR  r   register_bufferr7   r3  rg   r  r  r*   r?   s     rB   r1   z+InstructBlipVideoQFormerEmbeddings.__init__  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<= 	ELL)G)GHOOPWXej 	 	
 (/v7PR\']$rC   c                    ||j                         d   }nd}|&| j                  d d |||z   f   j                         }|k| j                  |      }| j                  dk(  r/| j                  |j                  |j                              }||z   }|t        j                  ||fd      }n|}|j                  | j                  j                  j                        }| j                  |      }| j                  |      }|S )Nr$   r   r  rN   )rK   r  cloner  r  r  rd   r2  r7   rX   r  rc   rb   r   )r@   	input_idsr  query_embedspast_key_values_lengthrA  rD   r  s           rB   rm   z*InstructBlipVideoQFormerEmbeddings.forward  s      ")!,JJ,,Q0FVlIl0l-lmssuL --i8J++z9&*&>&>|zO`O`?a&b#'*==
'"YYj'AqI
%J]]4>>#8#8#>#>?
^^J/
\\*-
rC   )NNNr   )ro   rp   rq   r   r1   rm   rv   rw   s   @rB   r{   r{     s    E$  rC   r{   c                       e Zd ZdZdZdZdZdZe e	e
dd      g e	e
dd      gdZdef fd	Zd
 Zd Zd Z	 ddej$                  dee   dej*                  dedej$                  f
dZee	 	 	 	 	 	 ddej4                  deej8                     deej4                     deej$                     deej8                     deej8                     deej8                     dee   deeej8                     e f   fd              Z! xZ"S )InstructBlipVideoQFormerModelz
    Querying Transformer (Q-Former), used in InstructBlipVideo. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
    Fr$   z
.attention)rd  
layer_namez.crossattention)r   r   cross_attentionsr*   c                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r   )r0   r1   r*   r{   rD   r  r   r  r?   s     rB   r1   z&InstructBlipVideoQFormerModel.__init__-  s9     <VD6v>rC   c                 .    | j                   j                  S r   rD   r  r	  s    rB   r
  z2InstructBlipVideoQFormerModel.get_input_embeddings7  s    ...rC   c                 &    || j                   _        y r   r  r@   r   s     rB   set_input_embeddingsz2InstructBlipVideoQFormerModel.set_input_embeddings:  s    */'rC   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
        class `PreTrainedModel`.
        N)itemsr   r  r\  re  )r@   heads_to_pruner  rc  s       rB   _prune_headsz*InstructBlipVideoQFormerModel._prune_heads=  s>    
 +002LE5LLu%//;;EB 3rC   r   input_shaper2  	has_queryrG   c                    |j                         dk(  r|dddddddf   }n=|j                         dk(  r|ddddddf   }nt        d| d|j                   d      |j                  | j                        }d|z
  d	z  }|S )
a>  
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
        r	   Nr    z!Wrong shape for input_ids (shape z) or attention_mask (shape )ra   r   g     )rO   r   rP   rd   rb   )r@   r   r  r2  r  extended_attention_masks         rB   get_extended_attention_maskz9InstructBlipVideoQFormerModel.get_extended_attention_maskE  s    . 1$&4Qa]&C#!Q& '5QdA5E&F#3K=@[\j\p\p[qqrs  #:"<"<4::"<"N#&)@#@H"L&&rC   r  r  r  r   r:  r;  r   c           	      D   ||t        d      ||j                  d   nd}	| j                  |||      }
|
j                         dd }|\  }}|
j                  }|t        j                  ||f|      }| j                  |||      }|t        |t              r|d   j                         \  }}}n|j                         \  }}}||f}t        |t              r|D cg c]  }| j                  |       }}n?|)t        j                  ||      }| j                  |      }n| j                  |      }nd}| j                  || j                  j                        } | j                  |
f|||||	d|}|j                  }|dddddf   }t!        ||	      S c c}w )
a$  
        query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states to be used in the attention computation. If cross-attention,
            will be used for the query (i.e., key and value will use the encoder_hidden_states).
        Nz7You have to specify query_embeds when input_ids is Noner$   r   )r  r  r  rI   )r2  )r   r   r:  r;  r  r  )r   rP   rD   rK   r2  r7   onesr  r   listinvert_attention_maskget_head_maskr*   r   r   r   r   )r@   r  r   r  r  r   r:  r;  r   r  embedding_outputr  rh   rA  r2  r  encoder_batch_sizeencoder_sequence_lengthri   encoder_hidden_shapemaskencoder_extended_attention_maskr  sequence_outputr  s                            rB   rm   z%InstructBlipVideoQFormerModel.forwardp  s   $ !5VWW0<0H|))!,a??%% + 
 '++-cr2!,
J!((!"ZZ*j)A6RN #'"B"B>S^`f"g !,/6AVWXAYA^A^A`>"$;QAVA[A[A]>"$;Q$68O#P 0$7`v2w`vX\43M3Md3S`v/2w'/).4HQW)X&262L2LMc2d/262L2LMc2d/.2+ &&y$++2O2OP	+74<<,
2"7#B%,
 ,
 *;;'1a0;-'
 	
9 3xs   Frn   )NNNNNN)#ro   rp   rq   r   r   r   r   r   rs  r   r}   r  r&   r1   r
  r  r  r7   rr   r   rs   r2  ru   r  r   r   
LongTensorr   rt   r   r   r   r   rm   rv   rw   s   @rB   r  r    s   
 #( N 7EQ[gh
 EQ[lm
= /0C  )')' 3Z)' 	)'
 )' 
)'V  7;37/315=A>BO
##O
 !!2!23O
 u//0	O

 u||,O
 E--.O
  ((9(9:O
 !)):): ;O
 +,O
 
uU&&')UU	VO
  O
rC   r  zV
    Class defining the outputs of [`InstructBlipVideoForConditionalGeneration`].
    )custom_introc                      e Zd ZU dZdZeeej                        e	d<   dZ
eeej                        e	d<   dZeej                     e	d<   dZeeej                        e	d<   dZeeej                        e	d<   dee   fd	Zy)
4InstructBlipVideoForConditionalGenerationModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
    """

    loss: Optional[tuple[torch.FloatTensor]] = None
    logits: Optional[tuple[torch.FloatTensor]] = None
    vision_outputs: Optional[torch.FloatTensor] = None
    qformer_outputs: Optional[tuple[torch.FloatTensor]] = None
    language_model_outputs: Optional[tuple[torch.FloatTensor]] = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k]
            if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
            else getattr(self, k).to_tuple()
            for k in self.keys()
        )


@auto_docstring(
    custom_intro="""
    InstructBlipVideo base model, consisting of a language model, a Q-Former and a vision encoder.
    c            #           e Zd ZdZdgZdef fdZd Zd Zd Z	d Z
d	ej                  d
ej                  fdZee	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  deej                     d	eej                     deej                     deej                     deej                     d
eej$                     dee   dee   dee   dedee   dee   deeef   fd              Z xZS )r   r_   r   r*   c                 (   t         |   |       t        |j                        | _        t        j                  t        j                  d|j                  |j                  j                              | _        t        |j                        | _        t        j                  |j                  j                  |j                   j                        | _        t%        j&                  |j                         | _        | j(                  j*                  /| j*                  j-                  | j(                  j*                         | j(                  j.                  /| j.                  j-                  | j(                  j.                         | j1                          y Nr$   )r0   r1   r   vision_configvision_modelr   r6   r7   r   num_query_tokensqformer_configr2   r   r  qformerr   text_configlanguage_projectionr!   from_configlanguage_modelr   extend_keep_in_fp32_modulesr  r?   s     rB   r1   zInstructBlipVideoModel.__init__  s    89M9MNLLQ8O8OQWQfQfQrQr)st4V5J5JK#%99V-B-B-N-NPVPbPbPnPn#o '33F4F4FG00<""))$*=*=*O*OP44@&&--d.A.A.W.WX 	rC   c                 6    | j                   j                         S r   r  r
  r	  s    rB   r
  z+InstructBlipVideoModel.get_input_embeddings      ""7799rC   c                 :    | j                   j                  |       y r   r  r  r  s     rB   r  z+InstructBlipVideoModel.set_input_embeddings      007rC   c                     | j                   j                  s_| j                  j                  | j                  j                  _        | j                  j                  | j                  j                  _        y y r   r*   use_decoder_only_language_modelr  sharedr   embed_tokensdecoderr	  s    rB   _tie_weightsz#InstructBlipVideoModel._tie_weights	  T    {{::7;7J7J7Q7QD''47;7J7J7Q7QD''4 ;rC   c                    | j                   }t        |      dkD  r:d|vr6t        j                  j	                         dkD  rt
        j                  d       t        | j                  d      rd| j                  j                  _
        yyz
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        r$   r  a  The `language_model` is not in the `hf_device_map` dictionary and you are running your script in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`. Please pass a `device_map` that contains `language_model` to remove this warning. Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for more details on creating a `device_map` for large models._hf_hookTNhf_device_mapra  r7   cudadevice_countloggerwarningr  r  r  io_same_devicer@   r  s     rB   _preprocess_acceleratez-InstructBlipVideoModel._preprocess_accelerate  y    
 **}!&6m&KPUPZPZPgPgPilmPmNNM 4&&
3:>D((7 4rC   r  r   c                    |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                  d      j                  |      j                  |j                        }|S zZ
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        r1  rI   )r
  r7   r   r*   image_token_idr4  r2  all	unsqueeze	expand_asrd   r@   r  r   special_image_masks       rB   get_placeholder_maskz+InstructBlipVideoModel.get_placeholder_mask"       !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H/99"=GGVYYZgZnZno!!rC   qformer_input_idsqformer_attention_maskr   decoder_input_idsdecoder_attention_maskoutput_attentionsoutput_hidden_statesreturn_dictr^   	use_cacher   rG   c                    ||n| j                   j                  }|j                  \  }}}}}|j                  ||z  |||      }| j	                  ||	|
||      }|d   }t        j                  |j                         dd t
        j                  |j                        }| j                  j                  |j                  d   dd      }t        j                  |j                         dd t
        j                  |j                        }|t        j                  |      }|j                  |d      }|j                  |d      }t        j                  ||gd      }| j                  ||||||	|
|      }|d   ddd|j                  d      ddf   }| j!                  |      }|j                  || j                   j"                  |z  d      }|Q | j$                  j'                         |      }|| j                   j(                  k(  }|t        j                  |      }nl| | j'                         t        j*                  | j                   j(                  t
        j                  |j                              k(  }|j-                  d      }|j/                  d      j1                  |      j3                  |j                        }|j3                  |j                  |j4                        }|j7                  ||      }| j                   j8                  r | j$                  d|||	|
||d	|}n | j$                  d|||||	|
||d
|}t;        |||      S )a  
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipVideoProcessor`]. See [`InstructBlipVideoProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        N)r_   r  r  r  r^   r   rI   r1  rN   r$   )r  r   r  r:  r;  r  r  r  r   r   r  r  r  r  )r   r   r	  r
  r  r  r  r  r  r   )r*   use_return_dictrP   rS   r  r7   r  rK   r4  r2  r   rg   	ones_likerepeat_interleaverX   r  r  r  r  r
  video_token_idr   r   r  r  rd   rb   masked_scatterr  r  )r@   r_   r  r  r  r   r	  r
  r   r  r  r  r^   r  r   rh   frameschannelrE   rF   r  image_embedsimage_attention_maskr   query_attention_maskquery_outputsquery_outputlanguage_model_inputsr  outputss                                 rB   rm   zInstructBlipVideoModel.forward1  s   R &1%<k$++B]B] 6B5G5G2
FGVU#++J,?&RWX**%/!5#%= + 
 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a'1%".#7/!5# % 	
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j$++JfJfioJoqs t FD//DDFyQM!*dkk.H.H!H%!&!;!.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;/99"=GGVYYZgZnZno 5 8 89M9M}ObOb c%445GI^_;;66)d)) +-"3%9'# G *d)) 
+-"3'="3%9'#
 
G D))#*
 	
rC   )NNNNNNNNNFN)ro   rp   rq   r  r  r%   r1   r
  r  r  r  r7   r  rt   r  r   r   r   rr   ru   r   r   r   r   r  rm   rv   rw   s   @rB   r   r     s    %O+,6 &:8R
?("e.>.> "uO`O` " 
 >B15598<=A04,0/3&*).$(
''
 !,,
 !))9)9 :	

 E--.
 !!1!12
 $E$4$45
 !))9)9 :
  -
 $D>
 'tn
 d^
 #'
 D>
 -.
  
uJJ	K!
  
rC   r   a  
    InstructBlipVideo Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    c            %       (    e Zd ZU eed<   dZdZdgZdef fdZd Z	d Z
d Zd	ej                  fd
Zd Zd Zd Zd Z	 	 	 d"dej(                  dej*                  deej*                     dee   dee   f
dZdej*                  dej(                  fdZee	 	 	 	 	 	 	 	 	 	 	 	 d#dej(                  dej(                  deej*                     deej(                     deej*                     deej*                     deej*                     deej(                     dee   dee   deej*                     dee   dedee   dee   d	eee f   f d              Z! ejD                         	 	 	 	 	 	 d$dej(                  deej*                     deej*                     deej*                     deej*                     deej(                     ded	ej*                  fd        Z#	 	 	 d"dej(                  dej*                  deej*                     dee   dee   f
d!Z$ xZ%S )%r   r*   r_   Tr   c                 \   t         |   |       t        j                  |j                        | _        t        j                  t        j                  d|j                  |j                  j                              | _        t        j                  |j                        | _        t        j                   |j                  j                  |j"                  j                        | _        |j&                  r t)        j*                  |j"                        }nt-        j*                  |j"                        }|j.                  %| j.                  j1                  |j.                         |j2                  %| j2                  j1                  |j2                         || _        | j7                          y r  )r0   r1   r   _from_configr  r  r   r6   r7   r   r  r  r2   r   r  r  r   r  r  r  r"   r  r#   r   r  r  r  r  )r@   r*   r  rA   s      rB   r1   z2InstructBlipVideoForConditionalGeneration.__init__  s3    8EEfFZFZ[LLQ8O8OQWQfQfQrQr)st4AA&BWBWX#%99V-B-B-N-NPVPbPbPnPn#o 111==f>P>PQN2>>v?Q?QRN++7"")).*J*JK//;&&--n.R.RS, 	rC   c                 6    | j                   j                         S r   r  r	  s    rB   r
  z>InstructBlipVideoForConditionalGeneration.get_input_embeddings  r  rC   c                 :    | j                   j                  |       y r   r  r  s     rB   r  z>InstructBlipVideoForConditionalGeneration.set_input_embeddings  r  rC   c                 :    | j                   j                  |       y r   )r  set_output_embeddings)r@   new_embeddingss     rB   r%  z?InstructBlipVideoForConditionalGeneration.set_output_embeddings  s    11.ArC   rG   c                 6    | j                   j                         S r   )r  get_output_embeddingsr	  s    rB   r(  z?InstructBlipVideoForConditionalGeneration.get_output_embeddings  s    ""88::rC   c                 6    | j                   j                         S r   )r  get_encoderr	  s    rB   r*  z5InstructBlipVideoForConditionalGeneration.get_encoder      ""..00rC   c                 6    | j                   j                         S r   )r  get_decoderr	  s    rB   r-  z5InstructBlipVideoForConditionalGeneration.get_decoder  r+  rC   c                     | j                   j                  s_| j                  j                  | j                  j                  _        | j                  j                  | j                  j                  _        y y r   r  r	  s    rB   r  z6InstructBlipVideoForConditionalGeneration._tie_weights  r  rC   c                    | j                   }t        |      dkD  r:d|vr6t        j                  j	                         dkD  rt
        j                  d       t        | j                  d      rd| j                  j                  _
        yyr  r  r  s     rB   r  z@InstructBlipVideoForConditionalGeneration._preprocess_accelerate  r  rC   r  r  r^   r  c                      y)$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
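
        Example (editorial sketch, not taken from the original source; it reuses the `model`,
        `processor`, `prompt` and `clip` objects from the `forward` example below and calls the
        video-specific counterpart `get_video_features` defined on this class):

        ```python
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)
        >>> # hypothetical call: with default arguments this returns the projected video features
        >>> video_features = model.get_video_features(
        ...     inputs["pixel_values"], qformer_input_ids=inputs["qformer_input_ids"]
        ... )
        ```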
        Nr   )r@   r_   r  r  r^   r  s         rB   get_image_featuresz<InstructBlipVideoForConditionalGeneration.get_image_features
  s     	rC   r  r   c                    |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                  d      j                  |      j                  |j                        }|S r  )r
  r7   r   r*   r  r4  r2  r   r  r  rd   r  s       rB   r  z>InstructBlipVideoForConditionalGeneration.get_placeholder_mask  r  rC   r   r	  r
  r  r  labelsr  r   c                 H   ||n| j                   j                  }| j                  ||||d      \  }}}|s|j                         n|}|s|j                         n|}| | j	                         |      }|t        j                  |      }|j                  |j                  |j                        }| j                  ||      }|j                  ||      }| j                   j                  re | j                  d|||	|
||d|}|r|j                  n|d   }d}|w | j                  d||| j                   j                   j"                  d|}nB | j                  d|||||	|
|||d	|}|r|j$                  n|d   }|r|j                  n|d	   }t'        |||||
      S )a  
        qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
            The sequence used as a prompt to be fed to the Q-Former module.
        qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
            Mask to avoid performing attention on padding token indices.

        Examples:

        ```python
        >>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
        >>> import torch
        >>> from huggingface_hub import hf_hub_download
        >>> import av
        >>> import numpy as np

        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

        >>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
        >>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> file_path = hf_hub_download(
        ...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample uniformly 4 frames from the video
        >>> total_frames = container.streams.video[0].frames
        >>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
        >>> clip = read_video_pyav(container, indices)

        >>> prompt = "What is happening in the video?"
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        "A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
        ```NTr  r  r^   r  r   r  r   )r  r4  r  )	r   r   r	  r
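        # High-level flow (descriptive summary; the implementation below is authoritative):
        # video frames are first turned into projected embeddings via `self.get_video_features(...)`,
        # `input_ids` are embedded with the language model's input embeddings,
        # `self.get_placeholder_mask(...)` locates the video placeholder tokens and `masked_scatter`
        # writes the video embeddings into those positions, and finally the language model is run,
        # computing the loss with `self.loss_function` for decoder-only backbones or passing
        # `labels` straight to the seq2seq backbone otherwise.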
  r  r  r  r4  r  r$   )r  r  r  r  r  r   )r*   r  get_video_featuresr  r
  r7   r  rd   r2  rb   r  r  r  r  r  loss_functionr  r  r  r  )r@   r_   r  r  r  r   r	  r
  r   r  r  r4  r  r^   r  r   r  r  r  r  r  r  r  s                          rB   rm   z1InstructBlipVideoForConditionalGeneration.forward*  s   f &1%<k$++B]B]?C?V?V/#9%= @W @
<~} ;F002>8C..0 7D557	BM!"__Y7N 5 8 89M9M}ObOb c!66yP]6^%445GI^_;;66)d)) +-"3%9'# G (3W^^
FD!)t)) !&T[[=T=T=_=_ci
 *d)) +-"3'="3%9'# G $/7<<GAJD'2W^^
FC))#*
 	
rC   c                 H   t        | d      r| j                          |j                  d   }	| j                  ||||d      \  }
}}||| j                  j
                  g| j                  j                  z  dz  }|| j                  j                  j                  gz   }t        j                  |gt        j                  |j                        }|j                  |	d      } | j                         |      }|t        j                  |      }|
j!                  |j                  |j"                        }
| j%                  ||      }|j'                  ||
      }||d	}| j(                  j                  j*                  s||d
<    | j(                  j,                  di ||}|S )a  
        Overrides the `generate` method so that the model can be used as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
                (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be floating-point embeddings rather than integer token ids.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
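
        Example (editorial sketch; it reuses the `model`, `processor`, `prompt` and `clip` objects
        from the `forward` example above; `interpolate_pos_encoding=True` is only useful when the
        frames have a different resolution than the one the vision encoder was configured for):

        ```python
        >>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)
        >>> generated_ids = model.generate(**inputs, interpolate_pos_encoding=True, max_new_tokens=50)
        >>> caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
        ```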
        r  r   Tr6  r   r1  r$   r7  )r   r   r  r   )r  r  rP   r8  r*   video_token_indexr  r  bos_token_idr7   r   r4  r2  repeatr
  r  rd   rb   r  r  r  is_encoder_decodergenerate)r@   r_   r  r  r  r   r   r^   generate_kwargsrh   r  r  r  video_tokensstart_tokensr  inputsr  s                     rB   r?  z2InstructBlipVideoForConditionalGeneration.generate  s   D 4)'')!''*
?C?V?V/#9%= @W @
<~}    $ = =>A]A]]`aa+t{{/F/F/S/S.TT!LL,uzzR^ReRef	%,,Z;	7D557	BM!"__Y7N 5 8 89M9M}ObOb c!66yP]6^%445GI^_#0NS""))<<"+F;.$%%..KK?KrC   c                    |j                   \  }}}}	}
|j                  ||z  ||	|
      }| j                  ||d      }|d   }t        j                  |j                         dd t        j                  |j                        }| j                  j                  |j                   d   dd      }t        j                  |j                         dd t        j                  |j                        }|t        j                  |      }|j                  |d      }|j                  |d      }t        j                  ||gd      }| j                  |||||d	      }|d   ddd|j                  d      ddf   }| j                  |      }|j                  || j                  j                   |z  d      }|r|||fS |S )
r1  T)r_   r^   r  r   NrI   r1  rN   r$   )r  r   r  r:  r;  r  )rP   rS   r  r7   r  rK   r4  r2  r   rg   r  r  rX   r  r  r*   r  )r@   r_   r  r  r^   r  rh   r  r  rE   rF   r  r  r  r   r  r  r  r  s                      rB   r8  z<InstructBlipVideoForConditionalGeneration.get_video_features  s   " 6B5G5G2
FGVU#++J,?&RWX**%%= + 

 &a(  %zz,*;*;*=cr*B%**]i]p]pq ((//0B0B10Er2N$zz,*;*;*=cr*B%**]i]p]pq!)%*__5F%G"-??A?N!7!I!I&VW!I!X!&,@BX+Y_`!a'1%".#7 % 
 %Q'+A\->->q-A+A1(DE !% 8 8 F !6 = =j$++JfJfioJoqs t(.-GG$$rC   )NFF)NNNNNNNNNNFN)NNNNNF)&ro   rp   rq   r%   r   r  r   r  r1   r
  r  r%  r   Moduler(  r*  r-  r  r  r7   rt   r  r   ru   r2  r  r   r   r   r   r   r   r  rm   no_gradr?  r8  rv   rw   s   @rB   r   r     sz    $#$O!+,6 4:8B;ryy ;11R
?0 >B38&+'' !++ !))9)9 :	
 #+4. d^""e.>.> "uO`O` " 
 >B15598<=A59,0/3-1&*).$(N
''N
 !,,N
 !))9)9 :	N

 E--.N
 !!1!12N
 $E$4$45N
 !))9)9 :N
   1 12N
 $D>N
 'tnN
 ))*N
 d^N
 #'N
 D>N
  +,!N
" 
uJJ	K#N
  N
` U]]_ 9==A045959).C''C $E$4$45C !))9)9 :	C
 E,,-C !!1!12C   1 12C #'C 
		C CR >B38&+9%''9% !++9% !))9)9 :	9%
 #+4.9% d^9%rC   r   )r   ry   r  r   r   )r   )Mr6  dataclassesr   typingr   r   r   r   r7   r   activationsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   utils.genericr   r   autor!   r"   r#   configuration_instructblipvideor%   r&   r'   
get_loggerro   r  rE  r)   ry   rr   floatr   r|   r   r   r   r   r}   r~   rZ  ri  ro  rs  r  r{   r  r  r   r   __all__r   rC   rB   <module>rX     sr  ,  ! 1 1   ! ) B 9  G & l l j j ? I I  
		H	%G		 GT #- #- #-\ %II%<<% 
% <<	%
 U\\*% % %.I) I)X299 $> D@ryy @D1#C 1hw. w.t + 		 + \299 RYY U$> Up$
bii $
N0 0fi
$D i
X 

; 
 
: 
F
= F

F
R }%0PRa }%}%@rC   