
from typing import Optional, Union

import torch
import torch.utils.checkpoint

from transformers.models.instructblip.configuration_instructblip import (
    InstructBlipQFormerConfig,
    InstructBlipVisionConfig,
)
from transformers.models.instructblip.modeling_instructblip import (
    InstructBlipForConditionalGeneration,
    InstructBlipForConditionalGenerationModelOutput,
    InstructBlipModel,
    InstructBlipPreTrainedModel,
    InstructBlipQFormerModel,
    InstructBlipVisionModel,
    TransformersKwargs,
)

from ...configuration_utils import PretrainedConfig
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
from ...processing_utils import Unpack
from ...utils import logging
from ..auto import CONFIG_MAPPING, AutoConfig


logger = logging.get_logger(__name__)


class InstructBlipVideoVisionConfig(InstructBlipVisionConfig):
    pass


class InstructBlipVideoQFormerConfig(InstructBlipQFormerConfig):
    pass


class InstructBlipVideoConfig(PretrainedConfig):
    r"""
[`InstructBlipVideoConfig`] is the configuration class to store the configuration of a
[`InstructBlipVideoForConditionalGeneration`]. It is used to instantiate an Instructblipvideo model according to the specified
arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
the defaults will yield a similar configuration to that of the Instructblipvideo
[Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vision_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`InstructBlipVideoVisionConfig`].
    qformer_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize [`InstructBlipVideoQFormerConfig`].
    text_config (`dict`, *optional*):
        Dictionary of configuration options used to initialize any [`PretrainedConfig`].
    num_query_tokens (`int`, *optional*, defaults to 32):
        The number of query tokens passed through the Transformer.

    video_token_index (`int`, *optional*):
        Token index of special video token.
    kwargs (*optional*):
        Dictionary of keyword arguments.

Example:

```python
>>> from transformers import (
...     InstructBlipVideoVisionConfig,
...     InstructBlipVideoQFormerConfig,
...     OPTConfig,
...     InstructBlipVideoConfig,
...     InstructBlipVideoForConditionalGeneration,
... )

>>> # Initializing an InstructBlipVideoConfig with Salesforce/instruct-blip-flan-t5 style configuration
>>> configuration = InstructBlipVideoConfig()

>>> # Initializing an InstructBlipVideoForConditionalGeneration (with random weights) from the Salesforce/instruct-blip-flan-t5 style configuration
>>> model = InstructBlipVideoForConditionalGeneration(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config

>>> # We can also initialize an InstructBlipVideoConfig from an InstructBlipVideoVisionConfig, InstructBlipVideoQFormerConfig and any PretrainedConfig

>>> # Initializing Instructblipvideo vision, Instructblipvideo Q-Former and language model configurations
>>> vision_config = InstructBlipVideoVisionConfig()
>>> qformer_config = InstructBlipVideoQFormerConfig()
>>> text_config = OPTConfig()

>>> config = InstructBlipVideoConfig.from_vision_qformer_text_configs(vision_config, qformer_config, text_config)
```"""

    model_type = "instructblipvideo"
    attribute_map = {
        "video_token_id": "video_token_index",
    }
    sub_configs = {
        "text_config": AutoConfig,
        "qformer_config": InstructBlipVideoQFormerConfig,
        "vision_config": InstructBlipVideoVisionConfig,
    }

    def __init__(
        self,
        vision_config=None,
        qformer_config=None,
        text_config=None,
        num_query_tokens=32,
        video_token_index=None,
        **kwargs,
    ):
        super().__init__(**kwargs)

        if vision_config is None:
            vision_config = {}
            logger.info("vision_config is None. initializing the InstructBlipVideoVisionConfig with default values.")

        if qformer_config is None:
            qformer_config = {}
            logger.info("qformer_config is None. Initializing the InstructBlipVideoQFormerConfig with default values.")

        if text_config is None:
            text_config = {}
            logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")

        self.vision_config = InstructBlipVideoVisionConfig(**vision_config)
        self.qformer_config = InstructBlipVideoQFormerConfig(**qformer_config)
        text_model_type = text_config.get("model_type", "opt")
        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)

        self.num_query_tokens = num_query_tokens
        self.video_token_index = video_token_index
        self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
        self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
        self.initializer_factor = 1.0
        self.initializer_range = 0.02

    @classmethod
    def from_vision_qformer_text_configs(
        cls,
        vision_config: InstructBlipVideoVisionConfig,
        qformer_config: InstructBlipVideoQFormerConfig,
        text_config: PretrainedConfig,
        **kwargs,
    ):
        r"""
        Instantiate a [`InstructBlipVideoConfig`] (or a derived class) from an InstructBlipVideo vision model, Q-Former and
        language model configurations.

        Returns:
            [`InstructBlipVideoConfig`]: An instance of a configuration object
        """

        return cls(
            vision_config=vision_config.to_dict(),
            qformer_config=qformer_config.to_dict(),
            text_config=text_config.to_dict(),
            **kwargs,
        )


class InstructBlipVideoPreTrainedModel(InstructBlipPreTrainedModel):
    pass


class InstructBlipVideoVisionModel(InstructBlipVisionModel):
    pass


class InstructBlipVideoQFormerModel(InstructBlipQFormerModel):
    pass


class InstructBlipVideoForConditionalGenerationModelOutput(InstructBlipForConditionalGenerationModelOutput):
    pass


class InstructBlipVideoModel(InstructBlipModel):
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # videos are flattened into a batch of frames, then unbatched again further down
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        # step 1: forward the frames through the vision encoder
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the Q-Former, using the frame embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        # the instruction prompt is repeated per frame and fed to the Q-Former together with the query tokens
        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: project into the language model space and unbatch, `num_query_tokens * frames` tokens per video
        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
            special_image_mask = input_ids == self.config.video_token_id
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids)
        else:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)

        # scatter the projected video features into the positions of the video placeholder tokens
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )

        return InstructBlipVideoForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )


class InstructBlipVideoForConditionalGeneration(InstructBlipForConditionalGeneration):
    def get_video_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = False,
    ):
        r"""
        Encodes videos into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, frames, num_channels, height, width)`):
                The tensors corresponding to the input videos.
        """
        batch_size, frames, channel, height, width = pixel_values.shape
        pixel_values = pixel_values.reshape(batch_size * frames, channel, height, width)

        # frames are encoded independently by the vision tower
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        image_embeds = vision_outputs[0]

        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        # the query tokens cross-attend to the frame embeddings; the instruction prompt is repeated per frame
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)

        qformer_input_ids = qformer_input_ids.repeat_interleave(frames, dim=0)
        qformer_attention_mask = qformer_attention_mask.repeat_interleave(frames, dim=0)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            return_dict=True,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # project into the language model space and unbatch: `num_query_tokens * frames` tokens per video
        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.reshape(batch_size, self.config.num_query_tokens * frames, -1)

        if return_dict:
            return language_model_inputs, vision_outputs, query_outputs
        return language_model_inputs

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: Optional[bool] = False,
        return_dict: Optional[bool] = False,
    ):
        # intentionally left empty for the video model: `get_video_features` is used instead
        pass

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.video_token_id

        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.FloatTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipVideoForConditionalGenerationModelOutput]:
        r"""
qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length)):
    The sequence used as a prompt to be fed to the Q-Former module.
qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
    Mask to avoid performing attention on padding token indices.

Examples:

```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration
>>> import torch
>>> from huggingface_hub import hf_hub_download
>>> import av
>>> import numpy as np

>>> def read_video_pyav(container, indices):
...     '''
...     Decode the video with PyAV decoder.
...     Args:
...         container (`av.container.input.InputContainer`): PyAV container.
...         indices (`list[int]`): List of frame indices to decode.
...     Returns:
...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
...     '''
...     frames = []
...     container.seek(0)
...     start_index = indices[0]
...     end_index = indices[-1]
...     for i, frame in enumerate(container.decode(video=0)):
...         if i > end_index:
...             break
...         if i >= start_index and i in indices:
...             frames.append(frame)
...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])

>>> model = InstructBlipVideoForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b", device_map="auto")
>>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

>>> file_path = hf_hub_download(
...       repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
... )
>>> container = av.open(file_path)

>>> # sample uniformly 4 frames from the video
>>> total_frames = container.streams.video[0].frames
>>> indices = np.arange(0, total_frames, total_frames / 4).astype(int)
>>> clip = read_video_pyav(container, indices)

>>> prompt = "What is happening in the video?"
>>> inputs = processor(text=prompt, images=clip, return_tensors="pt").to(model.device)

>>> outputs = model.generate(
...     **inputs,
...     do_sample=False,
...     num_beams=5,
...     max_length=256,
...     repetition_penalty=1.5,
...     length_penalty=1.0,
... )
>>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
>>> print(generated_text)
"A person is eating a bowl of pasta, and they are using a fork to eat it. The person is sitting at a table, and the plate of pasta is on the table in front"
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )
        vision_outputs = vision_outputs.to_tuple() if not return_dict else vision_outputs
        query_outputs = query_outputs.to_tuple() if not return_dict else query_outputs

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # replace the video placeholder tokens with the projected video features
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                use_cache=use_cache,
                **kwargs,
            )
            logits = outputs.logits if return_dict else outputs[0]
            loss = None
            if labels is not None:
                loss = self.loss_function(
                    logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
                )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
                labels=labels,
                use_cache=use_cache,
                **kwargs,
            )
            loss = outputs.loss if return_dict else outputs[0]
            logits = outputs.logits if return_dict else outputs[1]

        return InstructBlipVideoForConditionalGenerationModelOutput(
            loss=loss,
            logits=logits,
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )

    @torch.no_grad()
    def generate(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: Optional[torch.LongTensor] = None,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **generate_kwargs,
    ) -> torch.LongTensor:
        r"""
Overrides `generate` function to be able to use the model as a conditional generator.

Args:
    pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width) or
        (batch_size, num_frames, num_channels, height, width)): Input images or videos to be processed.
    qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt to be fed to the Q-Former module.
    qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        The sequence used as a prompt for the generation.
    attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
        Mask to avoid performing attention on padding token indices.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embedded representation of the inputs. Should be float, not int tokens.
    interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
        Whether to interpolate the positional encoding of the image embeddings.

Returns:
    captions (list): A list of strings of length batch_size * num_captions.
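
Example (an illustrative sketch only; the checkpoint name and the `clip` array of four decoded
video frames are assumptions carried over from the example in
[`~InstructBlipVideoForConditionalGeneration.forward`], not requirements of this method):

```python
>>> from transformers import InstructBlipVideoProcessor, InstructBlipVideoForConditionalGeneration

>>> processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
>>> model = InstructBlipVideoForConditionalGeneration.from_pretrained(
...     "Salesforce/instructblip-vicuna-7b", device_map="auto"
... )

>>> # `clip` is a np.ndarray of shape (4, height, width, 3), decoded as in the `forward` example
>>> inputs = processor(text="What is happening in the video?", images=clip, return_tensors="pt").to(model.device)
>>> outputs = model.generate(**inputs, do_sample=False, max_length=256)
>>> processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
```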
        """
        if hasattr(self, "hf_device_map"):
            # preprocess for `accelerate`
            self._preprocess_accelerate()

        batch_size = pixel_values.shape[0]
        language_model_inputs, vision_outputs, query_outputs = self.get_video_features(
            pixel_values,
            qformer_input_ids=qformer_input_ids,
            qformer_attention_mask=qformer_attention_mask,
            interpolate_pos_encoding=interpolate_pos_encoding,
            return_dict=True,
        )

        if inputs_embeds is None:
            if input_ids is None:
                # no prompt given: start from the video placeholder tokens followed by the BOS token
                # (each video is processed as 4 frames, each contributing `num_query_tokens` query outputs)
                video_tokens = [self.config.video_token_index] * self.config.num_query_tokens * 4
                start_tokens = video_tokens + [self.config.text_config.bos_token_id]
                input_ids = torch.tensor([start_tokens], dtype=torch.long, device=pixel_values.device)
                input_ids = input_ids.repeat(batch_size, 1)
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        # replace the video placeholder tokens with the projected video features
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask}
        if not self.language_model.config.is_encoder_decoder:
            inputs["input_ids"] = input_ids

        outputs = self.language_model.generate(**inputs, **generate_kwargs)

        return outputs


__all__ = [
    "InstructBlipVideoConfig",
    "InstructBlipVideoQFormerConfig",
    "InstructBlipVideoVisionConfig",
    "InstructBlipVideoVisionModel",
    "InstructBlipVideoPreTrainedModel",
    "InstructBlipVideoQFormerModel",
    "InstructBlipVideoModel",
    "InstructBlipVideoForConditionalGeneration",
]