
"""PyTorch InstructBLIP model."""

import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import (
    ModelOutput,
    TransformersKwargs,
    auto_docstring,
    can_return_tuple,
    logging,
    torch_int,
)
from ...utils.generic import OutputRecorder, check_model_inputs
from ..auto import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from .configuration_instructblip import InstructBlipConfig, InstructBlipQFormerConfig, InstructBlipVisionConfig


logger = logging.get_logger(__name__)
@dataclass
@auto_docstring(
    custom_intro="""
    Class defining the outputs of [`InstructBlipForConditionalGeneration`].
    """
)
class InstructBlipForConditionalGenerationModelOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
        Language modeling loss from the language model.
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head of the language model.
    vision_outputs (`BaseModelOutputWithPooling`):
        Outputs of the vision encoder.
    qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`):
        Outputs of the Q-Former (Querying Transformer).
    language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`):
        Outputs of the language model.
    """

    loss: Optional[tuple[torch.FloatTensor]] = None
    logits: Optional[tuple[torch.FloatTensor]] = None
    vision_outputs: Optional[torch.FloatTensor] = None
    qformer_outputs: Optional[tuple[torch.FloatTensor]] = None
    language_model_outputs: Optional[tuple[torch.FloatTensor]] = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k]
            if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"]
            else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class InstructBlipVisionEmbeddings(nn.Module):
    def __init__(self, config: InstructBlipVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))

        self.patch_embedding = nn.Conv2d(
            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1

        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embedding.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embedding

        class_pos_embed = self.position_embedding[:, :1]
        patch_pos_embed = self.position_embedding[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size
        new_width = width // self.patch_size

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
        batch_size, _, height, width = pixel_values.shape
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))
        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
        if interpolate_pos_encoding:
            position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
        else:
            position_embedding = self.position_embedding
        embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class InstructBlipAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.is_causal = False
        self.attention_dropout = config.attention_dropout

        # the query/key/value projections share a single linear layer; only q and v carry a bias
        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False)

        if config.qkv_bias:
            q_bias = nn.Parameter(torch.zeros(self.embed_dim))
            v_bias = nn.Parameter(torch.zeros(self.embed_dim))
        else:
            q_bias = None
            v_bias = None

        if q_bias is not None:
            qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias))
            self.qkv.bias = nn.Parameter(qkv_bias)

        self.projection = nn.Linear(self.embed_dim, self.embed_dim)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        bsz, tgt_len, embed_dim = hidden_states.size()

        mixed_qkv = self.qkv(hidden_states)
        mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute(
            2, 0, 3, 1, 4
        )
        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            None,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scale,
            is_causal=self.is_causal,
            **kwargs,
        )

        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
        output = self.projection(attn_output)

        return output, attn_weights


class InstructBlipMLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class InstructBlipEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: InstructBlipConfig):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attn = InstructBlipAttention(config)
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = InstructBlipMLP(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.FloatTensor:
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            head_mask=attention_mask,
            **kwargs,
        )
        hidden_states = hidden_states + residual

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = hidden_states + residual

        return hidden_states


@auto_docstring
class InstructBlipPreTrainedModel(PreTrainedModel):
    config: InstructBlipConfig
    base_model_prefix = "blip"
    supports_gradient_checkpointing = True
    _supports_attention_backend = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    _no_split_modules = [
        "InstructBlipQFormerEmbeddings",
        "InstructBlipAttention",
        "InstructBlipQFormerMultiHeadAttention",
        "InstructBlipQFormerSelfOutput",
    ]

    def _init_weights(self, module):
        """Initialize the weights"""
        factor = self.config.initializer_range
        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data.normal_(mean=0.0, std=factor)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=factor)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, InstructBlipVisionEmbeddings):
            nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor)
            nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor)
        elif isinstance(module, (InstructBlipForConditionalGeneration, InstructBlipModel)):
            module.query_tokens.data.zero_()
class InstructBlipEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`InstructBlipEncoderLayer`].

    Args:
        config (`InstructBlipConfig`):
            The corresponding vision configuration for the `InstructBlipEncoder`.
    """

    def __init__(self, config: InstructBlipConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([InstructBlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutput]:
        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            hidden_states = encoder_layer(hidden_states, attention_mask=attention_mask, **kwargs)

        return BaseModelOutput(last_hidden_state=hidden_states)


class InstructBlipVisionModel(InstructBlipPreTrainedModel):
    main_input_name = "pixel_values"
    config: InstructBlipVisionConfig
    _can_record_outputs = {
        "hidden_states": InstructBlipEncoderLayer,
        "attentions": InstructBlipAttention,
    }

    def __init__(self, config: InstructBlipVisionConfig):
        super().__init__(config)
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = InstructBlipVisionEmbeddings(config)
        self.encoder = InstructBlipEncoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.post_init()

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, BaseModelOutputWithPooling]:
        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            **kwargs,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooled_output = last_hidden_state[:, 0, :]
        pooled_output = self.post_layernorm(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
        )

    def get_input_embeddings(self):
        return self.embeddings


class InstructBlipQFormerMultiHeadAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
                % (config.hidden_size, config.num_attention_heads)
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        if is_cross_attention:
            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
        else:
            self.key = nn.Linear(config.hidden_size, self.all_head_size)
            self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        self.save_attention = False

    def save_attn_gradients(self, attn_gradients):
        self.attn_gradients = attn_gradients

    def get_attn_gradients(self):
        return self.attn_gradients

    def save_attention_map(self, attention_map):
        self.attention_map = attention_map

    def get_attention_map(self):
        return self.attention_map

    def transpose_for_scores(self, x):
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        x = x.view(*new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        **kwargs: Unpack[TransformersKwargs],
    ):
        # If this is used as a cross-attention module, the keys and values come from the vision encoder;
        # the attention mask then has to mask out the encoder's padding tokens.
        is_cross_attention = encoder_hidden_states is not None

        if is_cross_attention:
            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
            attention_mask = encoder_attention_mask
        else:
            key_layer = self.transpose_for_scores(self.key(hidden_states))
            value_layer = self.transpose_for_scores(self.value(hidden_states))

        mixed_query_layer = self.query(hidden_states)
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
        attention_scores_dtype = attention_scores.dtype

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the model's forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.Softmax(dim=-1)(attention_scores).to(attention_scores_dtype)

        if is_cross_attention and self.save_attention:
            self.save_attention_map(attention_probs)
            attention_probs.register_hook(self.save_attn_gradients)

        # This drops out entire tokens to attend to, following the original Transformer paper.
        attention_probs_dropped = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs_dropped = attention_probs_dropped * head_mask

        context_layer = torch.matmul(attention_probs_dropped, value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(*new_context_layer_shape)

        return context_layer, attention_probs


class InstructBlipQFormerSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class InstructBlipQFormerAttention(nn.Module):
    def __init__(self, config, is_cross_attention=False):
        super().__init__()
        self.attention = InstructBlipQFormerMultiHeadAttention(config, is_cross_attention)
        self.output = InstructBlipQFormerSelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.attention.query = prune_linear_layer(self.attention.query, index)
        self.attention.key = prune_linear_layer(self.attention.key, index)
        self.attention.value = prune_linear_layer(self.attention.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> torch.Tensor:
        attn_output, _ = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            **kwargs,
        )
        attention_output = self.output(attn_output, hidden_states)
        return attention_output


class InstructBlipQFormerIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class InstructBlipQFormerOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states


class InstructBlipQFormerLayer(GradientCheckpointingLayer):
    def __init__(self, config, layer_idx):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InstructBlipQFormerAttention(config)

        self.layer_idx = layer_idx

        if layer_idx % config.cross_attention_frequency == 0:
            self.crossattention = InstructBlipQFormerAttention(config, is_cross_attention=True)
            self.has_cross_attention = True
        else:
            self.has_cross_attention = False

        self.intermediate = InstructBlipQFormerIntermediate(config)
        self.output = InstructBlipQFormerOutput(config)

        self.intermediate_query = InstructBlipQFormerIntermediate(config)
        self.output_query = InstructBlipQFormerOutput(config)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        query_length=0,
        **kwargs: Unpack[TransformersKwargs],
    ):
        attention_output = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            **kwargs,
        )

        if query_length > 0:
            query_attention_output = attention_output[:, :query_length, :]

            if self.has_cross_attention:
                if encoder_hidden_states is None:
                    raise ValueError("encoder_hidden_states must be given for cross-attention layers")
                query_attention_output = self.crossattention(
                    query_attention_output,
                    attention_mask=attention_mask,
                    head_mask=head_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    **kwargs,
                )

            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk_query,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                query_attention_output,
            )

            if attention_output.shape[1] > query_length:
                layer_output_text = apply_chunking_to_forward(
                    self.feed_forward_chunk,
                    self.chunk_size_feed_forward,
                    self.seq_len_dim,
                    attention_output[:, query_length:, :],
                ).to(layer_output.device)
                layer_output = torch.cat([layer_output, layer_output_text], dim=1)
        else:
            layer_output = apply_chunking_to_forward(
                self.feed_forward_chunk,
                self.chunk_size_feed_forward,
                self.seq_len_dim,
                attention_output,
            )

        return layer_output

    def feed_forward_chunk(self, attention_output):
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output

    def feed_forward_chunk_query(self, attention_output):
        intermediate_output = self.intermediate_query(attention_output)
        layer_output = self.output_query(intermediate_output, attention_output)
        return layer_output


class InstructBlipQFormerEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            [InstructBlipQFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        query_length=0,
        **kwargs: Unpack[TransformersKwargs],
    ):
        for i in range(self.config.num_hidden_layers):
            layer_module = self.layer[i]
            layer_head_mask = head_mask[i] if head_mask is not None else None

            hidden_states = layer_module(
                hidden_states,
                attention_mask,
                layer_head_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                query_length=query_length,
                **kwargs,
            )

        return BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=hidden_states)


class InstructBlipQFormerEmbeddings(nn.Module):
    """Construct the embeddings from word and position embeddings."""

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")

        self.config = config

    def forward(
        self,
        input_ids=None,
        position_ids=None,
        query_embeds=None,
        past_key_values_length=0,
    ):
        if input_ids is not None:
            seq_length = input_ids.size()[1]
        else:
            seq_length = 0

        if position_ids is None:
            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length].clone()

        if input_ids is not None:
            embeddings = self.word_embeddings(input_ids)
            if self.position_embedding_type == "absolute":
                position_embeddings = self.position_embeddings(position_ids.to(embeddings.device))
                embeddings = embeddings + position_embeddings

            if query_embeds is not None:
                embeddings = torch.cat((query_embeds, embeddings), dim=1)
        else:
            embeddings = query_embeds

        embeddings = embeddings.to(self.layernorm.weight.dtype)
        embeddings = self.layernorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
class InstructBlipQFormerModel(InstructBlipPreTrainedModel):
    """
    Querying Transformer (Q-Former), used in InstructBLIP. Slightly modified from BLIP-2 as it also takes the
    instruction as input.
    """

    _supports_flash_attn = False
    _supports_sdpa = False
    _supports_flex_attn = False
    _supports_attention_backend = False

    _can_record_outputs = {
        "attentions": [OutputRecorder(InstructBlipQFormerMultiHeadAttention, index=1, layer_name=".attention")],
        "cross_attentions": [OutputRecorder(InstructBlipQFormerMultiHeadAttention, index=1, layer_name=".crossattention")],
    }

    def __init__(self, config: InstructBlipQFormerConfig):
        super().__init__(config)
        self.config = config

        self.embeddings = InstructBlipQFormerEmbeddings(config)
        self.encoder = InstructBlipQFormerEncoder(config)

        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    def get_extended_attention_mask(
        self,
        attention_mask: torch.Tensor,
        input_shape: tuple[int],
        device: torch.device,
        has_query: bool = False,
    ) -> torch.Tensor:
        """
        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

        Arguments:
            attention_mask (`torch.Tensor`):
                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
            input_shape (`tuple[int]`):
                The shape of the input to the model.
            device: (`torch.device`):
                The device of the input to the model.

        Returns:
            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
        """
        # A 3D mask [batch_size, from_seq_length, to_seq_length] only needs to be made broadcastable to all heads;
        # a 2D padding mask [batch_size, seq_length] is expanded to [batch_size, 1, 1, seq_length].
        if attention_mask.dim() == 3:
            extended_attention_mask = attention_mask[:, None, :, :]
        elif attention_mask.dim() == 2:
            extended_attention_mask = attention_mask[:, None, None, :]
        else:
            raise ValueError(
                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
            )

        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for masked positions,
        # this creates a tensor which is 0.0 for positions to attend and -10000.0 for masked positions.
        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
        return extended_attention_mask

    @check_model_inputs
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        query_embeds: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        query_embeds (`torch.FloatTensor`  of shape `(batch_size, sequence_length, hidden_size)`):
            Hidden states to be used in the attention computation. If cross-attention,
            will be used for the query (i.e., key and value will use the encoder_hidden_states).
        """
        if input_ids is None and query_embeds is None:
            raise ValueError("You have to specify query_embeds when input_ids is None")

        query_length = query_embeds.shape[1] if query_embeds is not None else 0

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            query_embeds=query_embeds,
        )

        input_shape = embedding_output.size()[:-1]
        batch_size, seq_length = input_shape
        device = embedding_output.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        # Make the self-attention mask broadcastable to all heads.
        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device)

        # If a 2D or 3D attention mask is provided for the cross-attention,
        # make it broadcastable to [batch_size, num_heads, seq_length, seq_length].
        if encoder_hidden_states is not None:
            if isinstance(encoder_hidden_states, list):
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
            else:
                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)

            if isinstance(encoder_attention_mask, list):
                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
            elif encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
            else:
                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed: converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            query_length=query_length,
            **kwargs,
        )
        sequence_output = encoder_outputs.last_hidden_state
        pooled_output = sequence_output[:, 0, :]

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
        )
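        # Shape sketch for the Q-Former inputs (an illustration, not part of the original file; 32 is the
        # default `num_query_tokens` for InstructBLIP and the hidden sizes are placeholders):
        #
        #     query_embeds:           (batch_size, 32, qformer_hidden_size)            learned query tokens
        #     input_ids:              (batch_size, instruction_length)                 tokenized instruction
        #     encoder_hidden_states:  (batch_size, num_patches + 1, vision_hidden_size)
        #
        # The instruction embeddings are concatenated after the query tokens inside the embeddings module,
        # and callers typically keep only the first `query_embeds.shape[1]` positions of the output.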
@auto_docstring(
    custom_intro="""
    InstructBLIP base Model consisting of language model, qformer and vision encoder.
    """
)
class InstructBlipModel(InstructBlipPreTrainedModel):
    main_input_name = "pixel_values"
    _keep_in_fp32_modules = ["query_tokens"]

    def __init__(self, config: InstructBlipConfig):
        super().__init__(config)

        self.vision_model = InstructBlipVisionModel(config.vision_config)

        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = InstructBlipQFormerModel(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)

        self.language_model = AutoModel.from_config(config.text_config)

        if self.language_model._no_split_modules is not None:
            self._no_split_modules.extend(self.language_model._no_split_modules)

        if self.language_model._keep_in_fp32_modules is not None:
            self._keep_in_fp32_modules.extend(self.language_model._keep_in_fp32_modules)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def _tie_weights(self):
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + InstructBLIP + `accelerate`.
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models."
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipForConditionalGenerationModelOutput]:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        """
        # step 1: forward the images through the vision encoder,
        # to get image embeddings of shape (batch_size, seq_len, hidden_size)
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
            **kwargs,
        )
        image_embeds = vision_outputs[0]

        # step 2: forward the query tokens through the Q-Former, using the image embeddings for cross-attention
        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        # difference with BLIP-2 here: the instruction is also fed to the Q-Former
        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
            **kwargs,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        # step 3: use the language model, conditioned on the query outputs and the prompt
        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
            if attention_mask is None:
                attention_mask = torch.ones_like(input_ids)

        language_model_inputs = self.language_projection(query_output)
        language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype)
        special_image_mask = self.get_placeholder_mask(input_ids, inputs_embeds=inputs_embeds)
        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs)

        if self.config.use_decoder_only_language_model:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                **kwargs,
            )
        else:
            outputs = self.language_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                decoder_input_ids=decoder_input_ids,
                decoder_attention_mask=decoder_attention_mask,
                **kwargs,
            )

        return InstructBlipForConditionalGenerationModelOutput(
            vision_outputs=vision_outputs,
            qformer_outputs=query_outputs,
            language_model_outputs=outputs,
        )
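        # A rough usage sketch for the base model (illustrative only; the checkpoint name is the publicly
        # released InstructBLIP checkpoint and the processor call is assumed to produce the q-former inputs):
        #
        #     from transformers import InstructBlipProcessor, InstructBlipModel
        #
        #     processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
        #     model = InstructBlipModel.from_pretrained("Salesforce/instructblip-vicuna-7b")
        #     inputs = processor(images=image, text="Describe the image.", return_tensors="pt")
        #     outputs = model(**inputs)
        #     # outputs.vision_outputs, outputs.qformer_outputs and outputs.language_model_outputs hold the
        #     # intermediate results of the three sub-models described above.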
@auto_docstring(
    custom_intro="""
    InstructBLIP Model for generating text given an image and an optional text prompt. The model consists of a vision
    encoder, Querying Transformer (Q-Former) and a language model.

    One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue
    the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token.
    """
)
class InstructBlipForConditionalGeneration(InstructBlipPreTrainedModel, GenerationMixin):
    config: InstructBlipConfig
    main_input_name = "pixel_values"
    _can_compile_fullgraph = True
    _keep_in_fp32_modules = ["query_tokens"]

    def __init__(self, config: InstructBlipConfig):
        super().__init__(config)

        self.vision_model = InstructBlipVisionModel._from_config(config.vision_config)

        self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size))
        self.qformer = InstructBlipQFormerModel._from_config(config.qformer_config)

        self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size)

        if config.use_decoder_only_language_model:
            language_model = AutoModelForCausalLM.from_config(config.text_config)
        else:
            language_model = AutoModelForSeq2SeqLM.from_config(config.text_config)

        if language_model._no_split_modules is not None:
            self._no_split_modules.extend(language_model._no_split_modules)

        if language_model._keep_in_fp32_modules is not None:
            self._keep_in_fp32_modules.extend(language_model._keep_in_fp32_modules)

        self.language_model = language_model

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_output_embeddings(self, new_embeddings):
        self.language_model.set_output_embeddings(new_embeddings)

    def get_output_embeddings(self) -> nn.Module:
        return self.language_model.get_output_embeddings()

    def get_encoder(self):
        return self.language_model.get_encoder()

    def get_decoder(self):
        return self.language_model.get_decoder()

    def _tie_weights(self):
        if not self.config.use_decoder_only_language_model:
            self.language_model.encoder.embed_tokens = self.language_model.shared
            self.language_model.decoder.embed_tokens = self.language_model.shared

    def _preprocess_accelerate(self):
        r"""
        Some pre-processing hacks to make the model `accelerate` compatible. Check
        https://github.com/huggingface/transformers/pull/21707 for more details.
        """
        hf_device_map = self.hf_device_map

        if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1:
            # warn users about unexpected behavior when using multi-GPU + InstructBLIP + `accelerate`.
            logger.warning(
                "The `language_model` is not in the `hf_device_map` dictionary and you are running your script"
                " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`."
                " Please pass a `device_map` that contains `language_model` to remove this warning."
                " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for"
                " more details on creating a `device_map` for large models."
            )

        if hasattr(self.language_model, "_hf_hook"):
            self.language_model._hf_hook.io_same_device = True  # For `generate` compatibility

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.LongTensor] = None,
        interpolate_pos_encoding: bool = False,
        return_dict: bool = False,
    ):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        """
        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            interpolate_pos_encoding=interpolate_pos_encoding,
        )
        image_embeds = vision_outputs[0]

        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=image_embeds.device)
        if qformer_attention_mask is None:
            qformer_attention_mask = torch.ones_like(qformer_input_ids)
        qformer_attention_mask = torch.cat([query_attention_mask, qformer_attention_mask], dim=1)
        query_outputs = self.qformer(
            input_ids=qformer_input_ids,
            attention_mask=qformer_attention_mask,
            query_embeds=query_tokens,
            encoder_hidden_states=image_embeds,
            encoder_attention_mask=image_attention_mask,
        )
        query_output = query_outputs[0][:, : query_tokens.size(1), :]

        language_model_inputs = self.language_projection(query_output)

        if return_dict:
            return language_model_inputs, vision_outputs, query_outputs
        return language_model_inputs

    def get_placeholder_mask(self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        qformer_input_ids: torch.LongTensor,
        qformer_attention_mask: Optional[torch.Tensor] = None,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        interpolate_pos_encoding: bool = False,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, InstructBlipForConditionalGenerationModelOutput]:
        r"""
        qformer_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary of the Q-Former. Input tokens can optionally be provided
            to serve as text prompt, which the Q-Former model will encode.

            Indices can be obtained using [`InstructBlipProcessor`]. See [`InstructBlipProcessor.__call__`] for
            details.

            [What are input IDs?](../glossary#input-ids)
        qformer_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
            be used by default.

            Only relevant in case an encoder-decoder language model (like T5) is used.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the language modeling loss. Indices should be in `[-100, 0, ..., config.vocab_size -
            1]`. All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> model = InstructBlipForConditionalGeneration.from_pretrained("Salesforce/instructblip-vicuna-7b")
        >>> processor = InstructBlipProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

        >>> device = "cuda" if torch.cuda.is_available() else "cpu"
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> url = "https://raw.githubusercontent.com/salesforce/LAVIS/main/docs/_static/Confusing-Pictures.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        >>> prompt = "What is unusual about this image?"
        >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)

        >>> outputs = model.generate(
        ...     **inputs,
        ...     do_sample=False,
        ...     num_beams=5,
        ...     max_length=256,
        ...     min_length=1,
        ...     top_p=0.9,
        ...     repetition_penalty=1.5,
        ...     length_penalty=1.0,
        ...     temperature=1,
        ... )
        >>> generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip()
        >>> print(generated_text)
        The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV, which is parked in the middle of a busy city street. This is an unconventional approach to ironing clothes, as it requires the man to balance himself and his ironing equipment on top of the vehicle while navigating through traffic. Additionally, the presence of taxis and other vehicles in the scene further emphasizes the unusual nature of this situation.
        ```Tr  r	  rx   r)  Nr  r  r   )r,   r-  r  r)  )r  r   r
  r  r-  )r+   r,   r-   r.   r/   rF   )r+  r  rC   r  r~   rB  r|   r  r  rI   r  r  loss_functionr  r  r+   r,   r*   )r8   ry   r  r	  r  r   r
  r  r  r-  rx   r   r  r-   r  r  r  r,   r+   s                      r9   r   z,InstructBlipForConditionalGeneration.forward.  s   Z @D?V?V/#9%= @W @
<~}  7D557	BM!"__Y7N 5 8 89M9M}ObOb c!66yP]6^%445GI^_;;66)d)) +- G
 QZFD!)t)) !&T[[=T=T=_=_ci
 %)F=!)d)) +-"3'= G <<D^^F>))#*
 	
r>   c                 B   t        | d      r| j                          |j                  d   }	| j                  ||||d      \  }
}}||| j                  j
                  g| j                  j                  z  }|| j                  j                  j                  gz   }t        j                  |gt        j                  |j                        }|j                  |	d      } | j                         |      }|t        j                  |      }|
j!                  |j                  |j"                        }
| j%                  ||      }|j'                  ||
      }||d}| j(                  j                  j*                  s||d	<    | j(                  j,                  d
i ||}|S )a  
        Overrides `generate` function to be able to use the model as a conditional generator.

        Args:
            pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)):
                Input images to be processed.
            qformer_input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt to be fed to the Q-Former module.
            qformer_attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                The sequence used as a prompt for the generation.
            attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*):
                Mask to avoid performing attention on padding token indices.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embedded representation of the inputs. Should be float, not int tokens.
            interpolate_pos_encoding (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the positional encoding of the image embeddings.

        Returns:
            captions (list): A list of strings of length batch_size * num_captions.
        r  r   Tr/  rA  r$   r  r  r  rF   )r%  r  rj   r+  rI   image_token_indexr  r  bos_token_idrC   r   rD  rB  repeatr  r  r~   r|   r  r  r  is_encoder_decodergenerate)r8   ry   r  r	  r  r   r  rx   generate_kwargsr   r  r-   r  image_tokensstart_tokensr  inputsr  s                     r9   r6  z-InstructBlipForConditionalGeneration.generate  s   D 4)'')!''*
?C?V?V/#9%= @W @
<~}    $ = =>A]A]]+t{{/F/F/S/S.TT!LL,uzzR^ReRef	%,,Z;	7D557	BM!"__Y7N 5 8 89M9M}ObOb c!66yP]6^%445GI^_#0NS""))<<"+F;.$%%..KK?Kr>   )NFF)NNNNNNNF)NNNNNF)%r?   r@   rA   r%   rE   r  r   r  rP   r  r  r  r   Moduler!  r#  r&  r  r  rC   rD   r  r   r   r+  r  r   r   r   r   r   r;   r*   r   no_gradr6  r   r   s   @r9   r   r     s    $O!+,1 4:8B;ryy ;11R?0 >B38&+/%''/% !++/% !))9)9 :	/%
 #+4./% d^/%b"e.>.> "uO`O` " 
 >B15598<=A59-1).}
''}
 !,,}
 !))9)9 :	}

 E--.}
 !!1!12}
 $E$4$45}
 !))9)9 :}
   1 12}
 ))*}
 #'}
 +,}
 
uEE	F}
  }
~ U]]_ 9==A045959).C''C $E$4$45C !))9)9 :	C
 E,,-C !!1!12C   1 12C #'C 
		C Cr>   r   )r  r   r   r   r  )r   )NrB   rF  dataclassesr   typingr   r   r   r   rC   r   activationsr
   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   utils.genericr   r   autor!   r"   r#   configuration_instructblipr%   r&   r'   
get_loggerr?   r  r*   r;  rH   r   floatr   r   r   r   r   r  r  r   r   rj  ry  r  r  r  r   r  r   r   __all__rF   r>   r9   <module>rN     sp   "  ! 1 1   ! ) B 9  G & l l j j ? I I o o 
		H	% 
k 
 
<G299 Gd %II%<<% 
% <<	%
 U\\*% % %0I)BII I)Zbii  9 D #-/ #- #-N@")) @F19 1hw.BII w.vBII + 299 + ^bii  		 U9 Ur$
 $
N0BII 0fi
: i
X 
e
3 e

e
P S+F SSl
r>   