
from dataclasses import dataclass
from typing import Optional, Union

import torch
from torch import nn

from ...activations import ACT2FN
from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, can_return_tuple
from ..auto import AutoModel
from .configuration_vipllava import VipLlavaConfig


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for VipLlava outputs, with hidden states and attentions.
    """
)
class VipLlavaModelOutputWithPast(BaseModelOutputWithPast):
    r"""
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    image_hidden_states: Optional[torch.FloatTensor] = None


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for VipLlava causal language model (or autoregressive) outputs.
    """
)
class VipLlavaCausalLMOutputWithPast(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
        `(batch_size, num_heads, sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
    image_hidden_states: Optional[torch.FloatTensor] = None


class VipLlavaMultiModalProjector(nn.Module):
    def __init__(self, config: VipLlavaConfig):
        super().__init__()
        # The projector consumes the concatenation of one or several vision-tower layers.
        num_feature_layers = 1 if isinstance(config.vision_feature_layers, int) else len(config.vision_feature_layers)
        self.projector_layernorm = nn.LayerNorm(
            num_feature_layers * config.vision_config.hidden_size, eps=config.projector_layernorm_eps
        )
        self.linear_1 = nn.Linear(
            num_feature_layers * config.vision_config.hidden_size,
            config.text_config.hidden_size,
            bias=True,
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)

    def forward(self, hidden_states):
        hidden_states = self.projector_layernorm(hidden_states)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


@auto_docstring
class VipLlavaPreTrainedModel(PreTrainedModel):
    config: VipLlavaConfig
    base_model_prefix = ""
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _can_compile_fullgraph = True
    _supports_flex_attn = True
    _supports_attention_backend = True


@auto_docstring(
    custom_intro="""
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class VipLlavaModel(VipLlavaPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.vision_tower = AutoModel.from_config(config.vision_config)
        self.multi_modal_projector = VipLlavaMultiModalProjector(config)
        self.language_model = AutoModel.from_config(config.text_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        """
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

        # Select the requested hidden-state layer(s), drop the CLS token, and concatenate along the feature dim.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)

        image_features = self.multi_modal_projector(image_features)
        return image_features

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder
        token count is equal to the length of multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaModelOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(
                pixel_values=pixel_values, vision_feature_layers=vision_feature_layers
            )
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()


@auto_docstring(
    custom_intro="""
    The VIPLLAVA model which consists of a vision backbone and a language model.
    """
)
class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_tower": "model.vision_tower",
        "^multi_modal_projector": "model.multi_modal_projector",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: VipLlavaConfig):
        super().__init__(config)
        self.model = VipLlavaModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        return self.model.get_image_features(pixel_values=pixel_values, vision_feature_layers=vision_feature_layers)

    # Expose the sub-modules of `self.model` as properties for backward compatibility.
    @property
    def language_model(self):
        return self.model.language_model

    @property
    def vision_tower(self):
        return self.model.vision_tower

    @property
    def multi_modal_projector(self):
        return self.model.multi_modal_projector

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Only compute the logits that are actually needed (the last `logits_to_keep` positions by default).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        inputs_embeds=None,
        pixel_values=None,
        attention_mask=None,
        cache_position=None,
        logits_to_keep=None,
        **kwargs,
    ):
        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )

        if cache_position[0] == 0:
            # Pixel values are only needed on the first forward pass of a generation; during cached decoding the
            # input ids no longer contain image placeholder tokens, so the image inputs are dropped.
            model_inputs["pixel_values"] = pixel_values

        return model_inputs


__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]
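

# Minimal, self-contained sketch of the VipLlava multimodal path: select several vision-tower layers,
# drop the CLS token, concatenate them, project to the text width, and scatter the result into the text
# embeddings at `<image>` placeholder positions. All sizes, the placeholder id, and the stand-in
# `nn.Sequential` projector below are illustrative assumptions, not the released checkpoint's values;
# the real flow goes through `VipLlavaModel.get_image_features` and `VipLlavaMultiModalProjector`.
if __name__ == "__main__":
    torch.manual_seed(0)

    # Toy sizes chosen for illustration only.
    num_images, num_patches, vision_hidden = 1, 4, 8  # per-layer vision output (plus a CLS token)
    text_hidden, placeholder_id = 16, 99  # hypothetical text width and <image> token id
    feature_layers = [-2, -5]  # short stand-in for the usual [-2, -5, -8, -11, 6]

    # Pretend vision-tower hidden states: one tensor per layer, CLS token at position 0.
    layer_outputs = [torch.randn(num_images, 1 + num_patches, vision_hidden) for _ in range(6)]

    # 1) Select the requested layers, drop the CLS token, and concatenate on the feature dimension.
    selected = [layer_outputs[idx][:, 1:] for idx in feature_layers]
    image_features = torch.cat(selected, dim=-1)  # (num_images, num_patches, len(layers) * vision_hidden)

    # 2) Stand-in projector with the same layout as the multimodal projector:
    #    LayerNorm over the concatenated features, then two linear layers with a GELU in between.
    projector = nn.Sequential(
        nn.LayerNorm(len(feature_layers) * vision_hidden),
        nn.Linear(len(feature_layers) * vision_hidden, text_hidden),
        nn.GELU(),
        nn.Linear(text_hidden, text_hidden),
    )
    projected = projector(image_features)  # (num_images, num_patches, text_hidden)

    # 3) Scatter the projected features into the text embeddings wherever the placeholder token sits,
    #    mirroring the `masked_scatter` step in `VipLlavaModel.forward`.
    input_ids = torch.tensor([[1, placeholder_id, placeholder_id, placeholder_id, placeholder_id, 2]])
    inputs_embeds = torch.randn(1, input_ids.shape[1], text_hidden)
    placeholder_mask = (input_ids == placeholder_id).unsqueeze(-1).expand_as(inputs_embeds)
    inputs_embeds = inputs_embeds.masked_scatter(placeholder_mask, projected.to(inputs_embeds.dtype))

    print("image_features:", tuple(image_features.shape))  # (1, 4, 16)
    print("projected:     ", tuple(projected.shape))  # (1, 4, 16)
    print("inputs_embeds: ", tuple(inputs_embeds.shape))  # (1, 6, 16)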