
    <h0                     <   S SK JrJr  S SKrS SKJr  S SKJrJrJrJ	r	J
r
  SSKJr  SSKJr  SSKJrJr  S	S
KJr  \R(                  " \5      r " S S\	5      r " S S\5      r " S S\R2                  5      r " S S\
5      r " S S\5      r " S S\5      r/ SQrg)    )OptionalUnionN)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)Cache)auto_docstringlogging   )VipLlavaConfigc                       \ rS rSrSrg)VipLlavaModelOutputWithPast&    N__name__
__module____qualname____firstlineno____static_attributes__r       e/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/vipllava/modular_vipllava.pyr   r   &       r   r   c                       \ rS rSrSrg)VipLlavaCausalLMOutputWithPast*   r   Nr   r   r   r   r    r    *   r   r   r    c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )VipLlavaMultiModalProjector.   configc                 B  > [         TU ]  5         [        UR                  [        5      (       a  SO[        UR                  5      n[        R                  " X!R                  R                  -  UR                  S9U l        [        R                  " X!R                  R                  -  UR                  R                  SS9U l        [        UR                      U l        [        R                  " UR                  R                  UR                  R                  SS9U l        g )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr%   num_feature_layers	__class__s      r   r*   $VipLlavaMultiModalProjector.__init__/   s    ",V-I-I3"O"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ N)r3   r6   r8   r9   )r:   hidden_statess     r   forward#VipLlavaMultiModalProjector.forward>   sB    00?m4/m4r   )r8   r6   r9   r3   )	r   r   r   r   r   r*   rA   r   __classcell__)r<   s   @r   r#   r#   .   s    m~ m r   r#   c                       \ rS rSrSrg)VipLlavaPreTrainedModelF   r   Nr   r   r   r   rE   rE   F   r   r   rE   c                      \ rS rSr SS\R
                  S\\\\	\   4      4S jjr
\            SS\R                  S\R
                  S\\R                     S\\R                     S	\\   S
\\R
                     S\\\\	\   4      S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jj5       rSrg)VipLlavaModelJ   Npixel_valuesr,   c                 Z   Ub  UOU R                   R                  nU R                  USS9n[        U[        5      (       a  UR
                  U   SS2SS24   nO;U Vs/ sH  oSR
                  U   SS2SS24   PM     nn[        R                  " USS9nU R                  U5      nU$ s  snf )a  
        Obtains image last hidden states from the vision tower and applies multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`):
                The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, list[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )
        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)

        # Select the hidden states of a single layer, or concatenate the selected
        # layers along the feature dimension; `[:, 1:]` drops the CLS token either way.
        if isinstance(vision_feature_layers, int):
            image_features = image_outputs.hidden_states[vision_feature_layers][:, 1:]
        else:
            image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
            image_features = torch.cat(image_features, dim=-1)
        image_features = self.multi_modal_projector(image_features)
        return image_features

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaModelOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if pixel_values is not None:
            image_features = self.get_image_features(pixel_values, vision_feature_layers=vision_feature_layers)
            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
            # Replace the embeddings of the <image> placeholder tokens with the
            # projected image features.
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        output = VipLlavaModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_features if pixel_values is not None else None,
        )
        return output if return_dict else output.to_tuple()


class VipLlavaForConditionalGeneration(LlavaForConditionalGeneration):
    def get_image_features(
        self, pixel_values: torch.FloatTensor, vision_feature_layers: Optional[Union[int, list[int]]] = None
    ):
        return self.model.get_image_features(pixel_values=pixel_values, vision_feature_layers=vision_feature_layers)

    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        pixel_values: torch.FloatTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        vision_feature_layers: Optional[Union[int, list[int]]] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **lm_kwargs,
    ) -> Union[tuple, VipLlavaCausalLMOutputWithPast]:
        r"""
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        vision_feature_layers = (
            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
        )

        outputs = self.model(
            input_ids=input_ids,
            pixel_values=pixel_values,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            vision_feature_layers=vision_feature_layers,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **lm_kwargs,
        )

        hidden_states = outputs[0]
        # Compute logits only for the last `logits_to_keep` positions (an int), or
        # for an explicit index tensor; `0` keeps every position.
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)

        return VipLlavaCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=outputs.image_hidden_states,
        )


__all__ = ["VipLlavaModel", "VipLlavaForConditionalGeneration", "VipLlavaPreTrainedModel"]