
"""PyTorch Fuyu model."""

from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModel
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_fuyu import FuyuConfig


logger = logging.get_logger(__name__)
FuyuConfigc                   F    \ rS rSr% \\S'   SrSrSrSr	Sr
Sr/ rSrS rSrg)	FuyuPreTrainedModel#   configfuyuTpast_key_valuesc                 "   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )Ng        )meanstd)r   initializer_range
isinstancer   Linearweightdatanormal_biaszero_	Embeddingpadding_idx)selfmoduler   s      ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/fuyu/modeling_fuyu.py_init_weights!FuyuPreTrainedModel._init_weights/   s    kk++fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .     N)__name__
__module____qualname____firstlineno__r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placementr'   __static_attributes__r*   r)   r&   r   r   #   s<    &*#"&N"3	?r)   r   zt
@auto_docstring(
    custom_intro="""
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class FuyuModel(FuyuPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModel.from_config(config.text_config)

        self.vision_embed_tokens = nn.Linear(
            config.patch_size * config.patch_size * config.num_channels, config.hidden_size
        )

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def gather_continuous_embeddings(
        self,
        word_embeddings: torch.Tensor,
        continuous_embeddings: list[torch.Tensor],
        image_patch_input_indices: torch.Tensor,
    ) -> torch.Tensor:
        """This function places the `continuous_embeddings` into the `word_embeddings` at the locations
        indicated by `image_patch_input_indices`. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`list[torch.FloatTensor]`):
                List of continuous embeddings. The length of the list is the batch size. Each entry has shape
                `[num_image_embeddings, hidden]`, and `num_image_embeddings` needs to match the number of non-negative
                indices in `image_patch_input_indices` for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        """
        if word_embeddings.shape[0] != len(continuous_embeddings):
            raise ValueError(
                f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
            )

        output_embeddings = word_embeddings.clone()
        for batch_idx in range(word_embeddings.shape[0]):
            # Positions in `word_embeddings` to overwrite are those whose entry in
            # `image_patch_input_indices` is non-negative.
            dst_indices = torch.nonzero(image_patch_input_indices[batch_idx] >= 0, as_tuple=True)[0]
            # The values at those positions index into `continuous_embeddings` for this batch element.
            src_indices = image_patch_input_indices[batch_idx][dst_indices]
            if src_indices.shape[0] > continuous_embeddings[batch_idx].shape[0]:
                raise ValueError(
                    f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
                    f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
                )
            output_embeddings[batch_idx, dst_indices] = continuous_embeddings[batch_idx][src_indices].to(
                output_embeddings.device
            )
        return output_embeddings

    def get_image_features(self, pixel_values: torch.FloatTensor, **kwargs):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        """
        patch_embeddings = [
            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)).squeeze(0)
            for patch in pixel_values
        ]
        return patch_embeddings

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains the multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder
        token count is equal to the length of the multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        if image_patches is not None:
            patch_embeddings = self.get_image_features(image_patches)
            patch_embeddings = torch.cat(patch_embeddings, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=patch_embeddings
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, patch_embeddings)

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=return_dict,
            **kwargs,
        )
        return outputs
@auto_docstring(
    custom_intro="""
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    """
)
class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_embed_tokens": "model.vision_embed_tokens",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.model = FuyuModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ...,
            config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=True,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute the logits that are actually needed (e.g. the last position during generation).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        image_patches=None,
        image_patches_indices=None,
        cache_position=None,
        **kwargs,
    ):
        # Overwritten -- image inputs are only forwarded on the first (prefill) step of generation.
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            cache_position=cache_position,
            **kwargs,
        )

        if cache_position[0] != 0:
            model_inputs["image_patches_indices"] = None
            model_inputs["image_patches"] = None

        return model_inputs
generationr   modeling_outputsr	   modeling_utilsr
   models.auto.modeling_autor   utilsr   r   r   configuration_fuyur   
get_loggerr+   loggerr   r;   r   __all__r*   r)   r&   <module>r      s     "      ) 6 - 2 > > * 
		H	% ?/ ? ?. 
u# u
up 
V)? V
Vr Br)   