
"""PyTorch Fuyu model."""

from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache
from ...generation import GenerationMixin
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModel
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_fuyu import FuyuConfig


logger = logging.get_logger(__name__)
FuyuConfigc                   F    \ rS rSr% \\S'   SrSrSrSr	Sr
Sr/ rSrS rSrg)	FuyuPreTrainedModel#   configfuyuTpast_key_valuesc                 "   U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         g g g )Ng        )meanstd)r   initializer_range
isinstancer   Linearweightdatanormal_biaszero_	Embeddingpadding_idx)selfmoduler   s      ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/fuyu/modeling_fuyu.py_init_weights!FuyuPreTrainedModel._init_weights/   s    kk++fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> . .     N)__name__
__module____qualname____firstlineno__r   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placementr'   __static_attributes__r*   r)   r&   r   r   #   s<    &*#"&N"3	?r)   r   zt
@auto_docstring(
    custom_intro="""
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    """
)
class FuyuModel(FuyuPreTrainedModel):
    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.text_config.vocab_size
        self.language_model = AutoModel.from_config(config.text_config)

        self.vision_embed_tokens = nn.Linear(
            config.patch_size * config.patch_size * config.num_channels, config.hidden_size
        )

        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.language_model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.language_model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.language_model = decoder

    def get_decoder(self):
        return self.language_model

    def gather_continuous_embeddings(
        self,
        word_embeddings: torch.Tensor,
        continuous_embeddings: list[torch.Tensor],
        image_patch_input_indices: torch.Tensor,
    ) -> torch.Tensor:
        """This function places the `continuous_embeddings` into the `word_embeddings` at the locations
        indicated by `image_patch_input_indices`. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`list[torch.FloatTensor]`):
                List of continuous embeddings. The length of the list is the batch size. Each entry has shape
                `[num_image_embeddings, hidden]`, and `num_image_embeddings` needs to match the number of non-negative
                indices in `image_patch_input_indices` for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        """
        if word_embeddings.shape[0] != len(continuous_embeddings):
            raise ValueError(
                f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
            )

        output_embeddings = word_embeddings.clone()
        for batch_idx in range(word_embeddings.shape[0]):
            # Positions in `word_embeddings` to overwrite are those whose entry in
            # `image_patch_input_indices` is non-negative.
            dst_indices = torch.nonzero(image_patch_input_indices[batch_idx] >= 0, as_tuple=True)[0]
            # The values at those positions index into `continuous_embeddings` for this batch element.
            src_indices = image_patch_input_indices[batch_idx][dst_indices]
            if src_indices.shape[0] > continuous_embeddings[batch_idx].shape[0]:
                raise ValueError(
                    f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
                    f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
                )
            output_embeddings[batch_idx, dst_indices] = continuous_embeddings[batch_idx][src_indices].to(
                output_embeddings.device
            )
        return output_embeddings

    def get_image_features(self, pixel_values: torch.FloatTensor, **kwargs):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        """
        patch_embeddings = [
            self.vision_embed_tokens(patch.to(self.vision_embed_tokens.weight.dtype)).squeeze(0)
            for patch in pixel_values
        ]
        return patch_embeddings

    def get_placeholder_mask(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
    ):
        """
        Obtains the multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder
        token count is equal to the length of the multimodal features. If the lengths are different, an error is raised.
        """
        if input_ids is None:
            special_image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            special_image_mask = special_image_mask.all(-1)
        else:
            special_image_mask = input_ids == self.config.image_token_id

        n_image_tokens = special_image_mask.sum()
        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
        n_image_features = image_features.shape[0] * image_features.shape[1]
        if inputs_embeds[special_image_mask].numel() != image_features.numel():
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )
        return special_image_mask

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if position_ids is None:
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
            position_ids = torch.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
            )
            position_ids = position_ids.unsqueeze(0)

        if inputs_embeds is None:
            inputs_embeds = self.language_model.get_input_embeddings()(input_ids)

        if image_patches is not None:
            patch_embeddings = self.get_image_features(image_patches)
            patch_embeddings = torch.cat(patch_embeddings, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
            special_image_mask = self.get_placeholder_mask(
                input_ids, inputs_embeds=inputs_embeds, image_features=patch_embeddings
            )
            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, patch_embeddings)

        outputs = self.language_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=return_dict,
            **kwargs,
        )
        return outputs
@auto_docstring(
    custom_intro="""
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    """
)
class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
    _checkpoint_conversion_mapping = {
        "^language_model.model": "model.language_model",
        "^vision_embed_tokens": "model.vision_embed_tokens",
        "^language_model.lm_head": "lm_head",
    }
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config: FuyuConfig):
        super().__init__(config)
        self.model = FuyuModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.model.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.model.set_input_embeddings(value)

    def set_decoder(self, decoder):
        self.model.set_decoder(decoder)

    def get_decoder(self):
        return self.model.get_decoder()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        image_patches: torch.Tensor = None,
        image_patches_indices: torch.Tensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are
            ignored (masked); the loss is only computed for the tokens with labels in `[0, ...,
            config.text_config.vocab_size]`.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=True,
            **kwargs,
        )

        hidden_states = outputs[0]
        # Only compute the logits that are actually needed (e.g. the last position during generation).
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        logits = self.lm_head(hidden_states[:, slice_indices, :])

        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        image_patches=None,
        image_patches_indices=None,
        cache_position=None,
        **kwargs,
    ):
        # Overwritten -- image inputs are only forwarded on the first (prefill) step of generation.
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            image_patches=image_patches,
            image_patches_indices=image_patches_indices,
            cache_position=cache_position,
            **kwargs,
        )

        if cache_position[0] != 0:
            model_inputs["image_patches_indices"] = None
            model_inputs["image_patches"] = None

        return model_inputs
generationr   modeling_outputsr	   modeling_utilsr
   models.auto.modeling_autor   utilsr   r   r   configuration_fuyur   
get_loggerr+   loggerr   r;   r   __all__r*   r)   r&   <module>r      s     "      ) 6 - 2 > > * 
		H	% ?/ ? ?. 
u# u
up 
V)? V
Vr Br)   