
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from ..idefics3.configuration_idefics3 import Idefics3Config, Idefics3VisionConfig
from ..idefics3.image_processing_idefics3 import Idefics3ImageProcessor
from ..idefics3.image_processing_idefics3_fast import Idefics3ImageProcessorFast
from ..idefics3.modeling_idefics3 import (
    Idefics3BaseModelOutputWithPast,
    Idefics3ForConditionalGeneration,
    Idefics3Model,
    Idefics3PreTrainedModel,
    Idefics3VisionTransformer,
)


logger = logging.get_logger(__name__)


class SmolVLMVisionConfig(Idefics3VisionConfig):
    r"""
This is the configuration class to store the configuration of a [`SmolVLMVisionModel`]. It is used to instantiate a
SmolVLM vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
[google/siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) used in SmolVLM
[HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1152):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

Example:

```python
>>> from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
>>> from transformers.models.smolvlm.configuration_smolvlm import SmolVLMVisionConfig

>>> # Initializing a SmolVLMVisionConfig with google/siglip-so400m-patch14-384 style configuration
>>> configuration = SmolVLMVisionConfig()

>>> # Initializing a SmolVLMVisionTransformer (with random weights) from the google/siglip-so400m-patch14-384 style configuration
>>> model = SmolVLMVisionTransformer(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
    ```"""

    model_type = "smolvlm_vision"


class SmolVLMPreTrainedModel(Idefics3PreTrainedModel):
    pass


class SmolVLMVisionTransformer(Idefics3VisionTransformer):
    pass


class SmolVLMConfig(Idefics3Config):
    r"""
This is the configuration class to store the configuration of a [`SmolVLMModel`]. It is used to instantiate a
SmolVLM model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SmolVLM model
[HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should cache the key/value pairs of the attention mechanism. Only
        relevant if `config.is_decoder=True`.
    image_token_id (`int`, *optional*, defaults to 128257):
        The id of the "image" token.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether or not to tie the word embeddings with the token embeddings.
    vision_config (`SmolVLMVisionConfig` or `dict`, *optional*, defaults to `SmolVLMVisionConfig`):
        Custom vision config or dict for the vision tower
    text_config (`PretrainedConfig` or `dict`, *optional*, defaults to `LlamaConfig`):
        Custom text config or dict for the text model
    scale_factor (`int`, *optional*, defaults to 2):
        The scale factor for the image encoder.
    pad_token_id (`int`, *optional*, defaults to 128002):
        The id of the padding token.

Example:
```python
>>> from transformers import SmolVLMModel, SmolVLMConfig
>>> # Initializing configuration
>>> configuration = SmolVLMConfig()
>>> # Initializing a model from the configuration
>>> model = SmolVLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
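>>> # The sub-configs documented above are exposed as attributes, e.g. configuration.vision_config and configuration.text_config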
    ```"""

    model_type = "smolvlm"


class SmolVLMImageProcessor(Idefics3ImageProcessor):
    pass


class SmolVLMImageProcessorFast(Idefics3ImageProcessorFast):
    pass


class SmolVLMBaseModelOutputWithPast(Idefics3BaseModelOutputWithPast):
    pass


class SmolVLMModel(Idefics3Model):
    """
A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
in forward. Instead, we override inputs_merger here with custom logic.
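
In practice this means the image-patch embeddings returned by the vision tower are scattered into the
positions of the `<image>` placeholder tokens; each sample must therefore contain a whole number of
image blocks, i.e. its `<image>` token count must be divisible by the per-image patch count.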
    """

    def inputs_merger(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor
    ) -> torch.Tensor:
        _, patch_size, _ = image_hidden_states.shape

        if input_ids is None:
            image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            image_mask = image_mask[..., 0]  # compare against the image-token embedding, keep a (batch, seq) mask
        else:
            image_mask = input_ids == self.config.image_token_id

        num_image_tokens = image_mask.sum(dim=1)
        if not torch.all(num_image_tokens % patch_size == 0):
            raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")

        blocks_per_sample = num_image_tokens // patch_size

        offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
        block_offset = offsets[:-1]
        row_cum = image_mask.cumsum(dim=-1)
        chunk_idx = (row_cum - 1) // patch_size
        local_idx = (row_cum - 1) % patch_size
        block_idx = block_offset.unsqueeze(1) + chunk_idx

        image_embeds = torch.zeros_like(inputs_embeds)
        image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]

        merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
        return merged_embeds

    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            pixel_attention_mask (`torch.LongTensor`, *optional*):
                The attention mask indicating padded regions in the image.
        """
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

        # Remove padding images - padding images are full 0.
        nb_values_per_image = pixel_values.shape[1:].numel()
        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
        if not any(real_images_inds):
            # no images, leave one empty image.
            real_images_inds[0] = True

        pixel_values = pixel_values[real_images_inds].contiguous()

        # Handle the vision attention mask
        if pixel_attention_mask is None:
            pixel_attention_mask = torch.ones(
                size=[pixel_values.shape[i] for i in (0, 2, 3)],
                dtype=torch.bool,
                device=pixel_values.device,
            )
        else:
            # Remove padding images from the mask
            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()

        patch_size = self.config.vision_config.patch_size
        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

        # Get sequence from the vision encoder
        image_hidden_states = self.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
        image_hidden_states = image_hidden_states.last_hidden_state

        # Modality projection & resampling
        image_hidden_states = self.connector(image_hidden_states)
        return image_hidden_states
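
    # Rough shape walk-through for get_image_features (illustrative only; the 512x512 resolution and
    # patch_size=16 used below are assumptions, not values fixed by the model):
    #   pixel_values          (batch_size, num_images, 3, 512, 512)   padded stack of images per sample
    #   packed real images    (image_batch_size, 3, 512, 512)         all-zero padding images dropped
    #   patch_attention_mask  (image_batch_size, 32, 32)              one entry per 16x16 patch
    #   connector output      (image_batch_size, seq_len, hidden)     what inputs_merger scatters into the text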
    @can_return_tuple
    @auto_docstring(
        custom_intro="""
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        """
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.BoolTensor] = None,
        image_hidden_states: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, SmolVLMBaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.training and self.text_model.gradient_checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(input_ids.device)

        # START VISUAL INPUTS INTEGRATION
        if pixel_values is not None and image_hidden_states is not None:
            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
        elif pixel_values is not None:
            image_hidden_states = self.get_image_features(pixel_values, pixel_attention_mask).to(inputs_embeds.device)
        elif image_hidden_states is not None:
            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)

        if image_hidden_states is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self.inputs_merger(
                input_ids=input_ids,
                inputs_embeds=inputs_embeds,
                image_hidden_states=image_hidden_states,
            )

        outputs = self.text_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return SmolVLMBaseModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_hidden_states,
        )


class SmolVLMForConditionalGeneration(Idefics3ForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        self.model = SmolVLMModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def forward(self, **super_kwargs):
        r"""
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The hidden states of the image encoder after modality projection.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from io import BytesIO

>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> from transformers.image_utils import load_image

>>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
>>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
>>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
>>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

>>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
>>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")

>>> # Create inputs
>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {"type": "video", "path": "path/to/video"},
...             {"type": "text", "text": "What is happening in this video?"},
...         ]
...     }
... ]

>>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True)

>>> # Generate
>>> generated_ids = model.generate(**inputs, max_new_tokens=256)
>>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

>>> print(generated_texts)
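>>> # generated_texts is a list with one decoded string per prompt in the batch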
        ```"""
        super().forward(**super_kwargs)


__all__ = [
    "SmolVLMVisionConfig",
    "SmolVLMConfig",
    "SmolVLMImageProcessor",
    "SmolVLMImageProcessorFast",
    "SmolVLMForConditionalGeneration",
    "SmolVLMPreTrainedModel",
    "SmolVLMModel",
    "SmolVLMVisionTransformer",
]