
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn

from ...cache_utils import Cache, DynamicCache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging
from ..idefics3.configuration_idefics3 import Idefics3Config, Idefics3VisionConfig
from ..idefics3.image_processing_idefics3 import Idefics3ImageProcessor
from ..idefics3.image_processing_idefics3_fast import Idefics3ImageProcessorFast
from ..idefics3.modeling_idefics3 import (
    Idefics3BaseModelOutputWithPast,
    Idefics3ForConditionalGeneration,
    Idefics3Model,
    Idefics3PreTrainedModel,
    Idefics3VisionTransformer,
)


logger = logging.get_logger(__name__)


class SmolVLMVisionConfig(Idefics3VisionConfig):
    r"""
This is the configuration class to store the configuration of a [`SmolVLMVisionModel`]. It is used to instantiate a
SmolVLM vision encoder according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
[google/siglip-so400m-patch14-384](https://huggingface.co/google/siglip-so400m-patch14-384) used in SmolVLM
[HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct).

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    hidden_size (`int`, *optional*, defaults to 1152):
        Dimensionality of the encoder layers and the pooler layer.
    intermediate_size (`int`, *optional*, defaults to 3072):
        Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
    num_hidden_layers (`int`, *optional*, defaults to 12):
        Number of hidden layers in the Transformer encoder.
    num_attention_heads (`int`, *optional*, defaults to 16):
        Number of attention heads for each attention layer in the Transformer encoder.
    num_channels (`int`, *optional*, defaults to 3):
        Number of channels in the input images.
    image_size (`int`, *optional*, defaults to 224):
        The size (resolution) of each image.
    patch_size (`int`, *optional*, defaults to 32):
        The size (resolution) of each patch.
    hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
        The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
        `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
    layer_norm_eps (`float`, *optional*, defaults to 1e-06):
        The epsilon used by the layer normalization layers.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.

Example:

```python
>>> from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
>>> from transformers.models.smolvlm.configuration_smolvlm import SmolVLMVisionConfig

>>> # Initializing a SmolVLMVisionConfig with google/siglip-so400m-patch14-384 style configuration
>>> configuration = SmolVLMVisionConfig()

>>> # Initializing a SmolVLMVisionTransformer (with random weights) from the google/siglip-so400m-patch14-384 style configuration
>>> model = SmolVLMVisionTransformer(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
    ```"""

    model_type = "smolvlm_vision"


class SmolVLMPreTrainedModel(Idefics3PreTrainedModel):
    pass


class SmolVLMVisionTransformer(Idefics3VisionTransformer):
    pass


class SmolVLMConfig(Idefics3Config):
    r"""
This is the configuration class to store the configuration of a [`SmolVLMModel`]. It is used to instantiate a
SmolVLM model according to the specified arguments, defining the model architecture. Instantiating a
configuration with the defaults will yield a similar configuration to that of the SmolVLM model
[HuggingFaceTB/SmolVLM2-2.2B-Instruct](https://huggingface.co/HuggingFaceTB/SmolVLM2-2.2B-Instruct) architecture.

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    use_cache (`bool`, *optional*, defaults to `True`):
        Whether or not the model should cache the key/value pairs of the attention mechanism. Only
        relevant if `config.is_decoder=True`.
    image_token_id (`int`, *optional*, defaults to 128257):
        The id of the "image" token.
    tie_word_embeddings (`bool`, *optional*, defaults to `False`):
        Whether or not to tie the word embeddings with the token embeddings.
    vision_config (`SmolVLMVisionConfig` or `dict`, *optional*, defaults to `SmolVLMVisionConfig`):
        Custom vision config or dict for the vision tower
    text_config (`PretrainedConfig` or `dict`, *optional*, defaults to `LlamaConfig`):
        Custom text config or dict for the text model
    scale_factor (`int`, *optional*, defaults to 2):
        The scale factor for the image encoder.
    pad_token_id (`int`, *optional*, defaults to 128002):
        The id of the padding token.

Example:
```python
>>> from transformers import SmolVLMModel, SmolVLMConfig
>>> # Initializing configuration
>>> configuration = SmolVLMConfig()
>>> # Initializing a model from the configuration
>>> model = SmolVLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
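>>> # The sub-configs documented above are exposed as attributes, e.g. configuration.vision_config and configuration.text_config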
    ```"""

    model_type = "smolvlm"


class SmolVLMImageProcessor(Idefics3ImageProcessor):
    pass


class SmolVLMImageProcessorFast(Idefics3ImageProcessorFast):
    pass


class SmolVLMBaseModelOutputWithPast(Idefics3BaseModelOutputWithPast):
    pass


class SmolVLMModel(Idefics3Model):
    """
A subclass of Idefics3Model. We do *not* remove or block the call to inputs_merger
in forward. Instead, we override inputs_merger here with custom logic.
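
In practice this means the image-patch embeddings returned by the vision tower are scattered into the
positions of the `<image>` placeholder tokens; each sample must therefore contain a whole number of
image blocks, i.e. its `<image>` token count must be divisible by the per-image patch count.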
    """

    def inputs_merger(
        self, input_ids: torch.LongTensor, inputs_embeds: torch.Tensor, image_hidden_states: torch.Tensor
    ) -> torch.Tensor:
        _, patch_size, _ = image_hidden_states.shape

        if input_ids is None:
            image_mask = inputs_embeds == self.get_input_embeddings()(
                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
            )
            image_mask = image_mask[..., 0]  # compare against the image-token embedding, keep a (batch, seq) mask
        else:
            image_mask = input_ids == self.config.image_token_id

        num_image_tokens = image_mask.sum(dim=1)
        if not torch.all(num_image_tokens % patch_size == 0):
            raise ValueError("At least one sample has <image> tokens not divisible by patch_size.")

        blocks_per_sample = num_image_tokens // patch_size

        offsets = torch.nn.functional.pad(blocks_per_sample.cumsum(dim=0), (1, 0), value=0)
        block_offset = offsets[:-1]
        row_cum = image_mask.cumsum(dim=-1)
        chunk_idx = (row_cum - 1) // patch_size
        local_idx = (row_cum - 1) % patch_size
        block_idx = block_offset.unsqueeze(1) + chunk_idx

        image_embeds = torch.zeros_like(inputs_embeds)
        image_embeds[image_mask] = image_hidden_states[block_idx[image_mask], local_idx[image_mask], :]

        merged_embeds = torch.where(image_mask.unsqueeze(-1), image_embeds, inputs_embeds)
        return merged_embeds

    def get_image_features(self, pixel_values: torch.FloatTensor, pixel_attention_mask: torch.LongTensor = None):
        """
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            pixel_attention_mask (`torch.LongTensor`, *optional*):
                The attention mask indicating padded regions in the image.
        """
        batch_size, num_images, num_channels, height, width = pixel_values.shape
        pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:])

        # Remove padding images - padding images are full 0.
        nb_values_per_image = pixel_values.shape[1:].numel()
        real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
        if not any(real_images_inds):
            # no images, leave one empty image.
            real_images_inds[0] = True

        pixel_values = pixel_values[real_images_inds].contiguous()

        # Handle the vision attention mask
        if pixel_attention_mask is None:
            pixel_attention_mask = torch.ones(
                size=[pixel_values.shape[i] for i in (0, 2, 3)],
                dtype=torch.bool,
                device=pixel_values.device,
            )
        else:
            # Remove padding images from the mask
            pixel_attention_mask = pixel_attention_mask.view(batch_size * num_images, *pixel_attention_mask.shape[2:])
            pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()

        patch_size = self.config.vision_config.patch_size
        patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
        patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
        patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()

        # Get sequence from the vision encoder
        image_hidden_states = self.vision_model(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
        image_hidden_states = image_hidden_states.last_hidden_state

        # Modality projection & resampling
        image_hidden_states = self.connector(image_hidden_states)
        return image_hidden_states
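
    # Rough shape walk-through for get_image_features (illustrative only; the 512x512 resolution and
    # patch_size=16 used below are assumptions, not values fixed by the model):
    #   pixel_values          (batch_size, num_images, 3, 512, 512)   padded stack of images per sample
    #   packed real images    (image_batch_size, 3, 512, 512)         all-zero padding images dropped
    #   patch_attention_mask  (image_batch_size, 32, 32)              one entry per 16x16 patch
    #   connector output      (image_batch_size, seq_len, hidden)     what inputs_merger scatters into the text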
    @can_return_tuple
    @auto_docstring(
        custom_intro="""
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        """
    )
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.BoolTensor] = None,
        image_hidden_states: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, SmolVLMBaseModelOutputWithPast]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if self.training and self.text_model.gradient_checkpointing and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if input_ids is not None:
            batch_size, seq_length = input_ids.shape
        elif inputs_embeds is not None:
            batch_size, seq_length, _ = inputs_embeds.shape
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if inputs_embeds is None:
            inputs_embeds = self.text_model.get_input_embeddings()(input_ids).to(input_ids.device)

        # START VISUAL INPUTS INTEGRATION
        if pixel_values is not None and image_hidden_states is not None:
            raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time")
        elif pixel_values is not None:
            image_hidden_states = self.get_image_features(pixel_values, pixel_attention_mask).to(inputs_embeds.device)
        elif image_hidden_states is not None:
            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)

        if image_hidden_states is not None:
            # When we generate, we don't want to replace the potential image_token_id that we generated by images
            # that simply don't exist
            inputs_embeds = self.inputs_merger(
                input_ids=input_ids,
                inputs_embeds=inputs_embeds,
                image_hidden_states=image_hidden_states,
            )

        outputs = self.text_model(
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            cache_position=cache_position,
            **kwargs,
        )

        return SmolVLMBaseModelOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
            image_hidden_states=image_hidden_states,
        )


class SmolVLMForConditionalGeneration(Idefics3ForConditionalGeneration):
    def __init__(self, config):
        super().__init__(config)
        self.model = SmolVLMModel(config)
        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
        self.post_init()

    def forward(self, **super_kwargs):
        r"""
pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
    Mask to avoid performing attention on padding pixel indices.
image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
    The hidden states of the image encoder after modality projection.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
    ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> import requests
>>> import torch
>>> from PIL import Image
>>> from io import BytesIO

>>> from transformers import AutoProcessor, AutoModelForImageTextToText
>>> from transformers.image_utils import load_image

>>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
>>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
>>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
>>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")

>>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
>>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")

>>> # Create inputs
>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {"type": "video", "path": "path/to/video"},
...             {"type": "text", "text": "What is happening in this video?"},
...         ]
...     }
... ]

>>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True)

>>> # Generate
>>> generated_ids = model.generate(**inputs, max_new_tokens=256)
>>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)

>>> print(generated_texts)
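>>> # generated_texts is a list with one decoded string per prompt in the batch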
        ```"""
        super().forward(**super_kwargs)


__all__ = [
    "SmolVLMVisionConfig",
    "SmolVLMConfig",
    "SmolVLMImageProcessor",
    "SmolVLMImageProcessorFast",
    "SmolVLMForConditionalGeneration",
    "SmolVLMPreTrainedModel",
    "SmolVLMModel",
    "SmolVLMVisionTransformer",
]