
    <hO8                        S SK JrJr  S SKrS SKJr  SSKJr  SSKJr  SSK	J
r
  SSKJr  SS	KJr  S
SKJrJrJrJrJrJr  S
SKJr  SSKJr  \R4                  " \5      r " S S\5      r " S S\R<                  5      r " S S\R<                  5      r  " S S\5      r! " S S\5      r" " S S\5      r# " S S\5      r$ " S S\5      r%/ SQr&g)     )OptionalUnionN)nn   )ACT2FN)Cache)FlashAttentionKwargs)Unpack)logging   )LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModelTransformersKwargs)MistralRMSNorm   )Mistral3Configc                       \ rS rSrSrg)Mistral3RMSNorm)    N__name__
__module____qualname____firstlineno____static_attributes__r       e/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/mistral3/modular_mistral3.pyr   r   )       r    r   c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  4S jr	S	r
U =r$ )
Mistral3PatchMerger-   z4
Learned merging of spatial_merge_size ** 2 patches
configc                   > [         TU ]  5         Xl        UR                  R                  nUR
                  U l        U R                  R                  R                  U l        [        R                  " X R
                  S-  -  USS9U l	        g )Nr   Fbias)
super__init__r&   vision_confighidden_sizespatial_merge_size
patch_sizer   Linearmerging_layer)selfr&   r-   	__class__s      r!   r+   Mistral3PatchMerger.__init__2   sn    **66"(";";++33>>YY{5L5La5O'OQ\chir    image_featuresimage_sizesreturnc                    U Vs/ sH&  o3S   U R                   -  US   U R                   -  4PM(     nnU VVs/ sH	  u  pEXE-  PM     nnnUR                  S   n/ n[        UR                  U5      5       H  u  pX)   u  pEU
R	                  XEU5      R                  SSS5      R                  S5      n[        R                  R                  R                  XR                  U R                  S9nUR	                  XpR                  S-  -  S5      R                  5       nUR                  U5        M     [        R                  " USS9nU R                  U5      nU$ s  snf s  snnf )Nr   r   r   )kernel_sizestridedim)r/   shape	enumeratesplitviewpermute	unsqueezetorchr   
functionalunfoldr.   tappendcatr1   )r2   r5   r6   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r!   forwardMistral3PatchMerger.forward;   s[   cn
cnU_]doo-z!}/OPcn 	 
 /::kdaAEk:  $)2>3G3GHX3Y)Z%K+DA%**13;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4( *[ ?:++N;)
 ;s
   ,EE)r&   r1   r/   r.   )r   r   r   r   __doc__r   r+   rD   TensorrT   r   __classcell__r3   s   @r!   r$   r$   -   sD    j~ jell  RWR^R^  r    r$   c                   j   ^  \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )Mistral3MultiModalProjectorS   r&   c                   > [         TU ]  5         [        UR                  R                  UR
                  R                  S9U l        [        U5      U l	        [        UR                  [        5      (       a  SO[        UR                  5      n[        R                  " UR                  R                  U-  UR
                  R                  UR                   S9U l        [$        UR&                     U l        [        R                  " UR
                  R                  UR
                  R                  UR                   S9U l        g )N)epsr   r(   )r*   r+   r   r,   r-   text_configrms_norm_epsnormr$   patch_merger
isinstancevision_feature_layerintlenr   r0   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)r2   r&   num_feature_layersr3   s      r!   r+   $Mistral3MultiModalProjector.__init__T   s    #F$8$8$D$D&J\J\JiJij	/7",V-H-H#"N"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r    r5   r6   c                     U R                  U5      nU R                  X5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ N)ra   rb   rh   rj   rk   )r2   r5   r6   hidden_statess       r!   rT   #Mistral3MultiModalProjector.forwardd   sP    >2**>Gn5/m4r    )rj   rh   rk   ra   rb   )r   r   r   r   r   r+   rD   rW   rT   r   rX   rY   s   @r!   r[   r[   S   s/    
~ 
 ell   r    r[   c                       \ rS rSrSrg)Mistral3CausalLMOutputWithPastm   r   Nr   r   r    r!   rs   rs   m   r"   r    rs   c                       \ rS rSrSrg)Mistral3ModelOutputWithPastq   r   Nr   r   r    r!   rv   rv   q   r"   r    rv   c                       \ rS rSrSrg)Mistral3PreTrainedModelu   r   Nr   r   r    r!   ry   ry   u   r"   r    ry   c            !          \ rS rSr SS\R
                  S\R                  S\\\	\
\	   4      4S jjr             SS\R                  S\R
                  S\\R                     S	\\R                     S
\\   S\\R
                     S\\\	\
\	   4      S\\   S\\   S\\   S\\   S\\R                     S\R                  S\\   S\\\4   4S jjrSrg)Mistral3Modely   Npixel_valuesr6   rd   c                    Ub  UOU R                   R                  nUR                  5        VVs0 sH  u  pVUc  M
  XV_M     nnnU R                  " U4USS.UD6n[	        U[
        5      (       a  UR                  U   nO2U V	s/ sH  oR                  U	   PM     n
n	[        R                  " U
SS9nU R                  UR                  S5      U5      nU R                  R                  U R                   R                  -  nU VVs/ sH  u  pX-  X-  -  PM     nnn[        R                  " UR                  S5      U5      nU$ s  snnf s  sn	f s  snnf )a  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
       The tensors corresponding to the input images.
    vision_feature_layer (`Union[int, list[int]]`, *optional*):
        The index of the layer to select the vision feature. If multiple indices are provided,
        the vision feature of the corresponding indices will be concatenated to form the
        vision features.
    image_sizes (`torch.Tensor`, *optional*):
        Tensor containing the image sizes as returned by the processor.
Returns:
    image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
T)r6   output_hidden_statesr9   r<   r   )r&   rd   itemsvision_towerrc   re   rp   rD   rI   multi_modal_projectorsqueezer/   r.   r@   )r2   r~   r6   rd   kwargskvimage_outputsselected_image_feature	layer_idxhs_poolr5   downsample_ratioheightwidthsplit_sizess                   r!   get_image_features Mistral3Model.get_image_featuresz   sK   . %9$D $++JjJj 	 $*<<>C>41Q$!$>C)),uKfjuntu *C00%2%@%@AU%V"OcdOc)229=OcGd%*YYwB%?"334J4R4RST4UWbc,,77$++:X:XXgrsgrVcV\2u7PQgrs^%;%;A%>L D e
 ts   EEE	E	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsr   return_dictcache_positionr   r7   c                    U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUS L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbl  U R                  UUUS9n[        R                  " USS9R                  UR                  UR                  5      nU R                  XUS9nUR                  UU5      nU R                  " S	UUUUUU	U
SUS.	UD6n[!        UR"                  UR$                  UR&                  UR(                  Ub  WS9$ S S9$ )
Nz:You must specify exactly one of input_ids or inputs_embeds)r~   rd   r6   r   r<   )r   r5   T)	r   r   r   r   r   r   r   r   r   )last_hidden_stater   rp   
attentionsimage_hidden_statesr   )r&   r   r   use_return_dictrd   
ValueErrorget_input_embeddingsr   rD   rI   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelrv   r   r   rp   r   )r2   r   r~   r   r   r   r   rd   r   r   r   r   r   r6   r   r5   special_image_maskoutputss                     r!   rT   Mistral3Model.forward   s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	 -t";<YZZ  557	BM#!44)%9' 5 N
 #YY~1=@@AUAUWdWjWjkN!%!:!:~ "; " *889K^\M%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r    r   ro   )NNNNNNNNNNNNN)r   r   r   r   rD   FloatTensorrW   r   r   re   listr   
LongTensorr   boolr
   r	   tuplerv   rT   r   r   r    r!   r|   r|   y   s   
 AE	)'') \\) 'uS$s)^'<=	)Z '+*.1537+/59@D$(,0/3&*59$(?
##?
 ''?
 !.	?

 u//0?
 "%?
   1 12?
 'uS$s)^'<=?
 D>?
 $D>?
 'tn?
 d^?
 !!1!12?
 \\?
 -.?
  
u11	2!?
 ?
r    r|   c            #          \ rS rSr SS\R
                  S\R                  S\\\	\
\	   4      4S jjr              SS\R                  S\R
                  S\\R                     S	\\R                     S
\\   S\\R
                     S\\R                     S\\   S\\   S\\   S\\   S\\R                     S\\	\R                  4   S\\R                     S\\   S\\\4   4 S jjrSrg) Mistral3ForConditionalGeneration   Nr~   r6   rd   c                 B    U R                   R                  " SUUUS.UD6$ )N)r~   r6   rd   r   )modelr   )r2   r~   r6   rd   r   s        r!   r   3Mistral3ForConditionalGeneration.get_image_features   s3     zz,, 
%#!5
 	
 	
r    r   r   r   r   r   labelsr   r   r   r   r   logits_to_keepr   r7   c                 F   U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nU R                  " SUUUUUUUU	U
SUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnUb3  U R                  " SUXpR                   R                  R                  S.UD6n[        UUUR                  UR                  UR                  UR                   S9$ )a  
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

Example:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

>>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
>>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

>>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> inputs = processor(images=image, text=prompt, return_tensors="pt")

>>> # Generate
>>> generate_ids = model.generate(**inputs, max_new_tokens=15)
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
"What is the image?The image depicts two cats lying on a pink blanket."
```NT)r   r~   r   r   r   r   r   r   r   r   r   r6   r   )logitsr   
vocab_size)lossr   r   rp   r   r   r   )r&   r   r   r   r   rc   re   slicelm_headloss_functionr_   r   rs   r   rp   r   r   )r2   r   r~   r   r   r   r   r   r   r   r   r   r   r   r6   r   r   rp   slice_indicesr   r   s                        r!   rT   (Mistral3ForConditionalGeneration.forward   sM   Z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]** 
%)%+'/!5)#
 
   
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r    r   ro   )NNNNNNNNNNNNr   N)r   r   r   r   rD   r   rW   r   r   re   r   r   r   r   r   r
   r   r   rs   rT   r   r   r    r!   r   r      s   
 AE	
''
 \\
 'uS$s)^'<=	
  '+*.1537+/59-1$(,0/3&*5934.2U
##U
 ''U
 !.	U

 u//0U
 "%U
   1 12U
 ))*U
 D>U
 $D>U
 'tnU
 d^U
 !!1!12U
 c5<</0U
 ell+U
  +,!U
" 
u44	5#U
 U
r    r   )r|   ry   r   )'typingr   r   rD   r   activationsr   cache_utilsr   modeling_flash_attention_utilsr	   processing_utilsr
   utilsr   llava.modeling_llavar   r   r   r   r   r   mistral.modeling_mistralr   configuration_mistral3r   
get_loggerr   loggerr   Moduler$   r[   rs   rv   ry   r|   r   __all__r   r    r!   <module>r      s     #   !   B &   6 2 
		H	%	n 	#")) #L")) 4	%@ 		": 		2 	k
J k
\d
'D d
Nr    