
    Ph48                        d dl mZmZ d dlZd dlmZ ddlmZ ddlmZ ddl	m
Z
 ddlmZ d	d
lmZmZmZmZmZmZ d	dlmZ ddlmZ  ej0                  e      Z G d de      Z G d dej8                        Z G d dej8                        Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z#g dZ$y)    )OptionalUnionN)nn   )ACT2FN)Cache)Unpack)logging   )LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModelTransformersKwargs)MistralRMSNorm   )Mistral3Configc                       e Zd Zy)Mistral3RMSNormN__name__
__module____qualname__     g/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/models/mistral3/modular_mistral3.pyr   r   (       r   r   c                   x     e Zd ZdZdef fdZdej                  dej                  dej                  fdZ xZ	S )Mistral3PatchMergerz<
    Learned merging of spatial_merge_size ** 2 patches
    configc                 "   t         |           || _        |j                  j                  }|j
                  | _        | j                  j                  j                  | _        t        j                  || j
                  dz  z  |d      | _	        y )Nr   Fbias)
super__init__r!   vision_confighidden_sizespatial_merge_size
patch_sizer   Linearmerging_layer)selfr!   r(   	__class__s      r   r&   zMistral3PatchMerger.__init__1   sr    **66"(";";++33>>YY{T5L5La5O'OQ\chir   image_featuresimage_sizesreturnc                    |D cg c]&  }|d   | j                   z  |d   | j                   z  f( }}|D cg c]
  \  }}||z   }}}|j                  d   }g }t        |j                  |            D ]  \  }	}
||	   \  }}|
j	                  |||      j                  ddd      j                  d      }t        j                  j                  j                  || j                  | j                        }|j	                  || j                  dz  z  d      j                         }|j                  |        t        j                  |d      }| j                  |      }|S c c}w c c}}w )Nr   r   r   )kernel_sizestridedim)r*   shape	enumeratesplitviewpermute	unsqueezetorchr   
functionalunfoldr)   tappendcatr,   )r-   r/   r0   
image_sizehwtokens_per_imagedpermuted_tensorimage_indeximage_tokens
image_gridgrids                r   forwardzMistral3PatchMerger.forward:   sm   cn
cnU_Z]doo-z!}/OPcn 	 
 /::kdaAEk:  $)2>3G3GHX3Y)Z%K{+DAq%**1a3;;Aq!DNNqQJ88&&--(?(?H_H_ . D 99Q!8!8!!;;R@BBDD""4( *[ ?:++N;)
 ;s
   +E"E')
r   r   r   __doc__r   r&   r>   TensorrN   __classcell__r.   s   @r   r    r    ,   s?    j~ jell  RWR^R^ r   r    c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Mistral3MultiModalProjectorr!   c                    t         |           t        |j                  j                  |j
                  j                        | _        t        |      | _	        t        |j                  t              rdnt        |j                        }t        j                  |j                  j                  |z  |j
                  j                  |j                         | _        t$        |j&                     | _        t        j                  |j
                  j                  |j
                  j                  |j                         | _        y )N)epsr   r#   )r%   r&   r   r'   r(   text_configrms_norm_epsnormr    patch_merger
isinstancevision_feature_layerintlenr   r+   multimodal_projector_biaslinear_1r   projector_hidden_actactlinear_2)r-   r!   num_feature_layersr.   s      r   r&   z$Mistral3MultiModalProjector.__init__S   s    #F$8$8$D$D&J\J\JiJij	/7",V-H-H#"NQTWX^XsXsTt		  ,,/AA**11

 &556		**F,>,>,J,JQWQqQq
r   r/   r0   c                     | j                  |      }| j                  ||      }| j                  |      }| j                  |      }| j	                  |      }|S N)rY   rZ   r`   rb   rc   )r-   r/   r0   hidden_statess       r   rN   z#Mistral3MultiModalProjector.forwardc   sR    >2**>;Gn5/m4r   )	r   r   r   r   r&   r>   rP   rN   rQ   rR   s   @r   rT   rT   R   s*    
~ 
 ell  r   rT   c                       e Zd Zy)Mistral3CausalLMOutputWithPastNr   r   r   r   ri   ri   l   r   r   ri   c                       e Zd Zy)Mistral3ModelOutputWithPastNr   r   r   r   rk   rk   p   r   r   rk   c                       e Zd Zy)Mistral3PreTrainedModelNr   r   r   r   rm   rm   t   r   r   rm   c            !          e Zd Z	 ddej                  dej
                  deeee	e   f      fdZ
	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej
                     deej                     d	ee   d
eej                     deeee	e   f      dee   dee   dee   dee   deej                     deej
                     dee   deeef   fdZy)Mistral3ModelNpixel_valuesr0   r\   c                    ||n| j                   j                  }|j                         D ci c]  \  }}|	|| }}} | j                  |f|dd|}t	        |t
              r|j                  |   }n3|D 	cg c]  }	|j                  |	    }
}	t        j                  |
d      }| j                  |j                  d      |      }| j                  j                  | j                   j                  z  }|D cg c]  \  }}||z  ||z  z   }}}t        j                  |j                  d      |      }|S c c}}w c c}	w c c}}w )aU  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
               The tensors corresponding to the input images.
            vision_feature_layer (`Union[int, list[int]]`, *optional*):
                The index of the layer to select the vision feature. If multiple indices are provided,
                the vision feature of the corresponding indices will be concatenated to form the
                vision features.
            image_sizes (`torch.Tensor`, *optional*):
                Tensor containing the image sizes as returned by the processor.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        T)r0   output_hidden_statesr3   r6   r   )r!   r\   itemsvision_towerr[   r]   rg   r>   rC   multi_modal_projectorsqueezer*   r)   r:   )r-   rp   r0   r\   kwargskvimage_outputsselected_image_feature	layer_idxhs_poolr/   downsample_ratioheightwidthsplit_sizess                   r   get_image_featuresz Mistral3Model.get_image_featuresy   sY   . %9$D $++JjJj 	 $*<<>C>41aQ]!Q$>C))),uKfjuntu *C0%2%@%@AU%V"OcdOc)}229=OcGd%*YYwB%?"334J4R4RST4UWbc,,77$++:X:XXgrsgrVcV\^c"22u@P7PQgrs^%;%;A%>L D e
 ts   
D<D<;E=E	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrr   return_dictcache_positionrw   r1   c                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt        d      | | j                         |      }|u| j                  |||      }t        j                  |d      j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d	||||||	|
d|d	|}t!        |j"                  |j$                  |j&                  |j(                  |      S d       S )
Nz:You must specify exactly one of input_ids or inputs_embeds)rp   r\   r0   r   r6   )r   r/   T)	r   r   r   r   r   r   rr   r   r   )last_hidden_stater   rg   
attentionsimage_hidden_statesr   )r!   r   rr   use_return_dictr\   
ValueErrorget_input_embeddingsr   r>   rC   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelrk   r   r   rg   r   )r-   r   rp   r   r   r   r   r\   r   r   rr   r   r   r0   rw   r/   special_image_maskoutputss                     r   rN   zMistral3Model.forward   s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	 -t";<YZZ 7D557	BM#!44)%9' 5 N
 #YY~1=@@AUAUWdWjWjkN!%!:!:~ "; " *889K^\M%$%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r   rf   )NNNNNNNNNNNNN)r   r   r   r>   FloatTensorrP   r   r   r]   listr   
LongTensorr   boolr	   r   tuplerk   rN   r   r   r   ro   ro   x   s   
 AE	)'') \\) 'uS$s)^'<=	)Z 15481537+/59@D$(,0/3&*59.2?
E,,-?
 u001?
 !.	?

 u//0?
 "%?
   1 12?
 'uS$s)^'<=?
 D>?
 $D>?
 'tn?
 d^?
 !!1!12?
 ell+?
 +,?
  
u11	2!?
r   ro   c            #          e Zd Z	 ddej                  dej
                  deeee	e   f      fdZ
	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej
                     deej                     d	ee   d
eej                     deej                     dee   dee   dee   dee   deej                     deeej
                  f   deej
                     dee   deeef   f dZy) Mistral3ForConditionalGenerationNrp   r0   r\   c                 B     | j                   j                  d|||d|S )N)rp   r0   r\   r   )modelr   )r-   rp   r0   r\   rw   s        r   r   z3Mistral3ForConditionalGeneration.get_image_features   s5     -tzz,, 
%#!5
 	
 	
r   r   r   r   r   r   labelsr   r   rr   r   r   logits_to_keeprw   r1   c                 <   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  } | j                  d||||||||	|
d||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|4 | j                  d||| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                         S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration

        >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
        >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")

        >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "What is the image?The image depicts two cats lying on a pink blanket."
        ```NT)r   rp   r   r   r   r   r   r   rr   r   r   r0   r   )logitsr   
vocab_size)lossr   r   rg   r   r   r   )r!   r   rr   r   r   r[   r]   slicelm_headloss_functionrW   r   ri   r   rg   r   r   )r-   r   rp   r   r   r   r   r   r   r   rr   r   r   r   r0   rw   r   rg   slice_indicesr   r   s                        r   rN   z(Mistral3ForConditionalGeneration.forward   sP   Z 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%)%+'/!5)#
 
   
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r   rf   )NNNNNNNNNNNNr   N)r   r   r   r>   r   rP   r   r   r]   r   r   r   r   r   r	   r   r   ri   rN   r   r   r   r   r      s   
 AE	
''
 \\
 'uS$s)^'<=	
  15481537+/59-1$(,0/3&*5934.2U
E,,-U
 u001U
 !.	U

 u//0U
 "%U
   1 12U
 ))*U
 D>U
 $D>U
 'tnU
 d^U
 !!1!12U
 c5<</0U
 ell+U
  +,!U
" 
u44	5#U
r   r   )ro   rm   r   )%typingr   r   r>   r   activationsr   cache_utilsr   processing_utilsr	   utilsr
   llava.modeling_llavar   r   r   r   r   r   mistral.modeling_mistralr   configuration_mistral3r   
get_loggerr   loggerr   Moduler    rT   ri   rk   rm   ro   r   __all__r   r   r   <module>r      s     #   !   &   6 2 
		H	%	n 	#")) #L")) 4	%@ 		": 		2 	k
J k
\d
'D d
Nr   