
    <hS                        S r SSKJr  SSKJrJr  SSKrSSKrSSKJr  SSK	J
r
  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJrJrJr  SSKJr  \R<                  " \5      r S r! " S S\RD                  5      r#S r$S-S jr% S.S\RD                  S\RL                  S\RL                  S\RL                  S\\RL                     S\'S\'4S jjr( " S S\RD                  5      r) " S  S!\RD                  5      r* " S" S#\RD                  5      r+ " S$ S%\5      r, " S& S'\RD                  5      r-\ " S( S)\5      5       r.S* r/\ " S+ S,\.5      5       r0S,S)/r1g)/zPyTorch Pixtral model.    )Callable)OptionalUnionN)nn   )ACT2FN)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tuplelogging   )PixtralVisionConfigc                    / nU  H  nUR                   SS  u  pE[        R                  " [        R                  " U5      [        R                  " U5      SS9n[        R                  " USS9R                  SS5      R                  SS5      u  pxXq-  U-   n	UR                  U	S S 2S4   5        M     [        R                  " U5      $ )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_grididss
             d/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgridr1   &   s    I"BC(~~ell62ELL4GRVWTr2::2qAGG2N 6)QT# # 99Y    c                   h   ^  \ rS rSrSrSU 4S jjr\R                  " 5       \S 5       5       r	Sr
U =r$ )PixtralRotaryEmbedding1   a  
The key with pixtral embedding is just that you have a frequency for each pixel positions.
If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
is given by indexing the pre_computed frequency on the width and height.

What you output is of dimension (batch, height * width, dim) with dim the embed dim.

This simply means that for each image hidden state, you are going to add
a corresponding positional embedding, based on its index in the grid.
c                 ~  > [         T
U ]  5         SU l        UR                  U l        UR
                  U l        UR                  UR                  -  nSU R                  [        R                  " SU R                  S5      R                  5       U R                  -  -  -  n[        R                  " X4R                  S9n[        R                  " X4R                  S9n[        R                  " XTS S S2   5      R                  5       n[        R                  " XdSS S2   5      R                  5       n[        R                  " US S 2S S S 24   R                  SUS5      US S S 2S S 24   R                  USS5      /SS9R!                  SU R                  S-  5      n	U R#                  S	[        R                  " X4SS9S
S9  g )Ndefault      ?r   r   )devicer   r   r   inv_freqF)
persistent)super__init__	rope_typehead_dimr   
rope_thetabase
image_size
patch_sizer   r    floatr9   outerr%   repeatr"   register_buffer)selfconfigr9   max_patches_per_sidefreqshwfreqs_hfreqs_wr:   	__class__s             r0   r=   PixtralRotaryEmbedding.__init__=   ss   "??%%	%00F4E4EEtyyU\\!TXXq%A%G%G%IDHH%TUVLL-llCLL-llC++ass,224++aqt!t-335994
#**1.BAFa
#**+?AF 
 '"dhh!m
$ 	 	ZH3GR)P]bcr2   c                    U R                   U   n[        UR                  R                  [        5      (       a0  UR                  R                  S:w  a  UR                  R                  OSn[
        R                  " USS9   UnUR                  5       nUR                  5       nS S S 5        WR                  UR                  S9WR                  UR                  S94$ ! , (       d  f       N@= f)NmpscpuF)device_typeenabled)dtype)r:   
isinstancer9   typestrr   autocastcossintorW   )rH   xposition_idsrK   rU   embr\   r]   s           r0   forwardPixtralRotaryEmbedding.forwardV   s     l+'1!((--'E'E!((--[`J`ahhmmfk^^UCC'')C'')C D
 vvAGGv$cff177f&;;; DCs    #C
C+)rA   r   r>   N)__name__
__module____qualname____firstlineno____doc__r=   r   no_gradr   rb   __static_attributes____classcell__rP   s   @r0   r4   r4   1   s0    	d2 ]]_	<  	<r2   r4   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r   r%   )r_   x1x2s      r0   rotate_halfrq   e   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r2   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezerq   )qkr\   r]   r`   unsqueeze_dimq_embedk_embeds           r0   apply_rotary_pos_embry   l   sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr2   modulequerykeyvalueattention_maskscalingdropoutc                    [         R                  " XR                  SS5      5      U-  nUb  X-   n[        R                  R                  US[         R                  S9R                  UR                  5      n[        R                  R                  XU R                  S9n[         R                  " X5      n	U	R                  SS5      R                  5       n	X4$ )Nr   r   )r   rW   )ptrainingr   r   )r   matmul	transposer   
functionalsoftmaxfloat32r^   rW   r   r   
contiguous)
rz   r{   r|   r}   r~   r   r   kwargsattn_weightsattn_outputs
             r0   eager_attention_forwardr      s     <<}}R'<=GL!#4==((2U]](SVVW\WbWbcL==((6??([L,,|3K''1-88:K$$r2   c                     ^  \ rS rSrSrU 4S jr   SS\R                  S\\R                     S\\	\R                  \R                  4      S\\
   S\\   S	\	\R                  \\R                     4   4S
 jjrSrU =r$ )PixtralAttention   zA
Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
c                   > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        SU l        U R                  S-  U l	        SU l        UR                  U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        [        R                  " U R                  U R                  SS9U l        g )NFg      ࿩bias)r<   r=   rI   hidden_size	embed_dimnum_attention_heads	num_headsr?   	is_causalr   attention_dropoutr   r   Lineark_projv_projq_projo_projrH   rI   rP   s     r0   r=   PixtralAttention.__init__   s    ++33$..8}}d*//iiUKiiUKiiUKiiUKr2   hidden_statesr~   position_embeddingsoutput_attentionsr   returnc                 F   UR                  5       u  pgnU R                  U5      n	U R                  U5      n
U R                  U5      nU	R	                  XgU R
                  U R                  5      R                  SS5      n	U
R	                  XgU R
                  U R                  5      R                  SS5      n
UR	                  XgU R
                  U R                  5      R                  SS5      nUu  p[        XXSS9u  p[        nU R                  R                  S:w  aT  U R                  R                  S:X  a  U(       a  [        R                  S5        O[        U R                  R                     nU R                  R                  S:X  a   US	   R                  UR                   S
S9US	'   U" U U	U
UU4U R"                  (       d  SOU R$                  U R&                  S.UD6u  nnUR)                  XgS5      R+                  5       nU R-                  U5      nU(       d  SnUU4$ )z#Input shape: Batch x Time x Channelr   r   r   )rv   eagersdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.flash_attention_2r`   T)non_blocking        )r   r   r   N)sizer   r   r   viewr   r?   r   ry   r   rI   _attn_implementationloggerwarning_oncer   r^   r9   r   r   r   r"   r   r   )rH   r   r~   r   r   r   
batch_sizepatches_query_states
key_statesvalue_statesr\   r]   attention_interfacer   r   s                    r0   rb   PixtralAttention.forward   s    "/!3!3!5
Q{{=1[[/
{{=1#((dnndmm\ffghjkl__Z$..$--Xbbcdfgh
#((dnndmm\ffghjkl&#7RUjk#l (?;;++w6{{//69>O##L
 '>dkk>^>^&_# ;;++/BB%+N%;%>%>}?S?Sbf%>%gF>"$7	%
  $}}C$,,LL	%
 	%
!\ "))*rBMMOkk+. LL((r2   )rI   r   r   r?   r   r   r   r   r   r   r   )NNF)re   rf   rg   rh   ri   r=   r   Tensorr   tupleboolr   r	   rb   rk   rl   rm   s   @r0   r   r      s    L* 26KO,15)||5) !.5) &eELL%,,,F&GH	5)
 $D>5) -.5) 
u||Xell33	45) 5)r2   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )
PixtralMLP   c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l        [
        R                  " U R                  U R                  SS9U l	        [        UR                     U l        g )NFr   )r<   r=   rI   r   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fnr   s     r0   r=   PixtralMLP.__init__   s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r2   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ rd   )r   r   r   r   )rH   r_   r   s      r0   rb   PixtralMLP.forward   s6    NN4;;t~~a/@#ADLLQRO#ST	r2   )r   rI   r   r   r   r   r   )re   rf   rg   rh   r=   rb   rk   rl   rm   s   @r0   r   r      s    0 r2   r   c                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )PixtralRMSNorm   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z-
PixtralRMSNorm is equivalent to T5LayerNorm
N)r<   r=   r   	Parameterr   onesweightvariance_epsilon)rH   r   epsrP   s      r0   r=   PixtralRMSNorm.__init__  s/     	ll5::k#:; #r2   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   r   T)keepdim)	rW   r^   r   r   powmeanrsqrtr   r   )rH   r   input_dtypevariances       r0   rb   PixtralRMSNorm.forward	  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r2   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r   r   r   r   rH   s    r0   
extra_reprPixtralRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr2   )r   r   )gư>)	re   rf   rg   rh   r=   rb   r   rk   rl   rm   s   @r0   r   r      s    $;J Jr2   r   c                      ^  \ rS rSrU 4S jr  SS\R                  S\R                  S\\\R                  \R                  4      S\\	   S\
\   S\\R                     4S	 jjrS
rU =r$ )PixtralAttentionLayeri  c                    > [         TU ]  5         [        UR                  SS9U l        [        U5      U l        [        U5      U l        [        UR                  SS9U l	        g )Nh㈵>r   )
r<   r=   r   r   attention_normr   feed_forwardr   	attentionffn_normr   s     r0   r=   PixtralAttentionLayer.__init__  sP    ,V-?-?TJ&v.)&1&v'9'9tDr2   r   r~   r   r   r   r   c                     UnU R                  U5      nU R                  " SUUUUS.UD6u  pXa-   nUnU R                  U5      nU R                  U5      nXa-   nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        Input to the layer of shape `(batch, seq_len, embed_dim)`.
    attention_mask (`torch.FloatTensor`):
        Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
    output_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r~   r   r    )r   r   r   r   )	rH   r   r~   r   r   r   residualr   outputss	            r0   rb   PixtralAttentionLayer.forward  s    $ !++M:&*nn '
') 3/	'

 '
# !0 m4))-8 0 "&Gr2   )r   r   r   r   )NN)re   rf   rg   rh   r=   r   r   r   r   r   r   r	   FloatTensorrb   rk   rl   rm   s   @r0   r   r     s    E LP,0'||' ' &eELL%,,,F&GH	'
 $D>' -.' 
u  	!' 'r2   r   c                      ^  \ rS rSrU 4S jr     SS\\R                     S\\\R                  \R                  4      S\\	   S\\	   S\\	   S\
\   S	\\\4   4S
 jjrSrU =r$ )PixtralTransformeriF  c                   > [         TU ]  5         Xl        [        R                  R                  5       U l        [        UR                  5       H'  nU R                  R                  [        U5      5        M)     SU l        g )NF)r<   r=   rI   r   r   
ModuleListlayersrangenum_hidden_layersr$   r   gradient_checkpointing)rH   rI   r   rP   s      r0   r=   PixtralTransformer.__init__G  s\    hh))+v//0AKK4V<= 1&+#r2   r~   r   r   output_hidden_statesreturn_dictr   r   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU(       a  SOSnU(       a  SOSn	Un
U R                   H2  nU(       a  X4-   nU" U
U4UUS.UD6nUS   n
U(       d  M*  XS   4-   n	M4     U(       a  X4-   nU(       d  [        S XU	4 5       5      $ [        XU	S9$ )a  
Args:
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
        Embeddings which serve as input to the Transformer.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr   )r   r   r   r   c              3   ,   #    U H  oc  M  Uv   M     g 7frd   r   ).0vs     r0   	<genexpr>-PixtralTransformer.forward.<locals>.<genexpr>  s     e$Sq$Ss   	)last_hidden_stater   
attentions)rI   r   r   use_return_dictr   r   r   )rH   inputs_embedsr~   r   r   r   r   r   encoder_statesall_attentionsr   encoder_layerlayer_outputss                r0   rb   PixtralTransformer.forwardO  s
   < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%![[M#!/2B!B) %8"3	
 M *!,M  !/3C2E!E )   +.>>Ne]N$Seee+Vd
 	
r2   )rI   r   r   )NNNNN)re   rf   rg   rh   r=   r   r   r   r   r   r   r	   r   r   rb   rk   rl   rm   s   @r0   r   r   F  s    , 26KO,0/3&*?
 !.?
 &eELL%,,,F&GH	?

 $D>?
 'tn?
 d^?
 -.?
 
uo%	&?
 ?
r2   r   c                   X    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrS/rSr
SrSrSr	S rSrg	)
PixtralPreTrainedModeli  rI   modelpixel_valuesTr   c                    U R                   R                  n[        U[        R                  [        R
                  45      (       aW  UR                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         g g [        U[        5      (       a&  UR                  R                  R                  S5        g g )Nr   )r   stdr8   )rI   initializer_rangerX   r   r   Conv2dr   datanormal_r   zero_r   fill_)rH   rz   r  s      r0   _init_weights$PixtralPreTrainedModel._init_weights  s    kk++fryy"))455MM&&CS&9{{&  &&( '//MM$$S) 0r2   r   N)re   rf   rg   rh   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modulesr  rk   r   r2   r0   r	  r	    sU    $O&*#"&N01N"&*r2   r	  c                    UR                   nUR                  nUR                  S   n[        R                  " U5      R
                  n[        R                  " XD4XRUS9n[        R                  " U 5      R                  S5      n[        R                  " S/U S S -   5      R                  S5      n[        X5       H  u  pSXiU
2X24'   M     US S S S 2S S 24   R                  UR                  S   SSS5      nU$ )Nr   )
fill_valuerW   r9   r   r   )rW   r9   r   r   finfominfulltensorcumsumzipexpand)r&   r$  rW   r9   seq_lend_mincausal_maskblock_end_idxblock_start_idxstartends              r0   generate_block_attention_maskr/    s    LLE]]Fll1oGKK""E**g/EW]^KLL!23::2>MllA3):3B)?#?@GGKO/9
,-#Iuy() : dD!Q./66v||A2rRKr2   c                      ^  \ rS rSrSrU 4S jrS r\\    SS\	R                  S\\	R                     S\\   S\\   S	\\   S
\\   S\\\4   4S jj5       5       rSrU =r$ )PixtralVisionModeli  vision_encoderc                 n  > [         TU ]  U5        Xl        [        R                  " UR
                  UR                  UR                  UR                  SS9U l        UR                  U l        [        UR                  SS9U l
        [        U5      U l        [        U5      U l        U R                  5         g )NF)in_channelsout_channelskernel_sizestrider   r   r   )r<   r=   rI   r   r  num_channelsr   rC   
patch_convr   ln_prer   transformerr4   patch_positional_embedding	post_initr   s     r0   r=   PixtralVisionModel.__init__  s     ))++++))$$
 !++$V%7%7TB-f5*@*H'r2   c                     U R                   $ rd   )r9  r   s    r0   get_input_embeddings'PixtralVisionModel.get_input_embeddings  s    r2   r  image_sizesr   r   r   r   r   c           
      6   Uc  UR                   u  ppX4/U-  nU R                  U5      n[        X5       VVs/ sH1  u  pUSS US   U R                  -  2S US   U R                  -  24   PM3     nnn[        R
                  " U Vs/ sH  nUR                  S5      R                  PM      snSS9R                  S5      nU R                  U5      n[        XR                  R                  U R                  R                  -  S9nUUS'   U R                  UU5      nU R                  R                  S:X  a  S nO9[        U Vs/ sH"  nUR                   S   UR                   S	   -  PM$     snU5      nU R                   " U4UUUUS
S.UD6$ s  snnf s  snf s  snf )N.r   r   r   )r'   r`   r   r   r   T)r~   r   r   r   r   )r   r9  r&  rC   r   r%   flattenTrs   r:  r1   rI   rB   r<  r   r/  r;  )rH   r  rB  r   r   r   argsr   r   r   r*   r+   patch_embedsembedr   r&   r   r`   r   r~   s                       r0   rb   PixtralVisionModel.forward  s    +7+=+=(J6"?+j8K |4  #<=
= #5$q'T__457U$q'T__:T7UUV= 	 
 yy:K!L:KQ!))A,..:K!LRST^^_`a{{<0 0)?)?4;;CYCY)Y
 ".~"==lLY;;++/BB!N:4EF4Eqqwwr{*4EFN 
) 3!5/
 
 	
3
 "M  Gs   7F$F(F)rI   r:  r9  r<  rC   r;  )NNNN)re   rf   rg   rh   r  r=   r@  r   r   r   r   r   r   r   r	   r   r   r   rb   rk   rl   rm   s   @r0   r1  r1    s    ("  /3/3,0&*1
ll1
 ell+1
 'tn	1

 $D>1
 d^1
 -.1
 
uo%	&1
  1
r2   r1  )Nr   )r   )2ri   collections.abcr   typingr   r   r   torch.utils.checkpointr   activationsr   modeling_flash_attention_utilsr	   modeling_layersr
   modeling_outputsr   modeling_rope_utilsr   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_pixtralr   
get_loggerre   r   r1   Moduler4   rq   ry   r   rD   r   r   r   r   r   r   r	  r/  r1  __all__r   r2   r0   <module>rY     sr    $ "    ! B 9 / 6 F & > > 6 
		H	% 0<RYY 0<h(F %II%<<% 
% <<	%
 U\\*% % %.L)ryy L)` "JRYY J(/6 /dH
 H
V *_ * *2  J
/ J
 J
Z  !9
:r2   