
    <hk                     F   S SK rS SKJr  S SKJrJrJr  S SKrS SK	J
r
  SSKJr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSKJr  SSK J!r!J"r"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)J*r*  \" S5       " S S\
RV                  5      5       r, SDS\
RV                  S\RZ                  S\RZ                  S\RZ                  S\\RZ                     S\.S\.4S jjr/ " S S\
RV                  5      r0\# " S  S!\5      5       r1\\#" S"S#9 " S$ S%\5      5       5       r2 " S& S'\
RV                  5      r3 " S( S)\
RV                  5      r4 " S* S+\
RV                  5      r5\
Rl                  \,S,.r7 " S- S.\5      r8 " S/ S0\
RV                  5      r9\# " S1 S2\15      5       r:\# " S3 S4\5      5       r; " S5 S6\
RV                  5      r<\\#" S7S#9 " S8 S9\5      5       5       r=\#" S:S#9 " S; S<\;5      5       r>\\#" S=S#9 " S> S?\!5      5       5       r?\#" S@S#9 " SA SB\;\5      5       r@/ SCQrAg)E    N)	dataclass)CallableOptionalUnion   )ACT2FN)Cache)GenerationMixin)use_kernel_forward_from_hub)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuple	torch_int   )	AutoModel   )InternVLConfigInternVLVisionConfigRMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )InternVLVisionRMSNorm,   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z4
InternVLVisionRMSNorm is equivalent to T5LayerNorm
N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      f/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/internvl/modeling_internvl.pyr$   InternVLVisionRMSNorm.__init__.   s/     	ll5::k#:; #    c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   T)keepdim)	dtypetor'   float32powmeanrsqrtr*   r)   )r+   hidden_statesinput_dtypevariances       r/   forwardInternVLVisionRMSNorm.forward6   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r1   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler)   shaper*   r+   s    r/   
extra_repr InternVLVisionRMSNorm.extra_repr=   s*    ))*+6$2G2G1HIIr1   )r*   r)   )gư>)	__name__
__module____qualname____firstlineno__r$   r>   rD   __static_attributes____classcell__r.   s   @r/   r    r    ,   s    $;J Jr1   r    modulequerykeyvalueattention_maskscalingdropoutc                    UnUn	[         R                  " XR                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR                  S   24   nX-   n
[        R
                  R                  U
SS9n
[        R
                  R                  XU R                  S9n
[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr   r   r3   dim)ptrainingr   )
r'   matmul	transposerB   r%   
functionalsoftmaxrS   rY   
contiguous)rM   rN   rO   rP   rQ   rR   rS   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r/   eager_attention_forwardre   A   s     JL<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1 ==((2(>L==((6??([L,,|:K''1-88:K$$r1   c            
          ^  \ rS rSrSrS\4U 4S jjr  SS\R                  S\	\R                     S\	\R                     S\
\   4S	 jjrS
rU =r$ )InternVLVisionAttention\   z+Attention Class for InternVL Vision Encoderconfigc                 $  > [         TU ]  5         Xl        UR                  U l        UR
                  U l        U R                  U R                  -  U l        U R                  U R                  -  U R                  :w  a&  [        SU R                   SU R                   S35      eU R                  S-  U l	        UR                  U l
        UR                  nUR                  nSU l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  U R                  -  UR                   S9U l        [        R                  " U R                  U R                  5      U l        US:  a  [        R*                  " U5      O[        R,                  " 5       U l        U(       a  [/        U R                  5      O[        R,                  " 5       U l        U(       a  [/        U R                  5      U l        g [        R,                  " 5       U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r#   r$   ri   r,   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr%   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr    q_normk_norm)r+   ri   proj_dropoutqk_normr.   s       r/   r$    InternVLVisionAttention.__init___   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta?F+DNN;BKKM?F+DNN;BKKMr1   r;   rQ   output_attentionsr_   c                    UR                  5       u  pVnU R                  U5      nU R                  U5      n	U R                  U5      n
U R	                  U5      nU R                  U	5      n	UR                  XVU R                  U R                  5      R                  SS5      nU	R                  XVU R                  U R                  5      R                  SS5      n	U
R                  XVU R                  U R                  5      R                  SS5      n
[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UU	U
U4U R                  (       d  SOU R                   U R"                  SS.UD6u  pUR                  XVU R$                  5      nU R'                  U5      nU R)                  U5      nU(       a  X4nU$ US 4nU$ )Nr   r   eager        F)rS   rR   rv   )sizery   rz   r{   r   r   reshapero   rp   r[   viewre   ri   _attn_implementationr   rY   rs   rr   rm   r|   rt   )r+   r;   rQ   r   r_   
batch_sizeseq_len_query_statesr`   ra   attention_interfacerd   rb   outputoutputss                   r/   r>   InternVLVisionAttention.forward{   s    "/!3!3!5
Q{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
T^^T]][eefgijk
#((dnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJ
%
 
%
! "))*t~~N&&{3((0,=6( EKD>r1   )rs   ri   rm   rp   rv   r   rz   ro   rt   r|   r   ry   rr   r{   NN)rF   rG   rH   rI   __doc__r   r$   r'   Tensorr   r   r   r>   rJ   rK   rL   s   @r/   rg   rg   \   si    5Z3 Z> 2648	)||) !.) $ELL1	)
 -.) )r1   rg   c                   V   ^  \ rS rSr% \\S'   SrSrSrS/r	Sr
SrSrSrU 4S jrSrU =r$ )	InternVLVisionPreTrainedModel   ri   internvl_visionpixel_valuesTInternVLVisionLayerc                 n  > [         TU ]  U5        [        U[        5      (       a  UR                  R
                  R                  5         UR                  b$  UR                  R
                  R                  5         UR                  b%  UR                  R
                  R                  5         gg[        U[        5      (       as  UR                  R
                  R                  U R                  R                  5        UR                  R
                  R                  U R                  R                  5        gg)zInitialize the weightsN)r#   _init_weights
isinstanceInternVLVisionEmbeddings	cls_tokendatazero_
mask_tokenposition_embeddingsr   lambda_1fill_ri   layer_scale_init_valuelambda_2)r+   rM   r.   s     r/   r   +InternVLVisionPreTrainedModel._init_weights   s    f%f677!!'')  ,!!&&,,.))5**//557 6 344OO  &&t{{'I'IJOO  &&t{{'I'IJ 5r1    )rF   rG   rH   rI   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rJ   rK   rL   s   @r/   r   r      sF      )$O&*#./N"&K Kr1   r   z7
    Class for outputs of [`InternVLVisionModel`].
    )custom_introc                       \ rS rSrSrSrg)$InternVLVisionModelOutputWithPooling   a2  
pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
    Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
    *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
    will be returned.
r   N)rF   rG   rH   rI   r   rJ   r   r1   r/   r   r      s    r1   r   c                   f   ^  \ rS rSrSrU 4S jrS\R                  S\R                  4S jrSr	U =r
$ )InternVLVisionPatchEmbeddings   z
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
Transformer.
c                 H  > [         TU ]  5         UR                  UR                  p2UR                  UR
                  pTUS   US   -  US   US   -  -  nUS   US   -  US   US   -  4nX l        X0l        X@l        X`l        Xpl        [        R                  " XEX3S9U l
        g )Nr   r   )kernel_sizestride)r#   r$   
image_size
patch_sizenum_channelsr,   num_patchespatch_shaper%   Conv2d
projection)	r+   ri   r   r   r   r,   r   r   r.   s	           r/   r$   &InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L:ir1   r   returnc                     UR                   u  p#pEX0R                  :w  a  [        S5      eU R                  U5      nUR                   S   UR                   S   pUR	                  S5      R                  SS5      nXgU44$ )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   r   )rB   r   rq   r   flattenr[   )	r+   r   r   r   heightwidth
embeddingspatch_heightpatch_widths	            r/   r>   %InternVLVisionPatchEmbeddings.forward   s    2>2D2D/
&,,,w  __\2
$.$4$4Q$79I9I!9Lk''*44Q:
+666r1   )r   r   r   r   r   r   )rF   rG   rH   rI   r   r$   r'   r   r>   rJ   rK   rL   s   @r/   r   r      s.    j7ELL 7U\\ 7 7r1   r   c                      ^  \ rS rSrSrS\SS4U 4S jjrS\R                  S\	S	\	S\R                  4S
 jr
 SS\R                  S\\R                     S\R                  4S jjrSrU =r$ )r      z[
Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

ri   r   Nc                 ^  > [         TU ]  5         [        R                  " [        R
                  " SSUR                  5      5      U l        UR                  (       a<  [        R                  " [        R
                  " SSUR                  5      5      U l	        OS U l	        [        U5      U l        UR                  U l        [        UR                  [        R                   R"                  5      (       a  UR                  OUR                  UR                  4U l        U R                  R$                  nUR&                  (       a?  [        R                  " [        R
                  " SUS-   UR                  5      5      U l        OS U l        [        R*                  " UR,                  5      U l        g )Nr   )r#   r$   r%   r&   r'   zerosr,   r   use_mask_tokenr   r   patch_embeddingsr   r   r   collectionsabcIterabler    use_absolute_position_embeddingsr   r}   hidden_dropout_probrS   )r+   ri   r   r.   s      r/   r$   !InternVLVisionEmbeddings.__init__   s'   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EFF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r1   r   r   r   c                    UR                   S   S-
  nU R                  R                   S   S-
  n[        R                  R	                  5       (       d  XE:X  a  X#:X  a  U R                  $ U R                  SS2SS24   nU R                  SS2SS24   nUR                   S   nX R
                  S   -  n	X0R
                  S   -  n
[        US-  5      nUR                  SXU5      nUR                  SSSS5      n[        R                  R                  UX4SS	S
9nUR                  SSSS5      R                  SSU5      n[        R                  " Xg4SS9$ )a  
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
images. This method is also adapted to support torch.jit tracing.

Adapted from:
- https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
- https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
r   Nr3   r         ?r   r   bicubicF)r   modealign_cornersrV   )rB   r   r'   jit
is_tracingr   r   r   permuter%   r\   interpolater   cat)r+   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedrW   
new_height	new_widthsqrt_num_positionss               r/   interpolate_pos_encoding1InternVLVisionEmbeddings.interpolate_pos_encoding  s]    !&&q)A-0066q9A= yy##%%+*F6?+++221bqb59221ab59r"q11
__Q//	&}c'9:)11!5G]`a)11!Q1=--33(	 4 
 *11!Q1=BB1b#Nyy/;CCr1   r   bool_masked_posc                    UR                   u    p4nU R                  U5      u  nu  pxUR                  5       u  pnUbI  U R                  R	                  XS5      nUR                  S5      R                  U5      nUSU-
  -  X-  -   nU R                  R	                  U	SS5      n[        R                  " X4SS9nU R                  b  X`R                  XdU5      -   nU R                  U5      nXgU44$ )Nr3   r   rV   )rB   r   r   r   expand	unsqueezetype_asr   r'   r   r   r   rS   )r+   r   r   r   r   r   r   r   r   r   r   mask_tokensw
cls_tokenss                 r/   r>    InternVLVisionEmbeddings.forward:  s    
 +001e262G2G2U/
/\!+!2
Q&//00bIK))"-55kBA#q1u-?J^^**:r2>
YY
7Q?
##/#&C&CJX]&^^J\\*-
+666r1   )r   rS   r   r   r   r   r   N)rF   rG   rH   rI   r   r   r$   r'   r   intr   r   
BoolTensorr>   rJ   rK   rL   s   @r/   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 7;7ll7 "%"2"237 
	7 7r1   r   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )InternVLVisionMLPiT  c                   > [         TU ]  5         Xl        [        UR                     U l        [        R                  " UR                  UR                  5      U l
        [        R                  " UR                  UR                  5      U l        g r   )r#   r$   ri   r   
hidden_actactivation_fnr%   rw   r,   intermediate_sizefc1fc2r+   ri   r.   s     r/   r$   InternVLVisionMLP.__init__U  sb    #F$5$5699V//1I1IJ99V55v7I7IJr1   r;   r   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r  )r+   r;   s     r/   r>   InternVLVisionMLP.forward\  s4    /**=9/r1   )r   ri   r   r  )
rF   rG   rH   rI   r$   r'   r   r>   rJ   rK   rL   s   @r/   r   r   T  s)    KU\\ ell  r1   r   )
layer_normrms_normc                      ^  \ rS rSrSrS\SS4U 4S jjr SS\R                  S\	S\
\\R                     \\R                  \R                  4   4   4S	 jjrS
rU =r$ )r   if  z?This corresponds to the Block class in the timm implementation.ri   r   Nc                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        UR                     " UR                  UR                  S9U l        [        UR                     " UR                  UR                  S9U l        UR                  n[        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R                   " U["        R$                  " UR                  5      -  SS9U l        [        R*                  " UR,                  5      U l        g )Nr   r-   T)requires_grad)r#   r$   chunk_size_feed_forwardseq_len_dimrg   	attentionr   mlpNORM2FN	norm_typer,   layer_norm_epslayernorm_beforelayernorm_afterr   r%   r&   r'   r(   r   r   r}   r   rS   )r+   ri   init_valuesr.   s      r/   r$   InternVLVisionLayer.__init__i  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::f>P>P3Q%Qaef[5::f>P>P3Q%Qaefzz&"<"<=r1   r;   r   c                    U R                  U R                  U5      US9u  p4U R                  U-  nX1-   nU R                  U5      nU R	                  U5      nU R                  U5      nU R                  b  U R                  U-  nXQ-   nXT4$ )N)r   )r  r  r   r  r  rS   r   )r+   r;   r   attention_outputattention_weightslayer_outputs         r/   r>   InternVLVisionLayer.forwardx  s    
 /3nn!!-0/ /= /
+
  ==+;; )8 ++M:xx-||L1==$==<7L $3..r1   )	r  r  rS   r   r   r  r  r  r  )F)rF   rG   rH   rI   r   r   r$   r'   r   boolr   rA   r>   rJ   rK   rL   s   @r/   r   r   f  sn    I>3 > >$ #(/||/  / 
uU\\"E%,,*D$EE	F	/ /r1   r   c                   ~   ^  \ rS rSrS\SS4U 4S jjr\  SS\R                  S\	S\	S\
\\4   4S	 jj5       rS
rU =r$ )InternVLVisionEncoderi  ri   r   Nc                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        g s  snf )NF)
r#   r$   ri   r%   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r+   ri   ir.   s      r/   r$   InternVLVisionEncoder.__init__  sS    ]]vOgOgIh#iIhA$7$?Ih#ij
&+# $js   A%r;   r   output_hidden_statesc                     U(       a  SOS nU(       a  SOS n[        U R                  5       H/  u  pgU(       a  XA4-   nU" X5      nUS   nU(       d  M'  XXS   4-   nM1     U(       a  XA4-   n[        UUUS9$ )Nr   r   r   last_hidden_stater;   
attentions)	enumerater#  r   )	r+   r;   r   r'  all_hidden_statesall_self_attentionsr%  layer_modulelayer_outputss	            r/   r>   InternVLVisionEncoder.forward  s     #7BD$5b4(4OA#$58H$H!(JM)!,M  &91=M<O&O#  5   14D D++*
 	
r1   )ri   r$  r#  )FF)rF   rG   rH   rI   r   r$   r   r'   r   r  r   rA   r   r>   rJ   rK   rL   s   @r/   r  r    sg    ,3 , ,  #(%*	
||
  
 #	

 
uo%	&
 
r1   r  c                      ^  \ rS rSrS\SS4U 4S jjrS r\\   SS\	R                  S\\	R                     S	\\   S
\\   S\\\4   4
S jj5       5       rSrU =r$ )InternVLVisionModeli  ri   r   Nc                 8  > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  (       a  [        R                  " 5       O([        R                  " UR                  UR                  S9U l        U R                  5         g )Nr
  )r#   r$   ri   r   r   r  encoderuse_mean_poolingr%   r~   	LayerNormr,   r  	layernorm	post_initr  s     r/   r$   InternVLVisionModel.__init__  sm     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r1   c                 .    U R                   R                  $ r   )r   r   rC   s    r/   get_input_embeddings(InternVLVisionModel.get_input_embeddings  s    ///r1   r   r   r   r'  c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                  XS9u  pVU R	                  UUUS9nUS   nU R                  U5      n[        UUR                  UR                  S9$ )z
bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
    Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
)r   )r   r'  r   r)  )	ri   r   r'  r   r5  r8  r   r;   r+  )	r+   r   r   r   r'  embedding_outputr   encoder_outputssequence_outputs	            r/   r>   InternVLVisionModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 #oolo\,,/!5 ' 

 *!,..93-)77&11
 	
r1   )ri   r   r5  r8  )NNN)rF   rG   rH   rI   r   r$   r<  r   r   r'   r   r   r   r  r   rA   r   r>   rJ   rK   rL   s   @r/   r3  r3    s    3  0  7;,0/3
ll
 "%"2"23
 $D>	

 'tn
 
u::	;
  
r1   r3  c                   @    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSrSrg)InternVLPreTrainedModeli  ri    Tpast_key_valuesr   N)rF   rG   rH   rI   r   r   r   r   _skip_keys_device_placementr   r   _can_compile_fullgraphr   r   rJ   r   r1   r/   rD  rD    s7    &*#"3N!"&r1   rD  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )InternVLMultiModalProjectori  ri   c                 0  > [         TU ]  5         [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  5      U l        [        R                  " UR                  R
                  [        SUR                  -  5      S-  -  UR                  R
                  5      U l        [        UR                     U l        [        R                  " UR                  R
                  UR                  R
                  5      U l        g )Nr   r   )r#   r$   r%   r7  vision_configr,   r   downsample_ratior  rw   text_configlinear_1r   projector_hidden_actactlinear_2r  s     r/   r$   $InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar1   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  rO  rQ  rR  )r+   image_featuresr;   s      r/   r>   #InternVLMultiModalProjector.forward  s@    7m4/m4r1   )rQ  r  rO  rR  )	rF   rG   rH   rI   r   r$   r>   rJ   rK   rL   s   @r/   rJ  rJ    s    b~ b r1   rJ  zM
    Base class for InternVL outputs, with hidden states and attentions.
    c                   B    \ rS rSr% SrSr\\R                     \	S'   Sr
g)InternVLModelOutputWithPasti  a  
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
Nimage_hidden_statesr   )rF   rG   rH   rI   r   rY  r   r'   FloatTensorr   rJ   r   r1   r/   rX  rX    s    
 8<%"3"34;r1   rX  zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    c            #       l  ^  \ rS rSrSS0rS\4U 4S jjrS rS rS r	S	 r
  S!S
\R                  S\\\\\   4      S\\   4S jjrS\R&                  S\R                  S\R                  4S jr\\             S"S\R&                  S
\R                  S\\R.                     S\\R&                     S\\   S\\R                     S\\\\\   4      S\\   S\\   S\\   S\\   S\\   S\\R&                     S\\   S\\\4   4S jj5       5       rS#S\R.                  S\4S jjr S r!U =r"$ )$InternVLModeli)  zlanguage_model.modellanguage_modelri   c                    > [         TU ]  U5        [        R                  " UR                  5      U l        [        U5      U l        [        R                  " UR                  5      U l	        U R                  5         g r   )r#   r$   r   from_configrL  vision_towerrJ  multi_modal_projectorrN  r]  r9  r  s     r/   r$   InternVLModel.__init__1  sY     %11&2F2FG%@%H"'33F4F4FGr1   c                 6    U R                   R                  5       $ r   )r]  r<  rC   s    r/   r<  "InternVLModel.get_input_embeddings9  s    ""7799r1   c                 :    U R                   R                  U5        g r   )r]  set_input_embeddingsr+   rP   s     r/   rf  "InternVLModel.set_input_embeddings<  s    007r1   c                     Xl         g r   r]  r+   decoders     r/   set_decoderInternVLModel.set_decoder?  s    %r1   c                     U R                   $ r   rj  rC   s    r/   get_decoderInternVLModel.get_decoderB  s    """r1   r   vision_feature_layervision_feature_select_strategyc                 <   Ub  UOU R                   R                  nUb  UOU R                   R                  nU R                   R                  nUS:X  a  U R	                  US9R
                  nOU R                  US9R                  U   nUS:X  a  USS2SS2SS24   nUR                  S   n[        US-  5      nUR                  S   n	UR                  XUS5      nU R                  XeS9nUR                  U	SUR                  S   5      nU R                  U5      nU$ )	a  
Obtains image last hidden states from the vision tower and apply multimodal projection.

Args:
    pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
       The tensors corresponding to the input images.
    vision_feature_layer (`int` or `list[int]`):
        Layer index or list of layer indices to extract features from.
Returns:
    vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
Nr3   )r   defaultr   r   r   )scale_factor)ri   rr  rs  rM  r`  r*  vision_modelr;   rB   r   r   pixel_shufflera  )
r+   r   rr  rs  r_   rM  vision_featureschannelsfeature_sizer   s
             r/   get_image_features InternVLModel.get_image_featuresE  s?   & %9$D $++JjJj 	
 .9 +;; 	'  ;;772%"//\/J\\O"//\/JXXYmnO)Y6-aQh7O #((+8S=)$**1-
 *11*LZ\] ,,_,\ *11*b/BWBWXZB[\ 44_Er1   	input_idsinputs_embedsrU  c           	      J   Ucj  X R                  5       " [        R                  " U R                  R                  [        R
                  UR                  S95      :H  nUR                  S5      nOXR                  R                  :H  nUR                  5       nUR                  S5      R                  U5      R                  UR                  5      nUR                  S   UR                  S   -  nX$   R                  5       UR                  5       :w  a  [        SU SU 35      eU$ )z
Obtains multimodal placeholdr mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
equal to the length of multimodal features. If the lengths are different, an error is raised.
)r5   devicer3   r   r   z6Image features and image tokens do not match: tokens: z, features )r<  r'   tensorri   image_token_idlongr  allsumr   	expand_asr6   rB   numelrq   )r+   r~  r  rU  special_image_maskn_image_tokensn_image_featuress          r/   get_placeholder_mask"InternVLModel.get_placeholder_maskz  s    !.2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*kk.H.H!H+//1/99"=GGVYYZgZnZno)//2^5I5I!5LL,2248L8L8NNHHXXcdtcuv  "!r1   rQ   position_idsrF  	use_cacher   r'  return_dictcache_positionr_   r   c                    U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R
                  nUS L US L-  (       a  [        S5      eUc  U R                  5       " U5      nUbY  U R                  UUUS9nUR                  UR                  UR                  5      nU R                  XUS9nUR                  UU5      nU R                  " SUUUUU	U
USUS.	UD6n[        UR                   UR"                  UR$                  UR&                  Ub  WS9$ S S9$ )Nz:You must specify exactly one of input_ids or inputs_embedsr   rr  rs  )r  rU  T)	rQ   r  rF  r  r  r   r'  r  r  )r*  rF  r;   r+  rY  r   )ri   r   r'  use_return_dictrr  rs  rq   r<  r|  r6   r  r5   r  masked_scatterr]  rX  r*  rF  r;   r+  )r+   r~  r   rQ   r  rF  r  rr  rs  r  r   r'  r  r  r_   rU  r  r   s                     r/   r>   InternVLModel.forward  s   & 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	
 .9 +;; 	' -t";<YZZ  557	BM#!44)%9/M 5 N
 ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r1   ry  rv  c           
         UR                  5       u  p4pVXR-  S:w  d  XB-  S:w  a  [        S5      eUR                  X4[        XR-  5      [        Xb-  5      5      nUR	                  SSSS5      R                  5       nUR                  U[        XR-  5      [        XB-  5      [        XbS-  -  5      5      nUR	                  SSSS5      R                  5       nU$ )a  Perform pixel shuffle downsampling on vision features.

Args:
    vision_features (`torch.Tensor`):
        Input tensor of shape (batch_size, width, height, channels).
    scale_factor (`float`, *optional*, defaults to `0.5`):
        Factor by which to downsample. Default is 0.5, which halves the dimensions.

Returns:
    vision_features (`torch.Tensor`):
        Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r   rq   r   r   r   r^   )r+   ry  rv  r   r   r   rz  s          r/   rx  InternVLModel.pixel_shuffle  s     />.B.B.D+
6 A%)=)Bjkk *..s6#893x?V;W
 *11!Q1=HHJ *..F12C8L4MsS[mn_nSoOp

 *11!Q1=HHJr1   )r]  ra  r`  r   )NNNNNNNNNNNNN)r   )#rF   rG   rH   rI   _checkpoint_conversion_mappingr   r$   r<  rf  rm  rp  r'   rZ  r   r   r   liststrr|  
LongTensorr  r   r   r   r	   r  r   r   rA   rX  r>   floatrx  rJ   rK   rL   s   @r/   r\  r\  )  s     '=>N%O"~ :8&# AE8<	3''3 'uS$s)^'<=3 )1	3j"))":?:K:K"]b]n]n"0  '+*.1537+/59@D8<$(,0/3&*59D
##D
 ''D
 !.	D

 u//0D
 "%D
   1 12D
 'uS$s)^'<=D
 )1D
 D>D
 $D>D
 'tnD
 d^D
 !!1!12D
 -.D
  
u11	2!D
  D
L!U\\ ! ! !r1   r\  zT
    Base class for InternVL causal language model (or autoregressive) outputs.
    c                      \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\R                     \	S	'   S
rg)InternVLCausalLMOutputWithPasti  a)  
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
image_hidden_states (`torch.FloatTensor`, *optional*):
    A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
    image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
NlosslogitsrF  r;   r+  rY  r   )rF   rG   rH   rI   r   r  r   r'   rZ  r   r  rF  r  r;   rA   r+  rY  rJ   r   r1   r/   r  r    s      )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r1   r  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c            )         ^  \ rS rSrSSSSS.rS/rS\4U 4S	 jjrS
 rS r	S\
R                  4S jrS rS r  S(S\R                   S\\\\\   4      S\\   4S jjr\S 5       r\S 5       r\S 5       r\\                S)S\R:                  S\R                   S\\R<                     S\\R:                     S\\   S\\R                      S\\\\\   4      S\\   S\\R:                     S\\    S\\    S\\    S \\    S!\\R:                     S"\\\R<                  4   S#\\R<                     S$\!\"   S\\#\$4   4$S% jj5       5       r%      S*U 4S& jjr&S'r'U =r($ )+ InternVLForConditionalGenerationi  zmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightri   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  R                  UR                  R                  SS9U l	        U R                  5         g )NFrk   )r#   r$   r\  modelr%   rw   rN  r,   
vocab_sizer  r9  r  s     r/   r$   )InternVLForConditionalGeneration.__init__+  sS     "6*
yy!3!3!?!?ASASA^A^ejkr1   c                 6    U R                   R                  5       $ r   )r  r<  rC   s    r/   r<  5InternVLForConditionalGeneration.get_input_embeddings1  s    zz..00r1   c                 :    U R                   R                  U5        g r   )r  rf  rg  s     r/   rf  5InternVLForConditionalGeneration.set_input_embeddings4  s    

''.r1   r   c                     U R                   $ r   )r  rC   s    r/   get_output_embeddings6InternVLForConditionalGeneration.get_output_embeddings7  s    ||r1   c                 :    U R                   R                  U5        g r   )r  rm  rk  s     r/   rm  ,InternVLForConditionalGeneration.set_decoder:  s    

w'r1   c                 6    U R                   R                  5       $ r   )r  rp  rC   s    r/   rp  ,InternVLForConditionalGeneration.get_decoder=  s    zz%%''r1   r   rr  rs  c                 B    U R                   R                  " SUUUS.UD6$ )Nr  r   )r  r|  )r+   r   rr  rs  r_   s        r/   r|  3InternVLForConditionalGeneration.get_image_features@  s3     zz,, 
%!5+I
 	
 	
r1   c                 .    U R                   R                  $ r   )r  r]  rC   s    r/   r]  /InternVLForConditionalGeneration.language_modelO  s    zz(((r1   c                 .    U R                   R                  $ r   )r  r`  rC   s    r/   r`  -InternVLForConditionalGeneration.vision_towerS  s    zz&&&r1   c                 .    U R                   R                  $ r   )r  ra  rC   s    r/   ra  6InternVLForConditionalGeneration.multi_modal_projectorW  s    zz///r1   r~  rQ   r  rF  r  labelsr  r   r'  r  r  logits_to_keepimage_sizesr_   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R
                  nU R                  " SUUUUUUUUU
UUSUUS.UD6nUS   n[        U[        5      (       a  [        U* S5      OUnU R                  USS2USS24   5      nSnU	b3  U R                  " SUXR                   R                  R                  S.UD6n[        UUUR                  UR                   UR"                  UR$                  S9$ )ay  
Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, AutoModelForImageTextToText

>>> torch_device = "cuda"
>>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
>>> model = AutoModelForImageTextToText.from_pretrained(
...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
... )

>>> messages = [
...     {
...         "role": "user",
...         "content": [
...             {
...                 "type": "image",
...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
...             },
...             {
...                 "type": "image",
...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
...             },
...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
...         ],
...     },
... ]

>>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
>>> generate_ids = model.generate(**inputs, max_new_tokens=200)
>>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
The images depict the Statue of Liberty and the Golden Gate Bridge.
```NT)r~  r   rQ   r  rF  r  rr  rs  r  r   r'  r  r  r  r   )r  r  r  )r  r  rF  r;   r+  rY  r   )ri   r   r'  r  rr  rs  r  r   r   slicer  loss_functionrN  r  r  rF  r;   r+  rY  )r+   r~  r   rQ   r  rF  r  rr  rs  r  r  r   r'  r  r  r  r  r_   r   r;   slice_indicesr  r  s                          r/   r>   (InternVLForConditionalGeneration.forward[  s   r 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	
 .9 +;; 	' ** 
%)%+'!5+I/!5)#
 
$  
8B>SV8W8W~ot4]kmA}a,?@A%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r1   c           	      P   > [         T
U ]  " U4UUUUUS.UD6n	US   S:X  a  XIS'   U	$ )N)rF  r  rQ   r  r  r   r   )r#   prepare_inputs_for_generation)r+   r~  rF  r  r   rQ   r  r  r_   model_inputsr.   s             r/   r  >InternVLForConditionalGeneration.prepare_inputs_for_generation  sT     w<
+')))
 
 !! ,8(r1   )r  r  r   )NNNNNNNNNNNNNNr   N)NNNNNN))rF   rG   rH   rI   r  _tied_weights_keysr   r$   r<  rf  r%   Moduler  rm  rp  r'   rZ  r   r   r   r  r  r|  propertyr]  r`  ra  r   r   r  r   r	   r  r   r   rA   r  r>   r  rJ   rK   rL   s   @r/   r  r    s    "8-"?#,	&" ++~ 1/ryy (( AE8<	
''
 'uS$s)^'<=
 )1	
 ) ) ' ' 0 0  '+*.1537+/59@D8<-1$(,0/3&*5934.2#i
##i
 ''i
 !.	i

 u//0i
 "%i
   1 12i
 'uS$s)^'<=i
 )1i
 ))*i
 D>i
 $D>i
 'tni
 d^i
 !!1!12i
  c5<</0!i
" ell+#i
$ +,%i
& 
u44	5'i
  i
\  r1   r  )r   r3  rD  r\  r  )r   )Bcollections.abcr   dataclassesr   typingr   r   r   r'   torch.nnr%   activationsr   cache_utilsr	   
generationr
   integrationsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   autor   configuration_internvlr   r   r  r    r   r  re   rg   r   r   r   r   r   r7  r  r   r  r3  rD  rJ  rX  r\  r  r  __all__r   r1   r/   <module>r     s  .  ! , ,   !   ) 7 B 9 d d F & a a  H Y'JBII J (J6 %II%<<% 
% <<	%
 U\\*% % %6Hbii HV KO K K2 
+E  !7BII !7L[7ryy [7|		  3H
I-/4 -/`#
BII #
L 2
7 2
 2
j 'o ' '")) $ 
<"9 < <  
M+ M
M` 
<[ < <2 
B'> B
BJr1   