
    Phe                        d dl Zd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.m/Z/  ej`                  e1      Z2	 d@de
jf                  dejh                  dejh                  dejh                  deejh                     de5de5fdZ6 G d de&      Z7 G d de$      Z8e ed !       G d" d#e                    Z9 G d$ d%e
jf                        Z: G d& d'e
jf                        Z; G d( d)e"      Z<e
jz                  e7d*Z> G d+ d,e      Z? G d- d.e
jf                        Z@e G d/ d0e             ZAe G d1 d2eA             ZB G d3 d4e,      ZCdZD G d5 d6e
jf                        ZE G d7 d8e+      ZF G d9 d:e*      ZG G d; d<e(      ZH G d= d>e)      ZIg d?ZJy)A    N)	dataclass)CallableOptionalUnion   )ACT2FN)Cache)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)check_model_inputs   )CLIPMLP)JanusVisionAttention)LlamaRMSNorm)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )InternVLConfigInternVLVisionConfigmodulequerykeyvalueattention_maskscalingdropoutc                    |}|}	t        j                  ||j                  dd            |z  }
|#|d d d d d d d |j                  d   f   }|
|z   }
t        j
                  j                  |
d      }
t        j
                  j                  |
|| j                        }
t        j                  |
|	      }|j                  dd      j                         }||
fS )Nr   r   dim)ptrainingr   )
torchmatmul	transposeshapenn
functionalsoftmaxr(   r/   
contiguous)r"   r#   r$   r%   r&   r'   r(   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                g/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/models/internvl/modular_internvl.pyeager_attention_forwardr?   0   s     JL<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1 ==((2(>L==((6??([L,,|\:K''1-88:K$$    c                       e Zd Zy)InternVLVisionRMSNormN__name__
__module____qualname__ r@   r>   rB   rB   K       r@   rB   c                   p     e Zd Zdef fdZ	 ddej                  deej                     dee	   fdZ
 xZS )InternVLVisionAttentionconfigc                    t         |   |       | `d| _        |j                  }|rt        | j                        nt        j                         | _	        |rt        | j                        | _
        y t        j                         | _
        y NF)super__init__num_key_value_groups	is_causaluse_qk_normrB   	embed_dimr4   Identityq_normk_norm)selfrK   qk_norm	__class__s      r>   rO   z InternVLVisionAttention.__init__P   sd     % $$?F+DNN;BKKM?F+DNN;BKKMr@   hidden_statesr&   r8   c                    |j                         \  }}}| j                  |      }| j                  |      }| j                  |      }	| j	                  |      }| j                  |      }|j                  ||| j                  | j                        j                  dd      }|j                  ||| j                  | j                        j                  dd      }|	j                  ||| j                  | j                        j                  dd      }	t        }
| j                  j                  dk7  rt        | j                  j                     }
 |
| |||	|f| j                  sdn| j                   | j"                  dd|\  }}|j                  ||| j$                        }| j'                  |      }| j)                  |      }||fS )Nr   r   eager        F)r(   r'   rQ   )sizeq_projk_projv_projrU   rV   reshape	num_headshead_dimr2   viewr?   rK   _attn_implementationr   r/   attention_dropoutscalerS   projection_layerprojection_dropout)rW   rZ   r&   r8   
batch_sizeseq_len_query_statesr9   r:   attention_interfacer=   r;   outputs                 r>   forwardzInternVLVisionAttention.forward[   s    "/!3!3!5
GQ{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HJJ
%
 
%
!\ "))*gt~~N&&{3((0|##r@   N)rD   rE   rF   r!   rO   r0   Tensorr   r   r   rq   __classcell__rY   s   @r>   rJ   rJ   O   sK    	Z3 	Z 26'$||'$ !.'$ +,	'$r@   rJ   z7
    Class for outputs of [`InternVLVisionModel`].
    )custom_introc                       e Zd ZdZy)$InternVLVisionModelOutputWithPoolingaF  
    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
        *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
        will be returned.
    N)rD   rE   rF   __doc__rG   r@   r>   rx   rx      s    r@   rx   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                 ^   t         |           |j                  |j                  }}|j                  |j
                  }}|d   |d   z  |d   |d   z  z  }|d   |d   z  |d   |d   z  f}|| _        || _        || _        || _        || _        t        j                  ||||      | _
        y )Nr   r   )kernel_sizestride)rN   rO   
image_size
patch_sizenum_channelshidden_sizenum_patchespatch_shaper4   Conv2d
projection)	rW   rK   r   r   r   r   r   r   rY   s	           r>   rO   z&InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L+:^hir@   pixel_valuesreturnc                    |j                   \  }}}}|| j                  k7  rt        d      | j                  |      }|j                   d   |j                   d   }}|j	                  d      j                  dd      }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r   r   )r3   r   
ValueErrorr   flattenr2   )	rW   r   rk   r   heightwidth
embeddingspatch_heightpatch_widths	            r>   rq   z%InternVLVisionPatchEmbeddings.forward   s    2>2D2D/
L&%4,,,w  __\2
$.$4$4Q$79I9I!9Lk''*44Q:
L+666r@   )	rD   rE   rF   ry   rO   r0   rs   rq   rt   ru   s   @r>   r{   r{      s)    j7ELL 7U\\ 7r@   r{   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z		 dd
ej                  de
ej                     dej                  fdZ xZS )InternVLVisionEmbeddingszc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rK   r   Nc                 2   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r:t        j                  t	        j
                  dd|j                              | _	        nd | _	        t        |      | _        |j                  | _        t        |j                  t        j                   j"                        r|j                  n|j                  |j                  f| _        | j                  j$                  }|j&                  r=t        j                  t	        j
                  d|dz   |j                              | _        nd | _        t        j*                  |j,                        | _        y )Nr   )rN   rO   r4   	Parameterr0   zerosr   	cls_tokenuse_mask_token
mask_tokenr{   patch_embeddingsr   
isinstancer   collectionsabcIterabler    use_absolute_position_embeddingsposition_embeddingsDropouthidden_dropout_probr(   )rW   rK   r   rY   s      r>   rO   z!InternVLVisionEmbeddings.__init__   s$   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r@   r   r   r   c                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  d   z  }	|| j
                  d   z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr+   r         ?r   r   bicubicF)r^   modealign_cornersr,   )r3   r   r0   jit
is_tracingr   r   rb   permuter4   r5   interpolatere   cat)rW   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedr-   
new_height	new_widthsqrt_num_positionss               r>   interpolate_pos_encodingz1InternVLVisionEmbeddings.interpolate_pos_encoding   sj    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"tq11
T__Q//	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr@   r   bool_masked_posc                    |j                   \  }}}}| j                  |      \  }\  }}|j                         \  }	}
}|K| j                  j	                  |	|
d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  j	                  |	dd      }t        j                  ||fd      }| j                  || j                  |||      z   }| j                  |      }|||ffS )Nr+   r   r,   )r3   r   r^   r   expand	unsqueezetype_asr   r0   r   r   r   r(   )rW   r   r   rm   r   r   r   r   r   rk   rl   mask_tokensw
cls_tokenss                 r>   rq   z InternVLVisionEmbeddings.forward   s   
 +001fe262G2G2U/
/\;!+!2
GQ&//00WbIK))"-55kBA#q1u-a?J^^**:r2>
YY
J7Q?
##/#d&C&CJPVX]&^^J\\*-
L+666r@   rr   )rD   rE   rF   ry   r!   rO   r0   rs   intr   r   
BoolTensorrq   rt   ru   s   @r>   r   r      s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 7;7ll7 "%"2"237 
	7r@   r   c                       e Zd Zy)InternVLVisionMLPNrC   rG   r@   r>   r   r     rH   r@   r   )
layer_normrms_normc                        e Zd ZdZdeddf fdZdej                  dee	ej                     e	ej                  ej                  f   f   fdZ
 xZS )InternVLVisionLayerz?This corresponds to the Block class in the timm implementation.rK   r   Nc                    t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                     |j                  |j                        | _        t        |j                     |j                  |j                        | _        |j                  }t        j                   |t#        j$                  |j                        z  d      | _        t        j                   |t#        j$                  |j                        z  d      | _        t        j*                  |j,                        | _        y )Nr   epsT)requires_grad)rN   rO   chunk_size_feed_forwardseq_len_dimrJ   	attentionr   mlpNORM2FN	norm_typer   layer_norm_epslayernorm_beforelayernorm_afterlayer_scale_init_valuer4   r   r0   oneslambda_1lambda_2r   r   r(   )rW   rK   init_valuesrY   s      r>   rO   zInternVLVisionLayer.__init__"  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::f>P>P3Q%Qaef[5::f>P>P3Q%Qaefzz&"<"<=r@   rZ   c                    | j                  | j                  |            \  }}| j                  |z  }||z   }| j                  |      }| j	                  |      }| j                  |      }| j                  | j                  |z  }||z   }|S rr   )r   r   r   r   r   r(   r   )rW   rZ   attention_outputrm   layer_outputs        r>   rq   zInternVLVisionLayer.forward1  s     #nn!!-0
!  ==+;; )=8 ++M:xx-||L1==$==<7L $m3r@   )rD   rE   rF   ry   r!   rO   r0   rs   r   tuplerq   rt   ru   s   @r>   r   r     sZ    I>3 > >|| 
uU\\"E%,,*D$EE	Fr@   r   c                   `     e Zd Zdeddf fdZedej                  dee	e
f   fd       Z xZS )InternVLVisionEncoderrK   r   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w rM   )
rN   rO   rK   r4   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rW   rK   irY   s      r>   rO   zInternVLVisionEncoder.__init__N  sU    ]]vOgOgIh#iIhA$7$?Ih#ij
&+# $js   A#rZ   c                 L    | j                   D ]
  } ||      } t        |      S )N)last_hidden_state)r   r   )rW   rZ   layer_modules      r>   rq   zInternVLVisionEncoder.forwardT  s.    
 !JJL(7M ' +
 	
r@   )rD   rE   rF   r!   rO   r   r0   rs   r   r   r   rq   rt   ru   s   @r>   r   r   M  sK    ,3 , , 	
||	
 
uo%	&	
 	
r@   r   c                   V     e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZeedZ fdZ xZS )InternVLVisionPreTrainedModelrK   internvl_visionr   Tr   )rZ   
attentionsc                 V   t         |   |       t        |t              r|j                  j
                  j                          |j                  $|j                  j
                  j                          |j                  %|j                  j
                  j                          yyt        |t              rs|j                  j
                  j                  | j                  j                         |j                  j
                  j                  | j                  j                         yy)zInitialize the weightsN)rN   _init_weightsr   r   r   datazero_r   r   r   r   fill_rK   r   r   )rW   r"   rY   s     r>   r   z+InternVLVisionPreTrainedModel._init_weightsr  s    f%f67!!'')  ,!!&&,,.))5**//557 6 34OO  &&t{{'I'IJOO  &&t{{'I'IJ 5r@   )rD   rE   rF   r!   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backendr   rJ   _can_record_outputsr   rt   ru   s   @r>   r   r   a  sV      )$O&*#./N"& --
K Kr@   r   c                        e Zd Zdeddf fdZd Zee	 d	dej                  de
ej                     deeef   fd              Z xZS )
InternVLVisionModelrK   r   Nc                 2   t         |   |       || _        t        |      | _        t        |      | _        |j                  rt        j                         n*t        j                  |j                  |j                        | _        | j                          y )Nr   )rN   rO   rK   r   r   r   encoderuse_mean_poolingr4   rT   	LayerNormr   r   	layernorm	post_initrW   rK   rY   s     r>   rO   zInternVLVisionModel.__init__  so     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r@   c                 .    | j                   j                  S rr   )r   r   )rW   s    r>   get_input_embeddingsz(InternVLVisionModel.get_input_embeddings  s    ///r@   r   r   c                     | j                  ||      \  }}| j                  |      }|d   }| j                  |      }t        ||j                  |j
                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   r   )r   rZ   r   )r   r   r   rx   rZ   r   )rW   r   r   embedding_outputrm   encoder_outputssequence_outputs          r>   rq   zInternVLVisionModel.forward  se     #oolOo\!,,'78)!,..93-)77&11
 	
r@   rr   )rD   rE   rF   r!   rO   r  r   r   r0   rs   r   r   r   r   rx   rq   rt   ru   s   @r>   r   r     ss    3  0  7;
ll
 "%"2"23
 
u::	;	
  
r@   r   c                       e Zd Zy)InternVLPreTrainedModelNrC   rG   r@   r>   r  r    rH   r@   r  c                   *     e Zd Zdef fdZd Z xZS )InternVLMultiModalProjectorrK   c                 *   t         |           t        j                  |j                  j
                  t        d|j                  z        dz  z        | _        t        j                  |j                  j
                  t        d|j                  z        dz  z  |j                  j
                        | _        t        |j                     | _        t        j                  |j                  j
                  |j                  j
                        | _        y )Nr   r   )rN   rO   r4   r   vision_configr   r   downsample_ratior   Lineartext_configlinear_1r   projector_hidden_actactlinear_2r   s     r>   rO   z$InternVLMultiModalProjector.__init__  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar@   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rr   )r   r  r  r  )rW   image_featuresrZ   s      r>   rq   z#InternVLMultiModalProjector.forward  s@    7m4/m4r@   )rD   rE   rF   r    rO   rq   rt   ru   s   @r>   r
  r
    s    b~ br@   r
  c                       e Zd Zy)InternVLModelOutputWithPastNrC   rG   r@   r>   r  r    rH   r@   r  c                      e Zd Zddej                  defdZ	 	 ddej                  dee	e
ee
   f      dee   fdZee	 	 	 	 	 	 	 	 	 dd	eej                      deej                     d
eej                     deej                      dee   deej                     dee	e
ee
   f      dee   deej                      dee   de	eef   fd              Zy)InternVLModelvision_featuresscale_factorc           
         |j                         \  }}}}||z  dk7  s||z  dk7  rt        d      |j                  ||t        ||z        t        ||z              }|j	                  dddd      j                         }|j                  |t        ||z        t        ||z        t        ||dz  z              }|j	                  dddd      j                         }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r   )r^   r   re   r   r   r7   )rW   r  r  rk   r   r   channelss          r>   pixel_shufflezInternVLModel.pixel_shuffle  s     />.B.B.D+
E68L A%)=)Bjkk *..s6L#893x,?V;W
 *11!Q1=HHJ *..F\12C8L4MsS[_kmn_nSoOp

 *11!Q1=HHJr@   Nr   vision_feature_layervision_feature_select_strategyc                    ||n| j                   j                  }||n| j                   j                  }|j                  | j                        }| j                   j
                  }|dk(  r| j                  |      j                  }n| j                  |      j                  |   }|dk(  r|ddddddf   }|j                  d   }t        |dz        }|j                  d   }	|j                  |	||d      }| j                  ||	      }|j                  |	d|j                  d         }| j                  |      }|S )
a%  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `list[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        N)dtyper+   )r   defaultr   r   r   )r  )rK   r  r   tor"  r  vision_towerr   vision_modelrZ   r3   r   rb   r  multi_modal_projector)
rW   r   r  r   r8   r  r  r  feature_sizerk   s
             r>   get_image_featuresz InternVLModel.get_image_features  sX   & %9$D $++JjJj 	
 .9 +;; 	'
 $TZZ8;;772%"//\/J\\O"//\/JXXYmnO)Y6-aQh7O #((+8S=)$**1-
 *11*lLZ\] ,,_K[,\ *11*b/BWBWXZB[\ 44_Er@   	input_idsr&   position_idspast_key_valuesinputs_embedscache_positionr8   r   c
           	      8   ||n| j                   j                  }||n| j                   j                  }|d u |d uz  rt        d      | | j	                         |      }|`| j                  |||      }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d|||||	d|
}t        |j                  |j                  |j                  |j                   |      S d       S )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r  r   )r-  r  )r&   r+  r,  r-  r.  )r   r,  rZ   r   image_hidden_statesrG   )rK   r  r   r   r  r)  r$  devicer"  get_placeholder_maskmasked_scatterlanguage_modelr  r   r,  rZ   r   )rW   r*  r   r&   r+  r,  r-  r  r   r.  r8   r  special_image_maskoutputss                 r>   rq   zInternVLModel.forward"  sj     %9$D $++JjJj 	
 .9 +;; 	' -t";<YZZ 7D557	BM#!44)%9/M 5 N
 ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M%$%% 
)%+')
 
 +%77#33!//))2>2J
 	

 QU
 	
r@   )r   )NN)	NNNNNNNNN)rD   rE   rF   r0   rs   floatr  FloatTensorr   r   r   liststrr)  r   r   
LongTensorr	   r   r   r   r  rq   rG   r@   r>   r  r    sr   !U\\ ! !L AE8<	4''4 'uS$s)^'<=4 )1	4l  15481537+/59@D8<597
E,,-7
 u0017
 !.	7

 u//07
 "%7
   1 127
 'uS$s)^'<=7
 )17
 !!1!127
 +,7
 
u11	27
  7
r@   r  c                       e Zd Zy)InternVLCausalLMOutputWithPastNrC   rG   r@   r>   r=  r=  ^  rH   r@   r=  c                        e Zd Z fdZ xZS ) InternVLForConditionalGenerationc                  :     t               j                  di |  y)ac  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```NrG   )rN   rq   )super_kwargsrY   s    r>   rq   z(InternVLForConditionalGeneration.forwardc  s    H 	','r@   )rD   rE   rF   rq   rt   ru   s   @r>   r?  r?  b  s    $( $(r@   r?  )r   r   r  r  r?  )r]   )Kcollections.abcr   dataclassesr   typingr   r   r   r0   torch.nnr4   activationsr   cache_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   clip.modeling_clipr   janus.modeling_janusr   llama.modeling_llamar   llava.modeling_llavar   r   r   r   r   configuration_internvlr    r!   
get_loggerrD   loggerModulers   r7  r?   rB   rJ   rx   r{   r   r   r   r   r   r   r   r   r  INTERNVL_INPUTS_DOCSTRINGr
  r  r  r=  r?  __all__rG   r@   r>   <module>rX     s  "  ! , ,   !   9 K F & ] ] / ( 7 /  I 
		H	% %II%<<% 
% <<	%
 U\\*% % %6	L 	3$2 3$l 
+E  !7BII !7L[7ryy [7|	 	 3H
I+4 +\
BII 
( KO K K< '
7 '
 '
T	2 	 ! ")) $	": 	S
J S
l	%@ 	%('D %(Pr@   