
    <h?                         S SK JrJr  S SKrSSKJr  SSKJrJ	r	J
r
  SSKJrJrJrJrJr  SSKJrJr  SSKJrJr   " S	 S
\SS9r " S S\SS9r " S S\5      rS/rg)    )OptionalUnionN   )BatchFeature)
ImageInputconcatenate_listmake_flat_list_of_images)ImagesKwargsMultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)
VideoInputmake_batched_videosc                   F    \ rS rSr% \\   \S'   \\   \S'   \\   \S'   Srg)InternVLImagesKwargs   crop_to_patchesmin_patchesmax_patches N)	__name__
__module____qualname____firstlineno__r   bool__annotations__int__static_attributes__r       h/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/internvl/processing_internvl.pyr   r      s     d^###r"   r   F)totalc                   6    \ rS rSr% \\S'   SSS.SS00 S.rS	rg
)InternVLProcessorKwargs!   images_kwargsleftF)padding_sidereturn_mm_token_type_idsr   T)text_kwargsr(   videos_kwargsr   N)r   r   r   r   r   r   	_defaultsr!   r   r"   r#   r&   r&   !   s-    '' #(-

 t
 	Ir"   r&   c                   H  ^  \ rS rSrSr/ SQrSrSrSr     SS\	4U 4S jjjr
S	\\   S
\\	   S\\	   S\R                  S\R                  S\R                  4S jr    SS\\   S	\\\\\\   \\   4      S\\   S\\   S\4
S jjrSS jrS rS r\S 5       rSrU =r $ )InternVLProcessor/   a  
Constructs a InternVL processor which wraps a [`AutoImageProcessor`] and
[`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and
tokenizer functionalities. See the [`~InternVLProcessor.__call__`] and [`~InternVLProcessor.decode`] for more information.
Args:
    image_processor ([`AutoImageProcessor`], *optional*):
        The image processor is a required input.
    tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*):
        The tokenizer is a required input.
    video_processor ([`AutoVideoProcessor`], *optional*):
        The video processor is a required input.
    image_seq_length (`int`, *optional*, defaults to 256):
        The number of image token to use per image patch. it should be set so that:
        image_seq_length = (config.image_size // config.patch_size) ** 2 * (config.scale_factor**2)
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
)image_processor	tokenizervideo_processorAutoImageProcessorAutoVideoProcessorAutoTokenizerimage_seq_lengthc                 x  > X@l         UR                  U l        UR                  U l        UR                  U l        UR                  U l        UR
                  U l        UR                  U l        UR                  U l	        U R                  U R                  U R                  /U l
        [        TU ]0  " XU4SU0UD6  g )Nchat_template)r8   start_image_tokenend_image_tokenstart_image_token_idend_image_token_idcontext_image_tokenimage_tokenvideo_tokencontext_image_token_idimage_token_id	image_idssuper__init__)selfr2   r3   r4   r8   r:   kwargs	__class__s          r#   rF   InternVLProcessor.__init__G   s     !1!*!<!<(88$-$B$B!"+">">$88$00'>>--t/H/H$JaJab_lTaleklr"   textimage_num_patchesvideo_num_patchesimage_num_patches_indicesvideo_num_patches_indicesvideo_patch_indicesc	           	      x  ^ ^ Sn	Sn
/ n/ n/ nU GH$  nUnT R                   U;   d  T R                  U;   Ga  T R                   U;   a  T R                  U;  d8  UR                  T R                   5      UR                  T R                  5      :  a  U	S:  a  XiS-
     OSnXi   nUR                  UUU 5        UR	                  T R                   SS5      nUR                  T R
                   T R                   T R                  -  XI   -   T R                   35        U	S-  n	OU
S:  a  XS-
     OSnX   nU
S:  a  UU   OSnUUS-
     nUR                  UUU 5        [        UUU 5      mSR                  UU 4S j[        [        T5      5       5       5      nUR                  U5        UR	                  T R                  SS5      nU
S-  n
T R                   U;   a  GM  T R                  U;   a  GM  SU;   a,  UR                  S5      nUR	                  SUS5      nSU;   a  M,  UR                  U5        GM'     XX4$ )z
Processes interleaved text with <image> and <video> placeholders, replacing them with appropriate
image and video tokens while keeping track of the patches used.
r      z<placeholder>
c              3      >#    U HE  nS US-    STR                    TR                  TR                  -  TU   -   TR                   3v   MG     g7f)FramerR   z: N)r;   r@   r8   r<   ).0inum_patchesrG   s     r#   	<genexpr>?InternVLProcessor._insert_media_placeholders.<locals>.<genexpr>   sr      -!8A  Awb)?)?(@AQAQTXTiTiAilwxylzAz@{  }A  }Q  }Q  |R  S!8s   AA)r@   rA   indexappendreplacer;   r8   r<   listjoinrangelenpop)rG   rK   image_pixel_valuesvideo_pixel_valuesrL   rM   rN   rO   rP   image_indexvideo_indexprocessed_textimage_video_patchesreplace_stringsprompt
new_promptstart_index	end_indexcurrent_patch_indexend_patch_indexvideo_promptreplace_strrX   s   `                     @r#   _insert_media_placeholders,InternVLProcessor._insert_media_placeholders\   s      FJ""j0D4D4D
4R##z1$$J6!''(8(89J<L<LTM]M]<^^ Q\^_P_";!O"LefK 9 FI'../A+i/XY!+!3!3D4D4DoWX!YJ#**11243C3CdF[F[3[^o^|3|2}  C  S  S  ~T  U  1$K
 S^`aRa*=Ao*Ngh'&9&FOT_bcTc";<O"PijK 9/A:M NI'../A+i/XY"&'89L_']"^K#'99 -!&s;'7!8- $L $**<8!+!3!3D4D4DoWX!YJ1$KA ""j0D4D4D
4RB "Z/-11!4'//aP
 "Z/ !!*-M P KLLr"   imagesvideosrH   returnc           
         Uc  [        S5      eU R                  " [        4SU R                  R                  0UD6n[        U[        [        45      (       d  U/n/ n/ n0 n	Sn
Sn[        R                  " S/5      n[        R                  " S/5      n[        R                  " S/5      nUbZ  [        U5      nU R                  " SSU0US   D6nUR                  S5      nUR                  S5      n
[        R                  " U5      nUb  [        U5      nU R                  " SS	U0US
   D6nUR                  S5      nU Vs/ sH  n[!        U5      PM     nnU VVs/ sH  n[#        U5       H  nSPM     M     nnn[        R                  " U5      n[        R                  " U5      nUR%                  SS5      nUc  Ubd  U R'                  UU
UUUUUU5      u  nnnnUb  U[!        U5      :w  a  [        S5      eUb  U[!        U5      :w  a  [        S5      eS[)        U5      0n	US   R                  SS5      nUS   R                  SS5      nU R                  " U40 US   D6nU R+                  UUS/S9  U(       ai  [        R                  " US   5      n[        R,                  " US   5      nSU[        R.                  " UU R0                  5      '   UR3                  5       US'   [5        0 UEU	EUS9$ s  snf s  snnf )a  
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text if `text`
is not `None`, otherwise encode default OCR queries which depends on the `format`, `box`, `color`, `multi_page` and
`crop_to_patches` arguments. To prepare the vision inputs, this method forwards the `images` and `kwrags` arguments to
GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    videos (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:
        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
NzYou have to specify text.tokenizer_init_kwargsr   rt   r(   rX   pixel_valuesru   r-   pixel_values_videosrR   zONumber of image placeholders in the prompt does not match the number of images.zONumber of video placeholders in the prompt does not match the number of videos.r,   return_tensorsr+   image)
modalities	input_idsmm_token_type_ids)datatensor_typer   )
ValueError_merge_kwargsr&   r3   init_kwargs
isinstancer^   tuplenparrayr	   r2   rb   cumsumr   r4   ra   r`   flattenrr   r   _check_special_mm_tokens
zeros_likeisinrD   tolistr   )rG   rt   rK   audioru   rH   output_kwargsrL   rM   image_videos_inputsrc   rd   rN   rP   rO   image_inputsvideo_inputsvideonum_frames_per_videoframes_rh   re   rf   r{   r+   text_inputs	array_idsr   s                                r#   __call__InternVLProcessor.__call__   s-   R <899**#
"&.."<"<
 
 $u..6D  !!$&HHaSM! hhsm$&HHaSM!-f5F//`v`A_`L , 0 0 ?!-!1!1.!A(*		2C(D%(0F//`v`A_`L!-!1!12G!H =O#O<N5CJ<N #O1E ]1EvuU[}!}1E ]"$)),@"A(*		2C(D%!3!;!;Aq!A!3BFBaBa""!!))#	C?D%{K !kS[&@ !rss!kS[&@ !rss $23CDW3X"Y&}599:JDQ#0#?#C#CD^`d#e nnTJ]=-IJ%%dKWI%N#[!9:I "k+.F GDEbggi@A/@/G/G/IK+,!GK!G3F!GUcddI $P ]s    K3K8c                 Z   0 nUb  [         R                  R                  S0 5      nUR                  U5        U Vs/ sH!  nU R                  R
                  " / UQUP76 PM#     nnU Vs/ sH  nSU R                  U-  -   PM     nnUR                  XS.5        [        S0 UD6$ s  snf s  snf )a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
r(      )num_image_tokensnum_image_patchesr   )r&   r.   getupdater2   get_number_of_image_patchesr8   r   )	rG   image_sizesrH   vision_datar(   
image_sizer   rX   r   s	            r#   _get_num_multimodal_tokens,InternVLProcessor._get_num_multimodal_tokens  s     "3==AA/SUVM  ( #.!"-J $$@@\*\m\"-  !
 ^oo]nkT%:%:[%H I]no4Dmn,,,!
  ps   'B#(B(c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r3   batch_decoderG   argsrH   s      r#   r   InternVLProcessor.batch_decode(  s    
 ~~**D;F;;r"   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r3   decoder   s      r#   r   InternVLProcessor.decode/  s    
 ~~$$d5f55r"   c                     U R                   R                  nU R                  R                  n[        U5      [        U5      -   $ N)r3   model_input_namesr2   r^   )rG   tokenizer_input_namesimage_processor_input_namess      r#   r   #InternVLProcessor.model_input_names6  s;     $ @ @&*&:&:&L&L#)*T2M-NNNr"   )	r<   r>   rD   r8   r@   rC   r;   r=   rA   )NNN   N)NNNNr   )!r   r   r   r   __doc__
attributesimage_processor_classvideo_processor_classtokenizer_classr    rF   r^   strr   ndarrayrr   r   r   r   r   r   r   r   r&   r   r   r   r   r   propertyr   r!   __classcell__)rI   s   @r#   r0   r0   /   sK   $ EJ00%O  #m
 m m*>M3i>M
  9>M  9>M $&::>M $&::>M  ZZ>MD (,hl'+ne$ne uY(94	?DQbLccdene
 $ne 01ne 
ne`-8<6 O Or"   r0   )typingr   r   numpyr   image_processing_utilsr   image_utilsr   r   r	   processing_utilsr
   r   r   r   r   tokenization_utils_baser   r   video_utilsr   r   r   r&   r0   __all__r   r"   r#   <module>r      sZ     #  2 Q Q f f C :<u .e KO KO\ 
r"   