
from typing import Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...video_utils import VideoInput


class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
    fps: Union[list[float], float]


class Glm4vImagesKwargs(ImagesKwargs):
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]


class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: Glm4vImagesKwargs
    videos_kwargs: Glm4vVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_mm_token_type_ids": False,
        },
    }


class Glm4vProcessor(ProcessorMixin):
    r"""
    Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor.
    See [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information.

    Args:
        image_processor ([`Glm4vImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizerFast`], *optional*):
            The tokenizer is a required input.
        video_processor ([`Glm4vVideoProcessor`], *optional*):
            The video processor is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
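
    Example (a minimal usage sketch; the checkpoint id below is illustrative, substitute any GLM-4V checkpoint you
    actually use, and note that `AutoProcessor.from_pretrained` downloads files from the Hub):

    ```python
    >>> import numpy as np
    >>> from PIL import Image
    >>> from transformers import AutoProcessor

    >>> processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")  # illustrative repo id

    >>> # A dummy black image stands in for real input data.
    >>> image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

    >>> # The prompt should contain the processor's image placeholder token; it is expanded into one
    >>> # token per merged image patch inside `__call__`.
    >>> inputs = processor(text=["Describe this image: <|image|>"], images=[image], return_tensors="pt")
    >>> sorted(inputs.keys())  # doctest: +SKIP
    ['attention_mask', 'image_grid_thw', 'input_ids', 'pixel_values']
    ```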
    """

    attributes = ["image_processor", "tokenizer", "video_processor"]
    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"
    tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
        # Fall back to the default GLM-4V placeholder tokens when the tokenizer does not define its own.
        self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
        self.image_token_id = (
            tokenizer.image_token_id
            if getattr(tokenizer, "image_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.image_token)
        )
        self.video_token_id = (
            tokenizer.video_token_id
            if getattr(tokenizer, "video_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.video_token)
        )

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        videos: VideoInput = None,
        **kwargs: Unpack[Glm4vProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Glm4vProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]
        else:
            image_inputs = {}
            image_grid_thw = None

        if videos is not None:
            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            timestamps = videos_inputs.pop("timestamps")
            video_grid_thw = videos_inputs["video_grid_thw"]
        else:
            videos_inputs = {}
            timestamps = []
            video_grid_thw = None

        if not isinstance(text, list):
            text = [text]

        text = text.copy()  # avoid mutating the caller's list in place

        if image_grid_thw is not None:
            # Expand every image placeholder into one token per merged patch, using a temporary marker so
            # already-expanded regions are not matched again.
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    num_image_tokens = image_grid_thw[index].prod() // merge_length
                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if video_grid_thw is not None:
            merge_length = self.video_processor.merge_size**2
            video_index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    num_frames = video_grid_thw[video_index][0]
                    video_structure = ""

                    # The video processor may return timestamps as an array, a nested list, or a flat list.
                    if hasattr(timestamps, "tolist"):
                        timestamps_list = timestamps.tolist()[0]
                    else:
                        timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps

                    unique_timestamps = []
                    for idx in range(0, len(timestamps_list)):
                        unique_timestamps.append(timestamps_list[idx])

                    selected_timestamps = unique_timestamps[:num_frames]
                    while len(selected_timestamps) < num_frames:
                        selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)

                    # Each frame contributes an image span followed by its timestamp.
                    for frame_idx in range(num_frames):
                        timestamp_sec = selected_timestamps[frame_idx]
                        frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
                        video_structure += frame_structure

                    text[i] = text[i].replace(self.video_token, video_structure, 1)
                    num_image_tokens = (
                        video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
                    )
                    for frame_idx in range(num_frames):
                        if self.image_token in text[i]:
                            text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)

                    video_index += 1

                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

        if return_mm_token_type_ids:
            array_ids = np.array(text_inputs["input_ids"])
            mm_token_type_ids = np.zeros_like(array_ids)
            mm_token_type_ids[array_ids == self.image_token_id] = 1
            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)

    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
            video_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (num_frames, height, width) per each video.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
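
        Example (a hypothetical call, assuming `processor` is an already-loaded `Glm4vProcessor` and that the
        returned `MultiModalData` exposes the counts as attributes; the size below is illustrative):

        ```python
        >>> mm_data = processor._get_num_multimodal_tokens(image_sizes=[[1024, 768]])
        >>> mm_data.num_image_tokens  # one entry per provided image  # doctest: +SKIP
        ```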
        """
        vision_data = {}
        if image_sizes is not None:
            images_kwargs = Glm4vProcessorKwargs._defaults.get("images_kwargs", {})
            images_kwargs.update(kwargs)
            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size

            # Token count per image is the patch count reduced by the spatial merge factor.
            num_image_patches = [
                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                for image_size in image_sizes
            ]
            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

        if video_sizes is not None:
            videos_kwargs = Glm4vProcessorKwargs._defaults.get("videos_kwargs", {})
            videos_kwargs.update(kwargs)
            merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size

            num_video_patches = [
                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
                for video_size in video_sizes
            ]
            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
            vision_data["num_video_tokens"] = num_video_tokens

        return MultiModalData(**vision_data)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape
                `(batch_size, sequence_length)` or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's
                `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's
                `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode` method.

        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
        return names_from_processor + ["second_per_grid_ts"]


__all__ = ["Glm4vProcessor"]