
from typing import Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...video_utils import VideoInput


class Glm4vVideosProcessorKwargs(VideosKwargs, total=False):
    fps: Union[list[float], float]


class Glm4vImagesKwargs(ImagesKwargs):
    patch_size: Optional[int]
    temporal_patch_size: Optional[int]
    merge_size: Optional[int]


class Glm4vProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: Glm4vImagesKwargs
    videos_kwargs: Glm4vVideosProcessorKwargs
    _defaults = {
        "text_kwargs": {
            "padding": False,
            "return_mm_token_type_ids": False,
        },
    }


class Glm4vProcessor(ProcessorMixin):
    r"""
    Constructs a GLM-4V processor which wraps a GLM-4V image processor and a GLM-4 tokenizer into a single processor.
    See [`~Glm4vProcessor.__call__`] and [`~Glm4vProcessor.decode`] for more information.

    Args:
        image_processor ([`Glm4vImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizerFast`], *optional*):
            The tokenizer is a required input.
        video_processor ([`Glm4vVideoProcessor`], *optional*):
            The video processor is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
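
    Example (a minimal usage sketch; the checkpoint id below is illustrative, substitute any GLM-4V checkpoint you
    actually use, and note that `AutoProcessor.from_pretrained` downloads files from the Hub):

    ```python
    >>> import numpy as np
    >>> from PIL import Image
    >>> from transformers import AutoProcessor

    >>> processor = AutoProcessor.from_pretrained("THUDM/GLM-4.1V-9B-Thinking")  # illustrative repo id

    >>> # A dummy black image stands in for real input data.
    >>> image = Image.fromarray(np.zeros((224, 224, 3), dtype=np.uint8))

    >>> # The prompt should contain the processor's image placeholder token; it is expanded into one
    >>> # token per merged image patch inside `__call__`.
    >>> inputs = processor(text=["Describe this image: <|image|>"], images=[image], return_tensors="pt")
    >>> sorted(inputs.keys())  # doctest: +SKIP
    ['attention_mask', 'image_grid_thw', 'input_ids', 'pixel_values']
    ```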
    """

    attributes = ["image_processor", "tokenizer", "video_processor"]
    image_processor_class = "AutoImageProcessor"
    video_processor_class = "AutoVideoProcessor"
    tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")

    def __init__(self, image_processor=None, tokenizer=None, video_processor=None, chat_template=None, **kwargs):
        super().__init__(image_processor, tokenizer, video_processor, chat_template=chat_template)
        # Fall back to the default GLM-4V placeholder tokens when the tokenizer does not define its own.
        self.image_token = "<|image|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
        self.image_token_id = (
            tokenizer.image_token_id
            if getattr(tokenizer, "image_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.image_token)
        )
        self.video_token_id = (
            tokenizer.video_token_id
            if getattr(tokenizer, "video_token_id", None)
            else tokenizer.convert_tokens_to_ids(self.video_token)
        )

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        videos: VideoInput = None,
        **kwargs: Unpack[Glm4vProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            videos (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The video or batch of videos to be prepared. Each video can be a 4D NumPy array or PyTorch
                tensor, or a nested list of 3D frames. Both channels-first and channels-last formats are supported.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **pixel_values_videos** -- Pixel values of videos to be fed to a model. Returned when `videos` is not `None`.
            - **image_grid_thw** -- List of image 3D grid in LLM. Returned when `images` is not `None`.
            - **video_grid_thw** -- List of video 3D grid in LLM. Returned when `videos` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            Glm4vProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
            image_grid_thw = image_inputs["image_grid_thw"]
        else:
            image_inputs = {}
            image_grid_thw = None

        if videos is not None:
            videos_inputs = self.video_processor(videos=videos, **output_kwargs["videos_kwargs"])
            timestamps = videos_inputs.pop("timestamps")
            video_grid_thw = videos_inputs["video_grid_thw"]
        else:
            videos_inputs = {}
            timestamps = []
            video_grid_thw = None

        if not isinstance(text, list):
            text = [text]

        text = text.copy()  # avoid mutating the caller's list in place

        if image_grid_thw is not None:
            # Expand every image placeholder into one token per merged patch, using a temporary marker so
            # already-expanded regions are not matched again.
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    num_image_tokens = image_grid_thw[index].prod() // merge_length
                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        if video_grid_thw is not None:
            merge_length = self.video_processor.merge_size**2
            video_index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
                    num_frames = video_grid_thw[video_index][0]
                    video_structure = ""

                    # The video processor may return timestamps as an array, a nested list, or a flat list.
                    if hasattr(timestamps, "tolist"):
                        timestamps_list = timestamps.tolist()[0]
                    else:
                        timestamps_list = timestamps[0] if isinstance(timestamps[0], list) else timestamps

                    unique_timestamps = []
                    for idx in range(0, len(timestamps_list)):
                        unique_timestamps.append(timestamps_list[idx])

                    selected_timestamps = unique_timestamps[:num_frames]
                    while len(selected_timestamps) < num_frames:
                        selected_timestamps.append(selected_timestamps[-1] if selected_timestamps else 0)

                    # Each frame contributes an image span followed by its timestamp.
                    for frame_idx in range(num_frames):
                        timestamp_sec = selected_timestamps[frame_idx]
                        frame_structure = f"<|begin_of_image|>{self.image_token}<|end_of_image|>{timestamp_sec}"
                        video_structure += frame_structure

                    text[i] = text[i].replace(self.video_token, video_structure, 1)
                    num_image_tokens = (
                        video_grid_thw[video_index].prod() // merge_length // video_grid_thw[video_index][0]
                    )
                    for frame_idx in range(num_frames):
                        if self.image_token in text[i]:
                            text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)

                    video_index += 1

                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])

        if return_mm_token_type_ids:
            array_ids = np.array(text_inputs["input_ids"])
            mm_token_type_ids = np.zeros_like(array_ids)
            mm_token_type_ids[array_ids == self.image_token_id] = 1
            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()

        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)

    def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
        """
        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

        Args:
            image_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (height, width) per each image.
            video_sizes (`list[list[int]]`, *optional*):
                The input sizes formatted as (num_frames, height, width) per each video.

        Returns:
            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
            input modalities, along with other useful data.
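
        Example (a hypothetical call, assuming `processor` is an already-loaded `Glm4vProcessor` and that the
        returned `MultiModalData` exposes the counts as attributes; the size below is illustrative):

        ```python
        >>> mm_data = processor._get_num_multimodal_tokens(image_sizes=[[1024, 768]])
        >>> mm_data.num_image_tokens  # one entry per provided image  # doctest: +SKIP
        ```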
        """
        vision_data = {}
        if image_sizes is not None:
            images_kwargs = Glm4vProcessorKwargs._defaults.get("images_kwargs", {})
            images_kwargs.update(kwargs)
            merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size

            # Token count per image is the patch count reduced by the spatial merge factor.
            num_image_patches = [
                self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
                for image_size in image_sizes
            ]
            num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})

        if video_sizes is not None:
            videos_kwargs = Glm4vProcessorKwargs._defaults.get("videos_kwargs", {})
            videos_kwargs.update(kwargs)
            merge_size = videos_kwargs.get("merge_size", None) or self.video_processor.merge_size

            num_video_patches = [
                self.video_processor.get_number_of_video_patches(*video_size, videos_kwargs)
                for video_size in video_sizes
            ]
            num_video_tokens = [(num_patches // merge_size**2) for num_patches in num_video_patches]
            vision_data["num_video_tokens"] = num_video_tokens

        return MultiModalData(**vision_data)

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape
                `(batch_size, sequence_length)` or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's
                `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's
                `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode` method.

        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        names_from_processor = list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
        return names_from_processor + ["second_per_grid_ts"]


__all__ = ["Glm4vProcessor"]