
"""Processor class for Mllama."""

from typing import Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput


class MllamaImagesKwargs(ImagesKwargs, total=False):
    max_image_tiles: Optional[int]


class MllamaProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: MllamaImagesKwargs

    _defaults = {
        "image_kwargs": {
            "max_image_tiles": 4,
        },
    }


def get_cross_attention_token_mask(input_ids: list[int], image_token_id: int) -> list[list[int]]:
    """
    Generate a cross-attention token mask for image tokens in the input sequence.

    This function identifies the positions of image tokens in the input sequence and creates
    a mask that defines which subsequent tokens each image token should attend to.

    Args:
        input_ids (list[int]): A list of token ids representing the input sequence.
        image_token_id (int): The id of the token used to represent images in the sequence.

    Returns:
        list[list[int]]: A list of [start, end] pairs, where each pair represents the range
        of tokens an image token should attend to.

    Notes:
        - If no image tokens are present, an empty list is returned.
        - For a single image token, it attends to all subsequent tokens until the end of the sequence.
        - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
        - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
    """
    image_token_locations = [i for i, token in enumerate(input_ids) if token == image_token_id]

    if len(image_token_locations) == 0:
        return []

    # Only one image present: unmask until the end of the sequence.
    if len(image_token_locations) == 1:
        return [[image_token_locations[0], -1]]

    vision_masks = [[loc1, loc2] for loc1, loc2 in zip(image_token_locations[:-1], image_token_locations[1:])]

    # The last image attends to all subsequent text.
    vision_masks.append([image_token_locations[-1], len(input_ids)])

    # If there are two or more consecutive vision tokens, they form a group
    # and all attend to the same span of subsequent text.
    last_mask_end = vision_masks[-1][1]
    for vision_mask in vision_masks[::-1]:
        if vision_mask[0] == vision_mask[1] - 1:
            vision_mask[1] = last_mask_end
        last_mask_end = vision_mask[1]

    return vision_masks


def convert_sparse_cross_attention_mask_to_dense(
    cross_attention_token_mask: list[list[list[int]]],
    num_tiles: list[list[int]],
    max_num_tiles: int,
    length: int,
) -> np.ndarray:
    """
    Convert the cross attention mask indices to a cross attention mask 4D array.

    This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
    The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

    Args:
        cross_attention_token_mask (list[list[list[int]]]): A nested list structure where:
            - The outer list represents the batch dimension.
            - The middle list represents different images within each batch item.
            - The inner list contains pairs of integers [start, end] representing token ranges for each image.
        num_tiles (list[list[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
        max_num_tiles (int): The maximum possible number of tiles.
        length (int): The total sequence length of the input.

    Returns:
        np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
            The array contains `1` where attention is allowed and `0` where it is not.
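
    Example (an illustrative sketch): a batch with one sample whose single image spans tokens
    `[2, 5)` and uses 2 of at most 4 tiles:

        >>> mask = convert_sparse_cross_attention_mask_to_dense([[[2, 5]]], [[2]], max_num_tiles=4, length=6)
        >>> mask.shape
        (1, 6, 1, 4)
        >>> int(mask.sum())  # 3 attended positions x 2 tiles
        6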
    Note:
        - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
    """
    batch_size = len(cross_attention_token_mask)
    max_num_images = max([len(masks) for masks in cross_attention_token_mask])

    cross_attention_mask = np.zeros(
        shape=(batch_size, length, max_num_images, max_num_tiles),
        dtype=np.int64,
    )

    for sample_idx, (sample_masks, sample_num_tiles) in enumerate(zip(cross_attention_token_mask, num_tiles)):
        for mask_idx, (locations, mask_num_tiles) in enumerate(zip(sample_masks, sample_num_tiles)):
            if len(locations) == 2:
                start, end = locations
                end = min(end, length)
                if end == -1:
                    end = length
                cross_attention_mask[sample_idx, start:end, mask_idx, :mask_num_tiles] = 1

    return cross_attention_mask


def build_string_from_input(prompt: str, bos_token: str, image_token: str) -> str:
    """
    Builds a string from the input prompt by adding `bos_token` if not already present.

    Args:
        prompt (`str`):
            The input prompt string.
        bos_token (`str`):
            The beginning of sentence token to be added.
        image_token (`str`):
            The image token used to identify the start of an image sequence.

    Returns:
        str: The modified prompt string with the `bos_token` added if necessary.

    Examples:
        >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'

        >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><begin_of_text>Hello world'

        >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'
    """
    if bos_token in prompt:
        return prompt

    num_image_tokens_on_start = 0
    while prompt.startswith(image_token):
        prompt = prompt[len(image_token) :]
        num_image_tokens_on_start += 1

    return f"{image_token * num_image_tokens_on_start}{bos_token}{prompt}"


class MllamaProcessor(ProcessorMixin):
    r"""
    Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and
    [`PreTrainedTokenizerFast`] into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~MllamaProcessor.decode`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import MllamaProcessor
        from PIL import Image

        processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")

        processor(
            images=your_pil_image,
            text=["<|image|>If I had to write a haiku for this one"],
            images_kwargs={"size": {"height": 448, "width": 448}},
            text_kwargs={"padding": "right"},
            common_kwargs={"return_tensors": "pt"},
        )
        ```
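
    For chat-style inputs, the prompt string can also be produced with `apply_chat_template`
    first (an illustrative sketch; it assumes the checkpoint ships a chat template, and
    `messages` is a hypothetical conversation):
        ```python
        messages = [
            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        ```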

    Args:
        image_processor ([`MllamaImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "MllamaImageProcessor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, image_processor, tokenizer, chat_template=None):
        if not hasattr(tokenizer, "image_token"):
            self.image_token = "<|image|>"
            self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        else:
            self.image_token = tokenizer.image_token
            self.image_token_id = tokenizer.image_token_id
        self.python_token = "<|python_tag|>"
        self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token)
        self.bos_token = tokenizer.bos_token
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[MllamaProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` arguments to
        MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
        to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **aspect_ratio_ids** -- Tile aspect ratio ids for each image. Returned when `images` is not `None`.
            - **aspect_ratio_mask** -- Mask of valid tiles for each image. Returned when `images` is not `None`.
            - **cross_attention_mask** -- Mask aligning each text token with the image tiles it may attend to.
              Returned when both `text` and `images` are not `None`.
        """
        if text is None and images is None:
            raise ValueError("You must specify either text or images.")

        output_kwargs = self._merge_kwargs(
            MllamaProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        text_kwargs = output_kwargs["text_kwargs"]
        # Tensor conversion happens once at the end, after all modalities are merged.
        text_kwargs["return_tensors"] = None
        images_kwargs = output_kwargs["images_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        data = {}
        if text is not None:
            if isinstance(text, str):
                text = [text]
            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
            n_images_in_text = [t.count(self.image_token) for t in text]
            text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
            _ = text_kwargs.pop("padding_side", None)
            encoding = self.tokenizer(text, **text_kwargs)
            self._check_special_mm_tokens(text, encoding, modalities=["image"])
            n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]]
            data.update(encoding)

        n_images_in_images = [0]
        if images is not None:
            images = make_nested_list_of_images(images)
            n_images_in_images = [len(sample) for sample in images]

        if text is not None:
            if any(batch_img == 0 for batch_img in n_images_in_text) and not all(
                batch_img == 0 for batch_img in n_images_in_text
            ):
                raise ValueError(
                    "If a batch of text is provided, there should be either no images or at least one image per sample"
                )
            if len(n_images_in_text) > 0 and (
                n_images_in_images != n_images_in_text or n_images_in_ids != n_images_in_text
            ):
                if images is None:
                    raise ValueError("No image were provided, but there are image tokens in the prompt")
                add_message = ""
                if len(n_images_in_images) == len(n_images_in_text) and n_images_in_images != n_images_in_text:
                    add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch"
                elif n_images_in_ids != n_images_in_text:
                    add_message = (
                        "If you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped."
                    )
                raise ValueError(
                    f"The number of image tokens in each text ({n_images_in_text}) should be the same as the "
                    f"number of provided images per batch ({n_images_in_images}). {add_message}"
                )

        if images is not None:
            image_features = self.image_processor(images, **images_kwargs)
            num_tiles = image_features.pop("num_tiles")
            data.update(image_features)

        # Build the dense cross-attention mask only when both modalities are present.
        if images is not None and text is not None:
            cross_attention_token_mask = [
                get_cross_attention_token_mask(token_ids, self.image_token_id) for token_ids in encoding["input_ids"]
            ]
            cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
                cross_attention_token_mask,
                num_tiles=num_tiles,
                max_num_tiles=self.image_processor.max_image_tiles,
                length=max(len(input_ids) for input_ids in encoding["input_ids"]),
            )
            data["cross_attention_mask"] = cross_attention_mask

        return_tensors = common_kwargs.pop("return_tensors", None)
        batch_feature = BatchFeature(data=data, tensor_type=return_tensors)

        return batch_feature

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode` method.

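        Example (an illustrative sketch; the decoded text depends entirely on the model, and
        `model` and `inputs` are assumed to come from a prior processing/generation step):

            >>> generated = model.generate(**inputs)  # doctest: +SKIP
            >>> processor.post_process_image_text_to_text(generated)  # doctest: +SKIP
            ['If I had to write a haiku for this one, ...']
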
        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # `num_tiles` is consumed by the processor when building the cross-attention mask,
        # so it is not forwarded as a model input.
        image_processor_input_names = [name for name in image_processor_input_names if name != "num_tiles"]
        return list(tokenizer_input_names + image_processor_input_names + ["cross_attention_mask"])


__all__ = ["MllamaProcessor"]