
"""Processor class for Mllama."""

from typing import Optional, Union

import numpy as np

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, make_nested_list_of_images
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput


class MllamaImagesKwargs(ImagesKwargs, total=False):
    max_image_tiles: Optional[int]


class MllamaProcessorKwargs(ProcessingKwargs, total=False):
    images_kwargs: MllamaImagesKwargs

    _defaults = {
        "image_kwargs": {
            "max_image_tiles": 4,
        },
    }


def get_cross_attention_token_mask(input_ids: list[int], image_token_id: int) -> list[list[int]]:
    """
    Generate a cross-attention token mask for image tokens in the input sequence.

    This function identifies the positions of image tokens in the input sequence and creates
    a mask that defines which subsequent tokens each image token should attend to.

    Args:
        input_ids (list[int]): A list of token ids representing the input sequence.
        image_token_id (int): The id of the token used to represent images in the sequence.

    Returns:
        list[list[int]]: A list of [start, end] pairs, where each pair represents the range
        of tokens an image token should attend to.

    Notes:
        - If no image tokens are present, an empty list is returned.
        - For a single image token, it attends to all subsequent tokens until the end of the sequence.
        - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
        - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
    """
    image_token_locations = [i for i, token in enumerate(input_ids) if token == image_token_id]

    if len(image_token_locations) == 0:
        return []

    # Only one image present: unmask until the end of the sequence.
    if len(image_token_locations) == 1:
        return [[image_token_locations[0], -1]]

    vision_masks = [[loc1, loc2] for loc1, loc2 in zip(image_token_locations[:-1], image_token_locations[1:])]

    # The last image attends to all subsequent text.
    vision_masks.append([image_token_locations[-1], len(input_ids)])

    # If there are two or more consecutive vision tokens, they form a group
    # and all attend to the same span of subsequent text.
    last_mask_end = vision_masks[-1][1]
    for vision_mask in vision_masks[::-1]:
        if vision_mask[0] == vision_mask[1] - 1:
            vision_mask[1] = last_mask_end
        last_mask_end = vision_mask[1]

    return vision_masks


def convert_sparse_cross_attention_mask_to_dense(
    cross_attention_token_mask: list[list[list[int]]],
    num_tiles: list[list[int]],
    max_num_tiles: int,
    length: int,
) -> np.ndarray:
    """
    Convert the cross attention mask indices to a cross attention mask 4D array.

    This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
    The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.

    Args:
        cross_attention_token_mask (list[list[list[int]]]): A nested list structure where:
            - The outer list represents the batch dimension.
            - The middle list represents different images within each batch item.
            - The inner list contains pairs of integers [start, end] representing token ranges for each image.
        num_tiles (list[list[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
        max_num_tiles (int): The maximum possible number of tiles.
        length (int): The total sequence length of the input.

    Returns:
        np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles)
            The array contains `1` where attention is allowed and `0` where it is not.
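
    Example (an illustrative sketch): a batch with one sample whose single image spans tokens
    `[2, 5)` and uses 2 of at most 4 tiles:

        >>> mask = convert_sparse_cross_attention_mask_to_dense([[[2, 5]]], [[2]], max_num_tiles=4, length=6)
        >>> mask.shape
        (1, 6, 1, 4)
        >>> int(mask.sum())  # 3 attended positions x 2 tiles
        6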
    Note:
        - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
    """
    batch_size = len(cross_attention_token_mask)
    max_num_images = max([len(masks) for masks in cross_attention_token_mask])

    cross_attention_mask = np.zeros(
        shape=(batch_size, length, max_num_images, max_num_tiles),
        dtype=np.int64,
    )

    for sample_idx, (sample_masks, sample_num_tiles) in enumerate(zip(cross_attention_token_mask, num_tiles)):
        for mask_idx, (locations, mask_num_tiles) in enumerate(zip(sample_masks, sample_num_tiles)):
            if len(locations) == 2:
                start, end = locations
                end = min(end, length)
                if end == -1:
                    end = length
                cross_attention_mask[sample_idx, start:end, mask_idx, :mask_num_tiles] = 1

    return cross_attention_mask


def build_string_from_input(prompt: str, bos_token: str, image_token: str) -> str:
    """
    Builds a string from the input prompt by adding `bos_token` if not already present.

    Args:
        prompt (`str`):
            The input prompt string.
        bos_token (`str`):
            The beginning of sentence token to be added.
        image_token (`str`):
            The image token used to identify the start of an image sequence.

    Returns:
        str: The modified prompt string with the `bos_token` added if necessary.

    Examples:
        >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'

        >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
        '<|image|><begin_of_text>Hello world'

        >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
        '<begin_of_text>Hello world'
    """
    if bos_token in prompt:
        return prompt

    num_image_tokens_on_start = 0
    while prompt.startswith(image_token):
        prompt = prompt[len(image_token) :]
        num_image_tokens_on_start += 1

    return f"{image_token * num_image_tokens_on_start}{bos_token}{prompt}"


class MllamaProcessor(ProcessorMixin):
    r"""
    Constructs a Mllama processor which wraps [`MllamaImageProcessor`] and
    [`PreTrainedTokenizerFast`] into a single processor that inherits both the image processor and
    tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~MllamaProcessor.decode`] for more
    information.
    The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
        ```python
        from transformers import MllamaProcessor
        from PIL import Image

        processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")

        processor(
            images=your_pil_image,
            text=["<|image|>If I had to write a haiku for this one"],
            images_kwargs={"size": {"height": 448, "width": 448}},
            text_kwargs={"padding": "right"},
            common_kwargs={"return_tensors": "pt"},
        )
        ```
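
    For chat-style inputs, the prompt string can also be produced with `apply_chat_template`
    first (an illustrative sketch; it assumes the checkpoint ships a chat template, and
    `messages` is a hypothetical conversation):
        ```python
        messages = [
            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Describe this image."}]}
        ]
        prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
        ```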

    Args:
        image_processor ([`MllamaImageProcessor`]):
            The image processor is a required input.
        tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "MllamaImageProcessor"
    tokenizer_class = "PreTrainedTokenizerFast"

    def __init__(self, image_processor, tokenizer, chat_template=None):
        if not hasattr(tokenizer, "image_token"):
            self.image_token = "<|image|>"
            self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        else:
            self.image_token = tokenizer.image_token
            self.image_token_id = tokenizer.image_token_id
        self.python_token = "<|python_tag|>"
        self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token)
        self.bos_token = tokenizer.bos_token
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: Optional[ImageInput] = None,
        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[MllamaProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
        arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` arguments to
        MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
        to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                    - `'tf'`: Return TensorFlow `tf.constant` objects.
                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
                    - `'np'`: Return NumPy `np.ndarray` objects.
                    - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **aspect_ratio_ids** -- Tile aspect ratio ids for each image. Returned when `images` is not `None`.
            - **aspect_ratio_mask** -- Mask of valid tiles for each image. Returned when `images` is not `None`.
            - **cross_attention_mask** -- Mask aligning each text token with the image tiles it may attend to.
              Returned when both `text` and `images` are not `None`.
        """
        if text is None and images is None:
            raise ValueError("You must specify either text or images.")

        output_kwargs = self._merge_kwargs(
            MllamaProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )

        text_kwargs = output_kwargs["text_kwargs"]
        # Tensor conversion happens once at the end, after all modalities are merged.
        text_kwargs["return_tensors"] = None
        images_kwargs = output_kwargs["images_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        data = {}
        if text is not None:
            if isinstance(text, str):
                text = [text]
            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
            n_images_in_text = [t.count(self.image_token) for t in text]
            text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
            _ = text_kwargs.pop("padding_side", None)
            encoding = self.tokenizer(text, **text_kwargs)
            self._check_special_mm_tokens(text, encoding, modalities=["image"])
            n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]]
            data.update(encoding)

        n_images_in_images = [0]
        if images is not None:
            images = make_nested_list_of_images(images)
            n_images_in_images = [len(sample) for sample in images]

        if text is not None:
            if any(batch_img == 0 for batch_img in n_images_in_text) and not all(
                batch_img == 0 for batch_img in n_images_in_text
            ):
                raise ValueError(
                    "If a batch of text is provided, there should be either no images or at least one image per sample"
                )
            if len(n_images_in_text) > 0 and (
                n_images_in_images != n_images_in_text or n_images_in_ids != n_images_in_text
            ):
                if images is None:
                    raise ValueError("No image were provided, but there are image tokens in the prompt")
                add_message = ""
                if len(n_images_in_images) == len(n_images_in_text) and n_images_in_images != n_images_in_text:
                    add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch"
                elif n_images_in_ids != n_images_in_text:
                    add_message = (
                        "If you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped."
                    )
                raise ValueError(
                    f"The number of image tokens in each text ({n_images_in_text}) should be the same as the "
                    f"number of provided images per batch ({n_images_in_images}). {add_message}"
                )

        if images is not None:
            image_features = self.image_processor(images, **images_kwargs)
            num_tiles = image_features.pop("num_tiles")
            data.update(image_features)

        # Build the dense cross-attention mask only when both modalities are present.
        if images is not None and text is not None:
            cross_attention_token_mask = [
                get_cross_attention_token_mask(token_ids, self.image_token_id) for token_ids in encoding["input_ids"]
            ]
            cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
                cross_attention_token_mask,
                num_tiles=num_tiles,
                max_num_tiles=self.image_processor.max_image_tiles,
                length=max(len(input_ids) for input_ids in encoding["input_ids"]),
            )
            data["cross_attention_mask"] = cross_attention_mask

        return_tensors = common_kwargs.pop("return_tensors", None)
        batch_feature = BatchFeature(data=data, tensor_type=return_tensors)

        return batch_feature

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    def post_process_image_text_to_text(
        self, generated_outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False, **kwargs
    ):
        """
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode` method.

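        Example (an illustrative sketch; the decoded text depends entirely on the model, and
        `model` and `inputs` are assumed to come from a prior processing/generation step):

            >>> generated = model.generate(**inputs)  # doctest: +SKIP
            >>> processor.post_process_image_text_to_text(generated)  # doctest: +SKIP
            ['If I had to write a haiku for this one, ...']
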
        Returns:
            `list[str]`: The decoded text.
        """
        return self.tokenizer.batch_decode(
            generated_outputs,
            skip_special_tokens=skip_special_tokens,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        # `num_tiles` is consumed by the processor when building the cross-attention mask,
        # so it is not forwarded as a model input.
        image_processor_input_names = [name for name in image_processor_input_names if name != "num_tiles"]
        return list(tokenizer_input_names + image_processor_input_names + ["cross_attention_mask"])


__all__ = ["MllamaProcessor"]