
    <h4                         S r SSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJrJrJrJr  SSKJrJr  SS	KJrJr  \" 5       (       a  S
SKJr  \R0                  " \5      r " S S\SS9rS\4S jrS r " S S\5      rS/r g)z
Processor class for Pixtral.
    )UnionN   )BatchFeature)
ImageInputis_valid_image
load_image)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)is_vision_availablelogging   )get_resize_output_image_sizec                   *    \ rS rSrSSS.0 SS0S.rSrg)	PixtralProcessorKwargs*   F)paddingreturn_mm_token_type_idsreturn_tensorspt)text_kwargsimages_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       f/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/pixtral/processing_pixtral.pyr   r   *   s%     (-
 d
	Ir$   r   F)totalreturnc                 R    [        U [        5      =(       a    U R                  S5      $ )Nhttp)
isinstancestr
startswith)vals    r%   is_urlr.   8   s    c3:CNN6$::r$   c                 <    [        U 5      =(       d    [        U 5      $ N)r.   r   )elems    r%   is_image_or_image_urlr2   =   s    $</>$//r$   c            
          ^  \ rS rSrSrSS/rSrSr        SS\S\4U 4S	 jjjr	    SS
\
S\\\\\   \\   4   S\\   S\4S jjrSS jrS rS r\S 5       rSrU =r$ )PixtralProcessorA   a  
Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor.

[`PixtralProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`~PixtralProcessor.__call__`] and [`~PixtralProcessor.decode`] for more information.

Args:
    image_processor ([`PixtralImageProcessor`], *optional*):
        The image processor is a required input.
    tokenizer ([`LlamaTokenizerFast`], *optional*):
        The tokenizer is a required input.
    patch_size (`int`, *optional*, defaults to 16):
        Patch size from the vision tower.
    spatial_merge_size (`int`, *optional*, defaults to 1):
        The downsampling factor for the spatial merge operation.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
    image_token (`str`, *optional*, defaults to `"[IMG]"`):
        Special token used to denote image location.
    image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`):
        Special token used to denote the end of a line of pixels in an image.
    image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`):
        Special token used to denote the end of an image input.
image_processor	tokenizerAutoImageProcessorAutoTokenizer
patch_sizespatial_merge_sizec	                   > X0l         X@l        X`l        UR                  U R                  5      U l        Xpl        Xl        UR                  U R                  5      U l        UR                  U R
                  5      U l        UR                  U R                  5      U l        U R                  U R                  U R                  /U l	        [        T
U ]-  XUS9  g )N)chat_template)r:   r;   image_tokenconvert_tokens_to_idsimage_token_idimage_break_tokenimage_end_tokenimage_break_token_idimage_end_token_id	image_idssuper__init__)selfr6   r7   r:   r;   r=   r>   rA   rB   kwargs	__class__s             r%   rG   PixtralProcessor.__init___   s     %"4&'==d>N>NO!2.'==d>N>NO$-$C$CDDZDZ$[!"+"A"A$BVBV"W--t/H/H$JaJab=Qr$   imagestextrI   r'   c                 ~   U R                   " [        4SU R                  R                  0UD6nU R                  U R
                  -  nUGb  [        U5      (       a  U/nO[        U[        [        45      (       a  [        US   5      (       a  Ou[        U[        [        45      (       aO  [        US   [        [        45      (       a1  [        US   S   5      (       a  U VV	s/ sH  o H  oPM     M     nnn	O[        S5      eU V
s/ sH%  n
[        U
[        5      (       a  [        U
5      OU
PM'     nn
U R                  " U4SU0US   D6nO0 n[        U[        5      (       a  U/nO8[        U[        5      (       d#  [        US   [        5      (       d  [        S5      eUnUR                  S5      Gb%  [!        US	   5      n/ n/ nU GH  nU R"                  U;   a  [%        U5      u  nnUU-  nUU-  nU R"                  /U-  U R&                  /-   /U-  nU VVs/ sH  o H  nUPM     M     nnnU R(                  US
'   SR+                  U5      nUR-                  U5        UR/                  U R"                  SS5      nU R"                  U;   a  M  SU;   a,  UR1                  S5      nUR/                  SUS5      nSU;   a  M,  UR-                  U5        GM     US   R1                  SS5      nUS   R1                  SS5      nU R                  " U40 US   DSS0D6nU R3                  UUS/S9  U(       ai  [4        R6                  " US   5      n[4        R8                  " US   5      nSU[4        R:                  " UU R<                  5      '   UR?                  5       US'   [A        0 UEUEUS9$ s  sn	nf s  sn
f s  snnf )a  
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
    `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
tokenizer_init_kwargsNr   zdInvalid input images. Please provide a single image, a list of images, or a list of lists of images.r:   r   zAInvalid input text. Please provide a string, or a list of stringspixel_valuesimage_sizes z<placeholder>r   r   r   r   Fimage)
modalities	input_idsmm_token_type_ids)datatensor_type)!_merge_kwargsr   r7   init_kwargsr:   r;   r2   r*   listtuple
ValueErrorr+   r   r6   	TypeErrorgetiterr>   nextrA   rB   joinappendreplacepop_check_special_mm_tokensnparray
zeros_likeisinrE   tolistr   )rH   rL   rM   audiovideosrI   output_kwargsr:   sublistrT   imimage_inputsprompt_stringsrQ   replace_stringssampleheightwidthnum_height_tokensnum_width_tokensreplace_tokensitemreplace_strr   r   text_inputs	array_idsrW   s                               r%   __call__PixtralProcessor.__call__w   s   R **"
"&.."<"<
 
 __t'>'>>
$V,, FT5M227LVTUY7W7W6D%=11vay4-88)&)A,77/5KvG7%%7%vK z  OUUf
2s(;(;jnCfFU//p:pQ^_nQopLLdC  6DD$''
47C0H0H_`` N+7|M:;KN O&&&0$($5MFE(.*(<%',
':$))*-==AWAW@XX&)&*N ;I%].wU\TdU\d.N%])-)=)=N2&"$''."9K#**;7#^^D,<,<oqQF &&&0 &/"1"5"5a"8K#^^O[!LF &/ %%f-% ( '}599:JDQ#0#?#C#CD^`e#f nn^i}]7Sidhi%%nkwi%X#[!9:I "k+.F GDEbggi@A/@/G/G/IK+,!@K!@<!@n]]m L
 V6 &^s   "N.+N4N9c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU R                  U R                  -  n/ nU HP  u  p[        [        R                  " XS45      US   US   4Xf4S9u  pX-  nX-  nUR                  US-   U-  5        MR     S/[        U5      -  nUR                  X~S.5        [        S	0 UD6$ )
a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr   sizer   longest_edge)r   r:   r   )num_image_tokensnum_image_patchesr   )r   r"   r`   updater6   r   r:   r;   r   rh   zerosrd   lenr	   )rH   rQ   rI   vision_datar   r   r:   r   rv   rw   resized_heightresized_widthrx   ry   r   s                  r%   _get_num_multimodal_tokens+PixtralProcessor._get_num_multimodal_tokens   s    "2<<@@RTUM  ( $$VT2Od6J6J6O6OD4+B+BBJ!!,0LHHfQ/0~.^0DE *71-
 %3$@!#0#>  '')9A)=AR(RS "- "#c+&6 64Dmn,,,r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r7   batch_decoderH   argsrI   s      r%   r   PixtralProcessor.batch_decode  s    
 ~~**D;F;;r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r7   decoder   s      r%   r   PixtralProcessor.decode  s    
 ~~$$d5f55r$   c                     U R                   R                  nU R                  R                  n[        [        R                  X-   5      5      $ r0   )r7   model_input_namesr6   r\   dictfromkeys)rH   tokenizer_input_namesimage_processor_input_namess      r%   r   "PixtralProcessor.model_input_names  s>     !% @ @&*&:&:&L&L#DMM"7"UVWWr$   )	rA   rC   rB   rD   rE   r>   r@   r:   r;   )NN   r   Nz[IMG]z[IMG_BREAK]z	[IMG_END])NNNNr0   )r   r   r    r!   __doc__
attributesimage_processor_classtokenizer_classintrG   r   r   r   r   r\   r   r   r   r   r   r   r   propertyr   r#   __classcell__)rJ   s   @r%   r4   r4   A   s    2 $[1J0%O "#'#R 	R
  R R4 "^bq^q^ I0$y/4HYCZZ[q^ /0q^ 
q^f"-J<6 X Xr$   r4   )!r   typingr   numpyrh   feature_extraction_utilsr   image_utilsr   r   r   processing_utilsr	   r
   r   r   tokenization_utils_baser   r   utilsr   r   image_processing_pixtralr   
get_loggerr   loggerr   boolr.   r2   r4   __all__r   r$   r%   <module>r      s      4 A A  D 1 F 
		H	%
-U 
;4 ;
0bX~ bXJ 
r$   