ó
    <±hà4  ã                   ó   • S r SSKJr  SSKrSSKJr  SSKJrJ	r	J
r
  SSKJrJrJrJr  SSKJrJr  SS	KJrJr  \" 5       (       a  S
SKJr  \R0                  " \5      r " S S\SS9rS\4S jrS r " S S\5      rS/r g)z
Processor class for Pixtral.
é    )ÚUnionNé   )ÚBatchFeature)Ú
ImageInputÚis_valid_imageÚ
load_image)ÚMultiModalDataÚProcessingKwargsÚProcessorMixinÚUnpack)ÚPreTokenizedInputÚ	TextInput)Úis_vision_availableÚloggingé   )Úget_resize_output_image_sizec                   ó*   • \ rS rSrSSS.0 SS0S.rSrg)	ÚPixtralProcessorKwargsé*   F)ÚpaddingÚreturn_mm_token_type_idsÚreturn_tensorsÚpt)Útext_kwargsÚimages_kwargsÚcommon_kwargs© N)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú	_defaultsÚ__static_attributes__r   ó    Úf/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/pixtral/processing_pixtral.pyr   r   *   s%   † ð Ø(-ñ
ð à˜dð
ñ	ƒIr$   r   F)ÚtotalÚreturnc                 óR   • [        U [        5      =(       a    U R                  S5      $ )NÚhttp)Ú
isinstanceÚstrÚ
startswith)Úvals    r%   Úis_urlr.   8   s   € Ücœ3Ó×: C§N¡N°6Ó$:Ð:r$   c                 ó<   • [        U 5      =(       d    [        U 5      $ ©N)r.   r   )Úelems    r%   Úis_image_or_image_urlr2   =   s   € Ü$‹<×/œ>¨$Ó/Ð/r$   c            
       óÈ   ^ • \ rS rSrSrSS/rSrSr        SS\S\4U 4S	 jjjr	    SS
\
S\\\\\   \\   4   S\\   S\4S jjrSS jrS rS r\S 5       rSrU =r$ )ÚPixtralProcessoréA   a  
Constructs a Pixtral processor which wraps a Pixtral image processor and a Pixtral tokenizer into a single processor.

[`PixtralProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`~PixtralProcessor.__call__`] and [`~PixtralProcessor.decode`] for more information.

Args:
    image_processor ([`PixtralImageProcessor`], *optional*):
        The image processor is a required input.
    tokenizer ([`LlamaTokenizerFast`], *optional*):
        The tokenizer is a required input.
    patch_size (`int`, *optional*, defaults to 16):
        Patch size from the vision tower.
    spatial_merge_size (`int`, *optional*, defaults to 1):
        The downsampling factor for the spatial merge operation.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
    image_token (`str`, *optional*, defaults to `"[IMG]"`):
        Special token used to denote image location.
    image_break_token (`str`, *optional*, defaults to `"[IMG_BREAK]"`):
        Special token used to denote the end of a line of pixels in an image.
    image_end_token (`str`, *optional*, defaults to `"[IMG_END]"`):
        Special token used to denote the end of an image input.
Úimage_processorÚ	tokenizerÚAutoImageProcessorÚAutoTokenizerÚ
patch_sizeÚspatial_merge_sizec	                 ó®  >• X0l         X@l        X`l        UR                  U R                  5      U l        Xpl        X€l        UR                  U R                  5      U l        UR                  U R
                  5      U l        UR                  U R                  5      U l        U R                  U R                  U R                  /U l	        [        T
U ]-  XUS9  g )N)Úchat_template)r:   r;   Úimage_tokenÚconvert_tokens_to_idsÚimage_token_idÚimage_break_tokenÚimage_end_tokenÚimage_break_token_idÚimage_end_token_idÚ	image_idsÚsuperÚ__init__)Úselfr6   r7   r:   r;   r=   r>   rA   rB   ÚkwargsÚ	__class__s             €r%   rG   ÚPixtralProcessor.__init___   s¸   ø€ ð %ŒØ"4ÔØ&ÔØ'×=Ñ=¸d×>NÑ>NÓOˆÔØ!2ÔØ.ÔØ'×=Ñ=¸d×>NÑ>NÓOˆÔØ$-×$CÑ$CÀD×DZÑDZÓ$[ˆÔ!Ø"+×"AÑ"AÀ$×BVÑBVÓ"WˆÔØ×-Ñ-¨t×/HÑ/HÈ$×JaÑJaÐbˆŒÜ‰Ñ˜À=ÐÒQr$   ÚimagesÚtextrI   r'   c                 ó~  • U R                   " [        4SU R                  R                  0UD6nU R                  U R
                  -  nUGb  [        U5      (       a  U/nO¤[        U[        [        45      (       a  [        US   5      (       a  Ou[        U[        [        45      (       aO  [        US   [        [        45      (       a1  [        US   S   5      (       a  U VV	s/ sH  oˆ H  o™PM     M     nnn	O[        S5      eU V
s/ sH%  n
[        U
[        5      (       a  [        U
5      OU
PM'     nn
U R                  " U4SU0US   D6nO0 n[        U[        5      (       a  U/nO8[        U[        5      (       d#  [        US   [        5      (       d  [        S5      eUnUR                  S5      Gb%  [!        US	   5      n/ n/ nU GH  nU R"                  U;   aµ  [%        U5      u  nnUU-  nUU-  nU R"                  /U-  U R&                  /-   /U-  nU VVs/ sH  oˆ H  nUPM     M     nnnU R(                  US
'   SR+                  U5      nUR-                  U5        UR/                  U R"                  SS5      nU R"                  U;   a  Mµ  SU;   a,  UR1                  S5      nUR/                  SUS5      nSU;   a  M,  UR-                  U5        GM     US   R1                  SS5      nUS   R1                  SS5      nU R                  " U40 US   DSS0D6nU R3                  UUS/S9  U(       ai  [4        R6                  " US   5      n[4        R8                  " US   5      nSU[4        R:                  " UU R<                  5      '   UR?                  5       US'   [A        0 UEUEUS9$ s  sn	nf s  sn
f s  snnf )aö  
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
of the above two methods for more information.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
    `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
Útokenizer_init_kwargsNr   zdInvalid input images. Please provide a single image, a list of images, or a list of lists of images.r:   r   zAInvalid input text. Please provide a string, or a list of stringsÚpixel_valuesÚimage_sizeséÿÿÿÿÚ z<placeholder>r   r   r   r   FÚimage)Ú
modalitiesÚ	input_idsÚmm_token_type_ids)ÚdataÚtensor_type)!Ú_merge_kwargsr   r7   Úinit_kwargsr:   r;   r2   r*   ÚlistÚtupleÚ
ValueErrorr+   r   r6   Ú	TypeErrorÚgetÚiterr>   ÚnextrA   rB   ÚjoinÚappendÚreplaceÚpopÚ_check_special_mm_tokensÚnpÚarrayÚ
zeros_likeÚisinrE   Útolistr   )rH   rL   rM   ÚaudioÚvideosrI   Úoutput_kwargsr:   ÚsublistrT   ÚimÚimage_inputsÚprompt_stringsrQ   Úreplace_stringsÚsampleÚheightÚwidthÚnum_height_tokensÚnum_width_tokensÚreplace_tokensÚitemÚreplace_strr   r   Útext_inputsÚ	array_idsrW   s                               r%   Ú__call__ÚPixtralProcessor.__call__w   sÀ  € ðR ×*Ò*Ü"ñ
à"&§.¡.×"<Ñ"<ð
ð ñ
ˆð —_‘_ t×'>Ñ'>Ñ>ˆ
àÒÜ$ V×,Ñ,Ø ˜‘Ü˜F¤T¬5 M×2Ñ2Ô7LÈVÐTUÉY×7WÑ7WØä˜6¤D¬% =×1Ñ1Ü˜v a™y¬4´¨-×8Ñ8Ü)¨&°©)°A©,×7Ñ7á/5ÔK©v GÂ7¸%š%Á7™%©vÑKä Øzóð ñ OUÓUÉfÈ¬
°2´s×(;Ñ(;”j ”nÀÒCÉfˆFÐUØ×/Ò/°ÑpÀ:ÐpÐQ^Ð_nÑQoÑp‰LàˆLädœC× Ñ Ø6‰DÜ˜D¤$×'Ñ'´
¸4À¹7ÄC×0HÑ0HÜÐ_Ó`Ð`ð ˆØ×Ñ˜NÓ+Ò7ä˜|¨MÑ:Ó;ˆKØˆNØ ˆOäØ×&Ñ&¨&Ó0Ü$(¨Ó$5‘MF˜EØ(.°*Ñ(<Ð%Ø',°
Ñ':Ð$à×)Ñ)Ð*Ð-=Ñ=À×AWÑAWÐ@XÑXð&à)ñ&*Nñ ;IÔ%]¹.¨wÒU\ÈT£dÑU\¡d¹.NÑ%]Ø)-×)=Ñ)=N 2Ñ&Ø"$§'¡'¨.Ó"9KØ#×*Ñ*¨;Ô7Ø#Ÿ^™^¨D×,<Ñ,<¸oÈqÓQFð ×&Ñ&¨&Õ0ð &¨Ó/Ø"1×"5Ñ"5°aÓ"8KØ#Ÿ^™^¨O¸[È!ÓLFð &¨Õ/ð ×%Ñ% f×-ñ% ð( ' }Ñ5×9Ñ9Ð:JÈDÓQˆØ#0°Ñ#?×#CÑ#CÐD^Ð`eÓ#fÐ Ø—n’n ^Ñi°}À]Ñ7SÑiÐdhÒiˆØ×%Ñ% n°kÈwÈiÐ%ÑXæ#ÜŸš ¨[Ñ!9Ó:ˆIÜ "§¢¨k¸+Ñ.FÓ GÐØDEÐœbŸgšg i°·±Ó@ÑAØ/@×/GÑ/GÓ/IˆKÐ+Ñ,äÐ!@ KÐ!@°<Ð!@ÈnÑ]Ð]ùóm Lùò
 Vùó6 &^s   Ã"N.Ä+N4ÈN9c                 ó  • 0 nUbó  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU R                  U R                  -  n/ nU HP  u  p‰[        [        R                  " X‰S45      US   US   4Xf4S9u  p«X¦-  nX¶-  nUR                  US-   U-  5        MR     S/[        U5      -  nUR                  X~S.5        [        S	0 UD6$ )
a{  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.

Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr   Úsizer   Úlongest_edge)r‚   r:   r   )Únum_image_tokensÚnum_image_patchesr   )r   r"   r`   Úupdater6   r‚   r:   r;   r   rh   Úzerosrd   Úlenr	   )rH   rQ   rI   Úvision_datar   r‚   r:   r„   rv   rw   Úresized_heightÚresized_widthrx   ry   r…   s                  r%   Ú_get_num_multimodal_tokensÚ+PixtralProcessor._get_num_multimodal_tokensê   s  € ð ˆØÑ"Ü2×<Ñ<×@Ñ@ÀÐRTÓUˆMØ× Ñ  Ô(à ×$Ñ$ V¨TÓ2×O°d×6JÑ6J×6OÑ6OˆDØŸ™¨4×+BÑ+BÑBˆJà!ÐÛ!,‘Ü0LÜ—H’H˜f¨QÐ/Ó0Ø˜~Ñ.°°^Ñ0DÐEØ *Ð7ñ1Ñ-ð
 %3Ñ$@Ð!Ø#0Ñ#>Ð Ø ×'Ñ'Ð)9¸AÑ)=ÐARÑ(RÖSñ "-ð "# ¤c¨+Ó&6Ñ 6ÐØ×ÑÐ4DÑmÔnäÑ, Ñ,Ð,r$   c                 ó:   • U R                   R                  " U0 UD6$ )zª
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r7   Úbatch_decode©rH   ÚargsrI   s      r%   r   ÚPixtralProcessor.batch_decode  s   € ð
 ~‰~×*Ò*¨DÐ;°FÑ;Ð;r$   c                 ó:   • U R                   R                  " U0 UD6$ )z¤
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r7   Údecoder   s      r%   r”   ÚPixtralProcessor.decode  s   € ð
 ~‰~×$Ò$ dÐ5¨fÑ5Ð5r$   c                 óš   • U R                   R                  nU R                  R                  n[        [        R                  X-   5      5      $ r0   )r7   Úmodel_input_namesr6   r\   ÚdictÚfromkeys)rH   Útokenizer_input_namesÚimage_processor_input_namess      r%   r—   Ú"PixtralProcessor.model_input_names  s>   € ð !%§¡× @Ñ @ÐØ&*×&:Ñ&:×&LÑ&LÐ#Ü”D—M‘MÐ"7Ñ"UÓVÓWÐWr$   )	rA   rC   rB   rD   rE   r>   r@   r:   r;   )NNé   r   Nz[IMG]z[IMG_BREAK]z	[IMG_END])NNNNr0   )r   r   r    r!   Ú__doc__Ú
attributesÚimage_processor_classÚtokenizer_classÚintrG   r   r   r   r   r\   r   r   r   r   rŒ   r   r”   Úpropertyr—   r#   Ú__classcell__)rJ   s   @r%   r4   r4   A   sä   ø† ñð2 $ [Ð1€JØ0ÐØ%€Oð ØØØ"#ØØØ'Ø#ñRð ð	Rð
  ÷Rð Rð4 "Ø^bØØñq^àðq^ð IÐ0°$°y±/À4ÐHYÑCZÐZÑ[ðq^ð Ð/Ñ0ðq^ð 
õq^ôf"-òJ<ò6ð ñXó öXr$   r4   )!rž   Útypingr   Únumpyrh   Úfeature_extraction_utilsr   Úimage_utilsr   r   r   Úprocessing_utilsr	   r
   r   r   Útokenization_utils_baser   r   Úutilsr   r   Úimage_processing_pixtralr   Ú
get_loggerr   Úloggerr   Úboolr.   r2   r4   Ú__all__r   r$   r%   Ú<module>r±      s‰   ðñõ ã å 4ß AÑ A÷ó ÷ Dß 1ñ ×ÑÝFð 
×	Ò	˜HÓ	%€ô
Ð-°Uò 
ð;4ô ;ò
0ôbX~ô bXðJ Ð
r$   