ó
    <±h¯&  ã                   ó¤   • S SK JrJr  S SKrSSKJr  SSKJr  SSK	J
r
JrJrJr  SSKJrJr  SSKJr  S	S
KJr   " S S\SS9r " S S\5      rS/rg)é    )ÚOptionalÚUnionNé   )ÚBatchFeature)Ú
ImageInput)ÚMultiModalDataÚProcessingKwargsÚProcessorMixinÚUnpack)ÚPreTokenizedInputÚ	TextInput)Ú
TensorTypeé   )ÚAutoTokenizerc                   ó@   • \ rS rSrSSS.SSS.\R
                  S.rSrg)	ÚAriaProcessorKwargsé!   F)ÚpaddingÚreturn_mm_token_type_idséÔ  )Úmax_image_sizeÚsplit_image)Útext_kwargsÚimages_kwargsÚreturn_tensors© N)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__r   ÚPYTORCHÚ	_defaultsÚ__static_attributes__r   ó    Ú`/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/aria/processing_aria.pyr   r   !   s.   † ð Ø(-ñ
ð
 "Ø ñ
ð %×,Ñ,ñ
ƒIr$   r   F)Útotalc                   óò   ^ • \ rS rSrSrSS/rSrSr    SS\\	\
4   S\\
   S\\\\\4   \4      4U 4S	 jjjr   SS
\\\\\   \\   4   S\\   S\\   S\4S jjrSS jrS rS r\S 5       rSrU =r$ )ÚAriaProcessoré/   a³  
AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer.

Args:
    image_processor (`AriaImageProcessor`, *optional*):
        The AriaImageProcessor to use for image preprocessing.
    tokenizer (`PreTrainedTokenizerBase`, *optional*):
        An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
    chat_template (`str`, *optional*):
        A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
    size_conversion (`Dict`, *optional*):
        A dictionary indicating size conversions for images.
Úimage_processorÚ	tokenizerÚAriaImageProcessorr   Úchat_templateÚsize_conversionc                 ó*  >• Uc  SSS.nUR                  5        VVs0 sH  u  pV[        U5      U_M     snnU l        UR                  U l        UR                  U l        Ub  UR
                  c  UR                  Ul        [        TU ]!  XUS9  g s  snnf )Né€   é   )iê  r   )r-   )	ÚitemsÚintr.   Úimage_tokenÚimage_token_idÚ	pad_tokenÚ	unk_tokenÚsuperÚ__init__)Úselfr*   r+   r-   r.   ÚkÚvÚ	__class__s          €r%   r9   ÚAriaProcessor.__init__B   s”   ø€ ð Ñ"Ø$'¨cÑ2ˆOØ6E×6KÑ6KÔ6MÔNÑ6M©d¨a¤ A£¨¢	Ñ6MÒNˆÔà$×0Ñ0ˆÔØ'×6Ñ6ˆÔØÑ  Y×%8Ñ%8Ñ%@Ø"+×"5Ñ"5ˆIÔä‰Ñ˜À=ÐÒQùó  Os   BÚtextÚimagesÚkwargsÚreturnc                 óà  • U R                   " [        4SU R                  R                  0UD6n[	        U[
        5      (       a  U/nO8[	        U[        5      (       d#  [	        US   [
        5      (       d  [        S5      eUbª  U R                  " U40 US   D6nU R                  UR                  R                  S      n/ n	UR                  S5      U-  n
U HQ  nUR                  U R                  R                  U R                  R                  U
-  5      nU	R                  U5        MS     O0 nUn	US   R                  S	S5      nUS   R                  S
S5      nU R                  " U	40 US   DS	S0D6nU R!                  XžS/S9  U(       aV  ["        R$                  " US   5      n["        R&                  " US   5      nSUXðR(                  :H  '   UR+                  5       US'   [-        0 UEUEUS9$ )a  
Main method to prepare for the model one or several sequences(s) and image(s).

Args:
    text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    images (`ImageInput`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.


Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:
    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
    `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
    `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
Útokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsNr   r   Ú	num_cropsr   r   r   FÚimage)Ú
modalitiesÚ	input_idsé   Úmm_token_type_ids)ÚdataÚtensor_type)Ú_merge_kwargsr   r+   Úinit_kwargsÚ
isinstanceÚstrÚlistÚ	TypeErrorr*   r.   Úpixel_valuesÚshapeÚpopÚreplacer4   ÚappendÚ_check_special_mm_tokensÚnpÚarrayÚ
zeros_liker5   Útolistr   )r:   r?   r@   ÚaudioÚvideosrA   Úoutput_kwargsÚimage_inputsÚtokens_per_imageÚprompt_stringsrE   Úsampler   r   Útext_inputsÚ	array_idsrJ   s                    r%   Ú__call__ÚAriaProcessor.__call__T   sí  € ð< ×*Ò*Üñ
à"&§.¡.×"<Ñ"<ð
ð ñ
ˆô dœC× Ñ Ø6‰DÜ˜D¤$×'Ñ'´
¸4À¹7ÄC×0HÑ0HÜÐ_Ó`Ð`àÑØ×/Ò/°ÑY¸-ÈÑ:XÑYˆLà#×3Ñ3°L×4MÑ4M×4SÑ4SÐTUÑ4VÑWÐØˆNØ$×(Ñ(¨Ó5Ð8HÑHˆIÛØŸ™¨¯©×(BÑ(BÀDÇNÁN×D^ÑD^ÐajÑDjÓkØ×%Ñ% fÖ-ò ð
 ˆLØ!ˆNà& }Ñ5×9Ñ9Ð:JÈDÓQˆØ#0°Ñ#?×#CÑ#CÐD^Ð`eÓ#fÐ Ø—n’n ^Ñi°}À]Ñ7SÑiÐdhÒiˆØ×%Ñ% nÈwÈiÐ%ÑXæ#ÜŸš ¨[Ñ!9Ó:ˆIÜ "§¢¨k¸+Ñ.FÓ GÐØBCÐ˜i×+>Ñ+>Ñ>Ñ?Ø/@×/GÑ/GÓ/IˆKÐ+Ñ,äÐ!@ KÐ!@°<Ð!@ÈnÑ]Ð]r$   c                 ó¶  • 0 nUbÀ  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU Vs/ sH!  nU R                  R                  " / UQUP76 PM#     nnU Vs/ sH  o€R                  U   U-  PM     n	nUR                  X—S.5        [        S0 UD6$ s  snf s  snf )ay  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr   r   )Únum_image_tokensÚnum_image_patchesr   )	r   r"   ÚgetÚupdater*   r   Úget_number_of_image_patchesr.   r   )
r:   Úimage_sizesrA   Úvision_datar   Úmax_sizeÚ
image_sizerj   Únum_patchesri   s
             r%   Ú_get_num_multimodal_tokensÚ(AriaProcessor._get_num_multimodal_tokens˜   sé   € ð ˆØÑ"Ü/×9Ñ9×=Ñ=¸oÈrÓRˆMØ× Ñ  Ô(à$×(Ñ(Ð)9¸4Ó@×gÀD×DXÑDX×DgÑDgˆHñ #.ó!á"-Jð ×$Ñ$×@Ò@Ð\À*Ð\ÈmÕ\Ù"-ð ð !ñ arÓrÑ`qÐQ\× 4Ñ 4°XÑ >ÀÔ LÑ`qÐÐrØ×ÑÐ4DÑmÔnäÑ, Ñ,Ð,ùò!ùò  ss   Á*'CÂCc                 ó:   • U R                   R                  " U0 UD6$ )zª
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r+   Úbatch_decode©r:   ÚargsrA   s      r%   rv   ÚAriaProcessor.batch_decode²   s   € ð
 ~‰~×*Ò*¨DÐ;°FÑ;Ð;r$   c                 ó:   • U R                   R                  " U0 UD6$ )z¤
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r+   Údecoderw   s      r%   r{   ÚAriaProcessor.decode¹   s   € ð
 ~‰~×$Ò$ dÐ5¨fÑ5Ð5r$   c                 óÔ   • U R                   R                  nU R                  R                  nU Vs/ sH  o3S:w  d  M
  UPM     nn[        [        R                  X-   5      5      $ s  snf )NrE   )r+   Úmodel_input_namesr*   rQ   ÚdictÚfromkeys)r:   Útokenizer_input_namesÚimage_processor_input_namesÚnames       r%   r~   ÚAriaProcessor.model_input_namesÀ   sb   € à $§¡× @Ñ @ÐØ&*×&:Ñ&:×&LÑ&LÐ#ñ 9TÓ&kÑ8S°Ð_jÑWj§tÑ8SÐ#Ð&kÜ”D—M‘MÐ"7Ñ"UÓVÓWÐWùò 'ls
   ±A%½A%)r4   r5   r.   )NNNN)NNN)N)r   r   r   r    Ú__doc__Ú
attributesÚimage_processor_classÚtokenizer_classr   r   rP   r   r   Úfloatr3   r9   r   r   rQ   r   r   r   r   rf   rs   rv   r{   Úpropertyr~   r#   Ú__classcell__)r=   s   @r%   r(   r(   /   s
  ø† ñð $ [Ð1€JØ0ÐØ%€Oð Ø/3Ø'+ØBFñRð ˜¨Ð+Ñ,ðRð   ‘}ð	Rð
 " $ u¨U°C¨ZÑ'8¸#Ð'=Ñ">Ñ?÷Rð Rð* (,ØØñB^àIÐ0°$°y±/À4ÐHYÑCZÐZÑ[ðB^ð ˜Ñ$ðB^ð Ð,Ñ-ðB^ð 
õB^ôH-ò4<ò6ð ñXó öXr$   r(   )Útypingr   r   ÚnumpyrY   Úimage_processing_utilsr   Úimage_utilsr   Úprocessing_utilsr   r	   r
   r   Útokenization_utilsr   r   Úutilsr   Úautor   r   r(   Ú__all__r   r$   r%   Ú<module>r•      sL   ð÷* #ã å 2Ý %ß XÓ Xß >Ý Ý  ôÐ*°%ò ôYXNô YXðx Ð
r$   