
    <h&                         S SK JrJr  S SKrSSKJr  SSKJr  SSK	J
r
JrJrJr  SSKJrJr  SSKJr  S	S
KJr   " S S\SS9r " S S\5      rS/rg)    )OptionalUnionN   )BatchFeature)
ImageInput)MultiModalDataProcessingKwargsProcessorMixinUnpack)PreTokenizedInput	TextInput)
TensorType   )AutoTokenizerc                   @    \ rS rSrSSS.SSS.\R
                  S.rSrg)	AriaProcessorKwargs!   F)paddingreturn_mm_token_type_ids  )max_image_sizesplit_image)text_kwargsimages_kwargsreturn_tensors N)__name__
__module____qualname____firstlineno__r   PYTORCH	_defaults__static_attributes__r       `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/aria/processing_aria.pyr   r   !   s.     (-

 " 
 %,,
Ir$   r   F)totalc                      ^  \ rS rSrSrSS/rSrSr    SS\\	\
4   S\\
   S\\\\\4   \4      4U 4S	 jjjr   SS
\\\\\   \\   4   S\\   S\\   S\4S jjrSS jrS rS r\S 5       rSrU =r$ )AriaProcessor/   a  
AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer.

Args:
    image_processor (`AriaImageProcessor`, *optional*):
        The AriaImageProcessor to use for image preprocessing.
    tokenizer (`PreTrainedTokenizerBase`, *optional*):
        An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
    chat_template (`str`, *optional*):
        A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
    size_conversion (`Dict`, *optional*):
        A dictionary indicating size conversions for images.
image_processor	tokenizerAriaImageProcessorr   chat_templatesize_conversionc                 *  > Uc  SSS.nUR                  5        VVs0 sH  u  pV[        U5      U_M     snnU l        UR                  U l        UR                  U l        Ub  UR
                  c  UR                  Ul        [        TU ]!  XUS9  g s  snnf )N      )i  r   )r-   )	itemsintr.   image_tokenimage_token_id	pad_token	unk_tokensuper__init__)selfr*   r+   r-   r.   kv	__class__s          r%   r9   AriaProcessor.__init__B   s     "$'c2O6E6K6K6MN6MdaA	6MN$00'66 Y%8%8%@"+"5"5I=Q  Os   Btextimageskwargsreturnc                    U R                   " [        4SU R                  R                  0UD6n[	        U[
        5      (       a  U/nO8[	        U[        5      (       d#  [	        US   [
        5      (       d  [        S5      eUb  U R                  " U40 US   D6nU R                  UR                  R                  S      n/ n	UR                  S5      U-  n
U HQ  nUR                  U R                  R                  U R                  R                  U
-  5      nU	R                  U5        MS     O0 nUn	US   R                  S	S5      nUS   R                  S
S5      nU R                  " U	40 US   DS	S0D6nU R!                  XS/S9  U(       aV  ["        R$                  " US   5      n["        R&                  " US   5      nSUXR(                  :H  '   UR+                  5       US'   [-        0 UEUEUS9$ )a  
Main method to prepare for the model one or several sequences(s) and image(s).

Args:
    text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    images (`ImageInput`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. Both channels-first and channels-last formats are supported.


Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:
    - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
    `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
    `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
    - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`.
tokenizer_init_kwargsr   zAInvalid input text. Please provide a string, or a list of stringsNr   r   	num_cropsr   r   r   Fimage)
modalities	input_ids   mm_token_type_ids)datatensor_type)_merge_kwargsr   r+   init_kwargs
isinstancestrlist	TypeErrorr*   r.   pixel_valuesshapepopreplacer4   append_check_special_mm_tokensnparray
zeros_liker5   tolistr   )r:   r?   r@   audiovideosrA   output_kwargsimage_inputstokens_per_imageprompt_stringsrE   sampler   r   text_inputs	array_idsrJ   s                    r%   __call__AriaProcessor.__call__T   s   < **
"&.."<"<
 
 dC  6DD$''
47C0H0H_``//Y-:XYL#33L4M4M4S4STU4VWN$((58HHI(B(BDNND^D^ajDjk%%f- 
 L!N&}599:JDQ#0#?#C#CD^`e#f nn^i}]7Sidhi%%nwi%X#[!9:I "k+.F GBCi+>+>>?/@/G/G/IK+,!@K!@<!@n]]r$   c                    0 nUb  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU Vs/ sH!  nU R                  R                  " / UQUP76 PM#     nnU Vs/ sH  oR                  U   U-  PM     n	nUR                  XS.5        [        S0 UD6$ s  snf s  snf )ay  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr   r   )num_image_tokensnum_image_patchesr   )	r   r"   getupdater*   r   get_number_of_image_patchesr.   r   )
r:   image_sizesrA   vision_datar   max_size
image_sizerj   num_patchesri   s
             r%   _get_num_multimodal_tokens(AriaProcessor._get_num_multimodal_tokens   s     "/99==orRM  ($(()94@gDDXDXDgDgH #.!"-J $$@@\*\m\"-  ! arr`qQ\ 4 4X > L`qr4Dmn,,,!  ss   *'CCc                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r+   batch_decoder:   argsrA   s      r%   rv   AriaProcessor.batch_decode   s    
 ~~**D;F;;r$   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r+   decoderw   s      r%   r{   AriaProcessor.decode   s    
 ~~$$d5f55r$   c                     U R                   R                  nU R                  R                  nU Vs/ sH  o3S:w  d  M
  UPM     nn[        [        R                  X-   5      5      $ s  snf )NrE   )r+   model_input_namesr*   rQ   dictfromkeys)r:   tokenizer_input_namesimage_processor_input_namesnames       r%   r~   AriaProcessor.model_input_names   sb     $ @ @&*&:&:&L&L# 9T&k8S_jWjt8S#&kDMM"7"UVWW 'ls
   A%A%)r4   r5   r.   )NNNN)NNN)N)r   r   r   r    __doc__
attributesimage_processor_classtokenizer_classr   r   rP   r   r   floatr3   r9   r   r   rQ   r   r   r   r   rf   rs   rv   r{   propertyr~   r#   __classcell__)r=   s   @r%   r(   r(   /   s
    $[1J0%O /3'+BFR +,R  }	R
 "$uUCZ'8#'=">?R R* (,B^I0$y/4HYCZZ[B^ $B^ ,-B^ 
B^H-4<6 X Xr$   r(   )typingr   r   numpyrY   image_processing_utilsr   image_utilsr   processing_utilsr   r	   r
   r   tokenization_utilsr   r   utilsr   autor   r   r(   __all__r   r$   r%   <module>r      sL   * #  2 % X X >   *% YXN YXx 
r$   