ó
    <±hMN  ã                   ó´   • S SK JrJr  SSKJr  SSKJrJr  SSKJ	r	J
r
JrJr  SSKJrJr  SSKJr  \" 5       (       a  S SKr " S	 S
\
SS9r " S S\5      rS/rg)é    )ÚOptionalÚUnioné   )ÚBatchFeature)Ú
ImageInputÚis_valid_image)ÚMultiModalDataÚProcessingKwargsÚProcessorMixinÚUnpack)ÚPreTokenizedInputÚ	TextInput)Úis_torch_availableNc                   ó.   • \ rS rSrSS0SSS.SS0S	.rS
rg)ÚColQwen2ProcessorKwargsé#   ÚpaddingÚlongestÚchannels_firstT)Údata_formatÚdo_convert_rgbÚreturn_tensorsÚpt)Útext_kwargsÚimages_kwargsÚcommon_kwargs© N)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú	_defaultsÚ__static_attributes__r   ó    Úh/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/colqwen2/processing_colqwen2.pyr   r   #   s,   † ð yð
ð ,Ø"ñ
ð +¨DÐ1ñ	ƒIr$   r   F)Útotalc                   ó  ^ • \ rS rSrSrSS/rSrSr     S S\\	   S\\	   4U 4S	 jjjr
    S!S
\S\\\\\   \\   4   S\\   S\4S jjrS"S jrS rS r\S 5       r\S\	4S j5       r S"S
\S\\   S\4S jjrS\\\\   4   S\\   S\4S jr   S#S\S\S   4   S\S\S   4   S\S\S   S\S\	4   SS4S jjrSrU =r$ )$ÚColQwen2Processoré0   a[  
Constructs a ColQwen2 processor which wraps a Qwen2VLProcessor and special methods to process images and queries, as
well as to compute the late-interaction retrieval score.

[`ColQwen2Processor`] offers all the functionalities of [`Qwen2VLProcessor`]. See the [`~Qwen2VLProcessor.__call__`]
for more information.

Args:
    image_processor ([`Qwen2VLImageProcessor`], *optional*):
        The image processor is a required input.
    tokenizer ([`Qwen2TokenizerFast`], *optional*):
        The tokenizer is a required input.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
    visual_prompt_prefix (`str`, *optional*): A string that gets tokenized and prepended to the image tokens.
    query_prefix (`str`, *optional*): A prefix to be used for the query.
Úimage_processorÚ	tokenizerÚAutoImageProcessor)ÚQwen2TokenizerÚQwen2TokenizerFastÚvisual_prompt_prefixÚquery_prefixc                 óÞ   >• [         TU ]  XUS9  [        US5      (       d  SOUR                  U l        [        US5      (       d  SOUR                  U l        Uc  SnX@l        Uc  SnXPl        g )N)Úchat_templateÚimage_tokenz<|image_pad|>Úvideo_tokenz<|video_pad|>zf<|im_start|>user
<|vision_start|><|image_pad|><|vision_end|>Describe the image.<|im_end|><|endoftext|>zQuery: )ÚsuperÚ__init__Úhasattrr3   r4   r/   r0   )Úselfr*   r+   r2   r/   r0   ÚkwargsÚ	__class__s          €r%   r6   ÚColQwen2Processor.__init__H   sw   ø€ ô 	‰Ñ˜À=ÐÑQÜ29¸)À]×2SÑ2S™?ÐYb×YnÑYnˆÔÜ29¸)À]×2SÑ2S™?ÐYb×YnÑYnˆÔàÑ'ð $MÐ Ø$8Ô!àÑØ$ˆLØ(Õr$   ÚimagesÚtextr9   Úreturnc                 óä  • U R                   " [        4SU R                  R                  0UD6nUS   R	                  SS5      nUSLnUc  Uc  [        S5      eUb  Ub  [        S5      eUGbV  [        U5      (       a  U/nOw[        U[        5      (       a  [        US   5      (       a  ON[        U[        5      (       a.  [        US   [        5      (       a  [        US   S   5      (       d  [        S5      eU R                  /[        U5      -  n	U R                  " SS	U0US
   D6n
U
S   nUb¸  U R                  R                  S-  nSn[        [        U	5      5       H…  nU R                  Xž   ;   aP  Xž   R                  U R                  SX½   R!                  5       U-  -  S5      Xž'   US-  nU R                  Xž   ;   a  MP  Xž   R                  SU R                  5      Xž'   M‡     U R                  " U	4SS0US   D6n[#        0 UEU
ES9nUS   SS2S4   US   SS2S4   -  n[        [$        R&                  " US   UR)                  5       5      5      n[$        R*                  R,                  R.                  R1                  USS9US'   U(       a.  US   R3                  US   S:H  S5      nUR5                  SU05        U$ Ub«  [        U[6        5      (       a  U/nO8[        U[        5      (       a  [        US   [6        5      (       d  [        S5      eUc  U R8                  S-  n/ nU H&  nU R:                  U-   U-   nUR=                  U5        M(     U R                  " U4SS0US   D6nU$ g)aø  
Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom
wrapper around the Qwen2VLProcessor's [`~Qwen2VLProcessor.__call__`] method adapted for the ColQwen2 model. It cannot process
both text and images at the same time.

When preparing the the text(s), this method forwards the `text` and `kwargs` arguments to Qwen2TokenizerFast's
[`~Qwen2TokenizerFast.__call__`].
When preparing the the image(s), this method forwards the `images` and `kwargs` arguments to Qwen2VLImageProcessor's
[`~Qwen2VLImageProcessor.__call__`].
Please refer to the doctsring of the above two methods for more information.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
        number of channels, H and W are image height and width.
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
Útokenizer_init_kwargsr   ÚsuffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timer   zAimages must be an image, list of images or list of list of imagesr<   r   Úimage_grid_thwé   z<|placeholder|>é   Úreturn_token_type_idsF)ÚdataÚpixel_valuesT)Úbatch_firstÚ	input_idsÚtoken_type_idsiœÿÿÿÚlabelsz*Text must be a string or a list of stringsé
   r   )Ú_merge_kwargsr   r+   Úinit_kwargsÚpopÚ
ValueErrorr   Ú
isinstanceÚlistr/   Úlenr*   Ú
merge_sizeÚranger3   ÚreplaceÚprodr   ÚtorchÚsplitÚtolistÚnnÚutilsÚrnnÚpad_sequenceÚmasked_fillÚupdateÚstrÚquery_augmentation_tokenr0   Úappend)r8   r<   r=   ÚaudioÚvideosr9   Úoutput_kwargsrA   rE   Ú	texts_docÚimage_inputsrB   Úmerge_lengthÚindexÚiÚtext_inputsÚreturn_dataÚoffsetsrG   rK   Útexts_queryÚqueryÚaugmented_queryÚbatch_querys                           r%   Ú__call__ÚColQwen2Processor.__call__]   s»  € ðZ ×*Ò*Ü#ñ
à"&§.¡.×"<Ñ"<ð
ð ñ
ˆð
 ˜}Ñ-×1Ñ1°(¸DÓAˆà &¨dÐ 2Ðà‰<˜F™NÜÐEÓFÐFØÑ Ñ 2ÜÐTÓUÐUàÒÜ˜f×%Ñ%Ø ˜‘Ü˜F¤D×)Ñ)¬n¸VÀA¹Y×.GÑ.GØÜ  ¬×.Ñ.´:¸fÀQ¹iÌ×3NÑ3NÔSaÐbhÐijÑbkÐlmÑbn×SoÑSoÜ Ð!dÓeÐeà×2Ñ2Ð3´c¸&³kÑAˆIà×/Ò/Ñ`°vÐ`ÀÈÑA_Ñ`ˆLØ)Ð*:Ñ;ˆNàÑ)Ø#×3Ñ3×>Ñ>ÀÑAØÜœs 9›~Ö.AØ×*Ñ*¨i©lÓ:Ø'0¡|×';Ñ';Ø ×,Ñ,Ð.?À>ÑCX×C]ÑC]ÓC_ÐcoÑCoÑ.pÐrsó(˜	™ð  ™
˜ð	 ×*Ñ*¨i©lÕ:ð
 $-¡<×#7Ñ#7Ð8IÈ4×K[ÑK[Ó#\I“Lñ /ð Ÿ.š.Øñà&+ðð   Ñ.ñˆKô 'Ð,K¨{Ð,K¸lÐ,KÑLˆKð "Ð"2Ñ3²A°q°DÑ9¸KÐHXÑ<YÒZ[Ð]^ÐZ^Ñ<_Ñ_ˆGô  Ü—’˜K¨Ñ7¸¿¹Ó9IÓJóˆLô
 +0¯(©(¯.©.×*<Ñ*<×*IÑ*IØ¨$ð +Jð +ˆK˜Ñ'ö %Ø$ [Ñ1×=Ñ=¸kÐJZÑ>[Ð_`Ñ>`ÐbfÓgØ×"Ñ" H¨fÐ#5Ô6àÐàÑÜ˜$¤×$Ñ$Øv‘Ü  ¤t×,Ñ,´¸DÀ¹GÄS×1IÑ1IÜ Ð!MÓNÐNà‰~Ø×6Ñ6¸Ñ;à%'ˆKãØ"&×"3Ñ"3°eÑ";¸fÑ"DØ×"Ñ" ?Ö3ñ ð Ÿ.š.Øñà&+ðð   Ñ.ñˆKð Ðð+ r$   c                 ó¢  • 0 nUb¶  [         R                  R                  S0 5      nUR                  U5        UR                  SS5      =(       d    U R                  R
                  nU Vs/ sH!  nU R                  R                  " / UQUP76 PM#     nnU Vs/ sH
  oˆUS-  -  PM     n	nUR                  X—S.5        [        S0 UD6$ s  snf s  snf )ay  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
Args:
    image_sizes (`list[list[int]]`, *optional*):
        The input sizes formatted as (height, width) per each image.
Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
Nr   rT   rC   )Únum_image_tokensÚnum_image_patchesr   )r   r"   Úgetr`   r*   rT   Úget_number_of_image_patchesr	   )
r8   Úimage_sizesr9   Úvision_datar   rT   Ú
image_sizerw   Únum_patchesrv   s
             r%   Ú_get_num_multimodal_tokensÚ,ColQwen2Processor._get_num_multimodal_tokensâ   sá   € ð ˆØÑ"Ü3×=Ñ=×AÑAÀ/ÐSUÓVˆMØ× Ñ  Ô(Ø&×*Ñ*¨<¸Ó>×aÀ$×BVÑBV×BaÑBaˆJñ #.ó!á"-Jð ×$Ñ$×@Ò@Ð\À*Ð\ÈmÕ\Ù"-ð ð !ñ SdÓdÑRcÀ;°
¸A±Ô!=ÑRcÐÐdØ×ÑÐ4DÑmÔnäÑ, Ñ,Ð,ùò!ùò  es   Á*'CÂCc                 ó:   • U R                   R                  " U0 UD6$ )zª
This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r+   Úbatch_decode©r8   Úargsr9   s      r%   r   ÚColQwen2Processor.batch_decodeü   s   € ð
 ~‰~×*Ò*¨DÐ;°FÑ;Ð;r$   c                 ó:   • U R                   R                  " U0 UD6$ )z¤
This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r+   Údecoder‚   s      r%   r†   ÚColQwen2Processor.decode  s   € ð
 ~‰~×$Ò$ dÐ5¨fÑ5Ð5r$   c                 óš   • U R                   R                  nU R                  R                  n[        [        R                  X-   5      5      $ ©N)r+   Úmodel_input_namesr*   rR   ÚdictÚfromkeys)r8   Útokenizer_input_namesÚimage_processor_input_namess      r%   rŠ   Ú#ColQwen2Processor.model_input_names
  s<   € à $§¡× @Ñ @ÐØ&*×&:Ñ&:×&LÑ&LÐ#Ü”D—M‘MÐ"7Ñ"UÓVÓWÐWr$   c                 ó.   • U R                   R                  $ )zr
Return the query augmentation token.

Query augmentation buffers are used as reasoning buffers during inference.
)r+   Ú	pad_token)r8   s    r%   rb   Ú*ColQwen2Processor.query_augmentation_token  s   € ð ~‰~×'Ñ'Ð'r$   c                 ó*   • U R                   " SSU0UD6$ )aè  
Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColQwen2Processor's
[`ColQwen2Processor.__call__`].

This method forwards the `images` and `kwargs` arguments to the image processor.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
        number of channels, H and W are image height and width.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
r<   r   ©rs   )r8   r<   r9   s      r%   Úprocess_imagesÚ ColQwen2Processor.process_images  s   € ðB }Š}Ñ5 FÐ5¨fÑ5Ð5r$   c                 ó*   • U R                   " SSU0UD6$ )aA  
Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColQwen2Processor's
[`ColQwen2Processor.__call__`].

This method forwards the `text` and `kwargs` arguments to the tokenizer.

Args:
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
r=   r   r”   )r8   r=   r9   s      r%   Úprocess_queriesÚ!ColQwen2Processor.process_queries<  s   € ð@ }Š}Ñ1 $Ð1¨&Ñ1Ð1r$   Úquery_embeddingsztorch.TensorÚpassage_embeddingsÚ
batch_sizeÚoutput_dtypeztorch.dtypeÚoutput_deviceztorch.devicec           	      óÊ  • [        U5      S:X  a  [        S5      e[        U5      S:X  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUc  US   R                  n/ n[	        S[        U5      U5       GH  n/ n[
        R                  R                  R                  R                  XXs-    SSS9n	[	        S[        U5      U5       H}  n
[
        R                  R                  R                  R                  X*X£-    SSS9nUR                  [
        R                  " SX›5      R                  S	S
9S   R                  SS
95        M     UR                  [
        R                  " USS
9R                  U5      R                  U5      5        GM     [
        R                  " USS
9$ )aÃ  
Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
query embeddings (`qs`) and passage embeddings (`ps`). For ColQwen2, a passage is the
image of a document page.

Because the embedding tensors are multi-vector and can thus have different shapes, they
should be fed as:
(1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
(2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
    obtained by padding the list of tensors.

Args:
    query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
    passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
    batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
    output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
        If `None`, the dtype of the input embeddings is used.
    output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

Returns:
    `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
    tensor is saved on the "cpu" device.
r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)rH   Úpadding_valuezbnd,csd->bcnsr   )ÚdimrC   rD   )rS   rP   ÚdeviceÚdtyperU   rX   r[   r\   r]   r^   rc   ÚeinsumÚmaxÚsumÚcatÚto)r8   rš   r›   rœ   r   rž   Úscoresrk   Úbatch_scoresÚbatch_queriesÚjÚbatch_passagess               r%   Úscore_retrievalÚ!ColQwen2Processor.score_retrieval^  sÍ  € ô@ ÐÓ  AÓ%ÜÐ2Ó3Ð3ÜÐ!Ó" aÓ'ÜÐ3Ó4Ð4à˜AÑ×%Ñ%Ð);¸AÑ)>×)EÑ)EÓEÜÐNÓOÐOà˜AÑ×$Ñ$Ð(:¸1Ñ(=×(CÑ(CÓCÜÐLÓMÐMàÑØ+¨AÑ.×4Ñ4ˆLà%'ˆäqœ#Ð.Ó/°×<ˆAØ/1ˆLÜ!ŸH™HŸN™N×.Ñ.×;Ñ;Ø  Q¡^Ð4À$ÐVWð <ð ˆMô ˜1œcÐ"4Ó5°zÖBÜ!&§¡§¡×!3Ñ!3×!@Ñ!@Ø&¨1©>Ð:ÈÐ\]ð "Að "ð ×#Ñ#Ü—L’L °-ÓP×TÑTÐYZÐTÐ[Ð\]Ñ^×bÑbÐghÐbÐiöñ	 Cð M‰Mœ%Ÿ)š) L°aÑ8×;Ñ;¸LÓI×LÑLÈ]Ó[×\ñ =ô yŠy˜ QÑ'Ð'r$   )r3   r0   r4   r/   )NNNNN)NNNNr‰   )é€   NÚcpu) r   r   r    r!   Ú__doc__Ú
attributesÚimage_processor_classÚtokenizer_classr   ra   r6   r   r   r   r   rR   r   r   r   rs   r~   r   r†   ÚpropertyrŠ   rb   r•   r˜   Úintr®   r#   Ú__classcell__)r:   s   @r%   r(   r(   0   sÓ  ø† ñð$ $ [Ð1€Jà0ÐØ>€Oð ØØØ.2Ø&*ñ)ð
 ' s™mð)ð ˜s‘m÷)ð )ð. "Ø^bØØñCàðCð IÐ0°$°y±/À4ÐHYÑCZÐZÑ[ðCð Ð0Ñ1ðCð 
õCôJ-ò4<ò6ð ñXó ðXð
 ð(¨#ó (ó ð(ð "ñ!6àð!6ð Ð0Ñ1ð!6ð 
õ	!6ðF 2àI˜t I™Ð.Ñ/ð 2ð Ð0Ñ1ð 2ð 
ô	 2ðL Ø04Ø49ñ>(à °°^Ñ0DÐ DÑEð>(ð " .°$°~Ñ2FÐ"FÑGð>(ð ð	>(ð
 ˜}Ñ-ð>(ð ˜^¨SÐ0Ñ1ð>(ð 
÷>(ó >(r$   r(   )Útypingr   r   Úfeature_extraction_utilsr   Úimage_utilsr   r   Úprocessing_utilsr	   r
   r   r   Útokenization_utils_baser   r   r\   r   rX   r   r(   Ú__all__r   r$   r%   Ú<module>r¿      sR   ð÷, #å 4ß 5ß XÓ Xß CÝ 'ñ ×ÑÛô
Ð.°eò 
ôl(˜ô l(ð^ Ð
r$   