
from typing import Optional, Union

from transformers.models.paligemma.processing_paligemma import (
    IMAGE_TOKEN,
    PaliGemmaProcessor,
    build_string_from_input,
)

from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput, is_valid_image, make_flat_list_of_images
from ...processing_utils import ProcessingKwargs, Unpack
from ...tokenization_utils_base import PreTokenizedInput, TextInput
from ...utils import is_torch_available, logging


if is_torch_available():
    import torch


logger = logging.get_logger(__name__)


class ColPaliProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "padding": "longest",
        },
        "images_kwargs": {
            "data_format": "channels_first",
            "do_convert_rgb": True,
        },
        "common_kwargs": {"return_tensors": "pt"},
    }


class ColPaliProcessor(PaliGemmaProcessor):
    """
    Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as
    well as to compute the late-interaction retrieval score.

    [`ColPaliProcessor`] offers all the functionalities of [`PaliGemmaProcessor`]. See the [`~PaliGemmaProcessor.__call__`]
    for more information.

    Args:
        image_processor ([`SiglipImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
        visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`):
            A string that gets tokenized and prepended to the image tokens.
        query_prefix (`str`, *optional*, defaults to `"Question: "`):
            A prefix to be used for the query.
    """

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        visual_prompt_prefix: str = "Describe the image.",
        query_prefix: str = "Question: ",
    ):
        super().__init__(image_processor=image_processor, tokenizer=tokenizer, chat_template=chat_template)
        self.visual_prompt_prefix = visual_prompt_prefix
        self.query_prefix = query_prefix

    @property
    def query_augmentation_token(self) -> str:
        """
        Return the query augmentation token.

        Query augmentation buffers are used as reasoning buffers during inference.
        """
        return self.tokenizer.pad_token

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
        audio=None,
        videos=None,
        **kwargs: Unpack[ColPaliProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare for the model either (1) one or several texts, or (2) one or several image(s). This method is a custom
        wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
        both text and images at the same time.

        When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
        [`~LlamaTokenizerFast.__call__`].
        When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
        [`~SiglipImageProcessor.__call__`].
        Please refer to the docstring of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        output_kwargs = self._merge_kwargs(
            ColPaliProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        suffix = output_kwargs["text_kwargs"].pop("suffix", None)

        return_token_type_ids = suffix is not None

        if text is None and images is None:
            raise ValueError("Either text or images must be provided")
        if text is not None and images is not None:
            raise ValueError("Only one of text or images can be processed at a time")

        if images is not None:
            # Normalize the image input to a list of images (or a list of lists of images).
            if is_valid_image(images):
                images = [images]
            elif isinstance(images, list) and is_valid_image(images[0]):
                pass
            elif not (isinstance(images, list) and isinstance(images[0], list) and is_valid_image(images[0][0])):
                raise ValueError("images must be an image, list of images or list of list of images")

            texts_doc = [self.visual_prompt_prefix] * len(images)
            images = [image.convert("RGB") for image in images]

            # Build one PaliGemma-style prompt string per document, with the image tokens prepended.
            input_strings = [
                build_string_from_input(
                    prompt=prompt,
                    bos_token=self.tokenizer.bos_token,
                    image_seq_len=self.image_seq_length,
                    image_token=IMAGE_TOKEN,
                    num_images=len(image_list) if isinstance(image_list, list) else 1,
                )
                for prompt, image_list in zip(texts_doc, images)
            ]
            images = make_flat_list_of_images(images)
            pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]

            # max_length has to account for the image tokens that get prepended.
            if output_kwargs["text_kwargs"].get("max_length", None) is not None:
                output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length

            inputs = self.tokenizer(
                input_strings,
                return_token_type_ids=False,
                **output_kwargs["text_kwargs"],
            )

            return_data = {**inputs, "pixel_values": pixel_values}

            if return_token_type_ids:
                labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
                return_data.update({"labels": labels})

            return BatchFeature(data=return_data)

        elif text is not None:
            if isinstance(text, str):
                text = [text]
            elif not (isinstance(text, list) and isinstance(text[0], str)):
                raise ValueError("Text must be a string or a list of strings")

            if suffix is None:
                # Default query augmentation: append a buffer of pad tokens to the query.
                suffix = self.query_augmentation_token * 10

            texts_query: list[str] = []
            for query in text:
                query = self.tokenizer.bos_token + self.query_prefix + query + suffix + "\n"
                texts_query.append(query)

            output_kwargs["text_kwargs"]["max_length"] = output_kwargs["text_kwargs"].get("max_length", 50)

            batch_query = self.tokenizer(
                texts_query,
                return_token_type_ids=False,
                **output_kwargs["text_kwargs"],
            )

            return batch_query

    def process_images(
        self,
        images: ImageInput = None,
        **kwargs: Unpack[ColPaliProcessorKwargs],
    ) -> BatchFeature:
        """
        Prepare for the model one or several image(s). This method is a wrapper around [`ColPaliProcessor.__call__`].

        This method forwards the `images` and `kwargs` arguments to the image processor.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
                number of channels, H and W are image height and width.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        return self.__call__(images=images, **kwargs)

    def process_queries(
        self,
        text: Union[TextInput, list[TextInput]],
        **kwargs: Unpack[ColPaliProcessorKwargs],
    ) -> BatchFeature:
        """
        Prepare for the model one or several texts. This method is a wrapper around [`ColPaliProcessor.__call__`].

        This method forwards the `text` and `kwargs` arguments to the tokenizer.

        Args:
            text (`str`, `list[str]`, `list[list[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
        """
        return self.__call__(text=text, **kwargs)

    def score_retrieval(
        self,
        query_embeddings: Union["torch.Tensor", list["torch.Tensor"]],
        passage_embeddings: Union["torch.Tensor", list["torch.Tensor"]],
        batch_size: int = 128,
        output_dtype: Optional["torch.dtype"] = None,
        output_device: Union["torch.device", str] = "cpu",
    ) -> "torch.Tensor":
        """
        Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
        query embeddings (`query_embeddings`) and passage embeddings (`passage_embeddings`). For ColPali, a passage is
        the image of a document page.

        Because the embedding tensors are multi-vector and can thus have different shapes, they
        should be fed as either:
        (1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim), or
        (2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim), usually
            obtained by padding the list of tensors.

        Args:
            query_embeddings (`Union[torch.Tensor, list[torch.Tensor]]`): Query embeddings.
            passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]]`): Passage embeddings.
            batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
            output_dtype (`torch.dtype`, *optional*): The dtype of the output tensor.
                If `None` (the default), the dtype of the input embeddings is used.
            output_device (`torch.device` or `str`, *optional*, defaults to `"cpu"`): The device of the output tensor.

        Returns:
            `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
            tensor is moved to `output_device` (the CPU by default).
        """
        if len(query_embeddings) == 0:
            raise ValueError("No queries provided")
        if len(passage_embeddings) == 0:
            raise ValueError("No passages provided")

        if query_embeddings[0].device != passage_embeddings[0].device:
            raise ValueError("Queries and passages must be on the same device")

        if query_embeddings[0].dtype != passage_embeddings[0].dtype:
            raise ValueError("Queries and passages must have the same dtype")

        if output_dtype is None:
            output_dtype = query_embeddings[0].dtype

        scores: list["torch.Tensor"] = []

        for i in range(0, len(query_embeddings), batch_size):
            batch_scores: list["torch.Tensor"] = []
            # Pad the current batch of queries to a common sequence length.
            batch_queries = torch.nn.utils.rnn.pad_sequence(
                query_embeddings[i : i + batch_size], batch_first=True, padding_value=0
            )
            for j in range(0, len(passage_embeddings), batch_size):
                batch_passages = torch.nn.utils.rnn.pad_sequence(
                    passage_embeddings[j : j + batch_size], batch_first=True, padding_value=0
                )
                # MaxSim: for each query token, take the best-matching passage token, then sum over query tokens.
                batch_scores.append(
                    torch.einsum("bnd,csd->bcns", batch_queries, batch_passages).max(dim=3)[0].sum(dim=2)
                )
            scores.append(torch.cat(batch_scores, dim=1).to(output_dtype).to(output_device))

        return torch.cat(scores, dim=0)


__all__ = ["ColPaliProcessor"]