
    <h&Q                     P   S SK JrJr  SSKJr  SSKJrJrJr  SSK	J
r
JrJrJr  SSKJrJrJr  SSKJr  \" 5       (       a  S SKr " S	 S
\SS9rSr\" S5       V s/ sH
  n SU S S3PM     sn \" S5       V s/ sH
  n SU S S3PM     sn -   rS r " S S\5      rS/rgs  sn f s  sn f )    )OptionalUnion   )BatchFeature)
ImageInputis_valid_imagemake_flat_list_of_images)MultiModalDataProcessingKwargsProcessorMixinUnpack)
AddedTokenPreTokenizedInput	TextInput)is_torch_availableNc                   .    \ rS rSrSS0SSS.SS0S	.rS
rg)ColPaliProcessorKwargs$   paddinglongestchannels_firstT)data_formatdo_convert_rgbreturn_tensorspt)text_kwargsimages_kwargscommon_kwargs N)__name__
__module____qualname____firstlineno__	_defaults__static_attributes__r       f/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/colpali/processing_colpali.pyr   r   $   s,     y
 ,"
 +D1	Ir&   r   F)totalz<image>i   z<locz0>4>   z<segz0>3c                     X2-  U-   U U  S3$ )a  
Builds a string from the input prompt and image tokens.
For example, for the call:
build_string_from_input(
    prompt="Prefix str"
    bos_token="<s>",
    image_seq_len=3,
    image_token="<im>",
)
The output will be:
"<im><im><im><s>Initial str"
Args:
    prompt (`list[Union[str, ImageInput]]`): The input prompt.
    bos_token (`str`): The beginning of sentence token.
    image_seq_len (`int`): The length of the image sequence.
    image_token (`str`): The image token.
    num_images (`int`): Number of images in the prompt.

r   prompt	bos_tokenimage_seq_lenimage_token
num_imagess        r'   build_string_from_inputr3   5   s"    & )J67	{6("MMr&   c                     ^  \ rS rSrSrSS/rSrSr     S S\S\4U 4S	 jjjr	    S!S
\
S\\\\\   \\   4   S\\   S\4S jjrS"S jrS rS r\S 5       r\S\4S j5       r S"S
\
S\\   S\4S jjrS\\\\   4   S\\   S\4S jr   S#S\S\S   4   S\S\S   4   S\S\S   S\S\4   SS4S jjrSrU =r$ )$ColPaliProcessorK   a  
Constructs a ColPali processor which wraps a PaliGemmaProcessor and special methods to process images and queries, as
well as to compute the late-interaction retrieval score.

[`ColPaliProcessor`] offers all the functionalities of [`PaliGemmaProcessor`]. See the [`~PaliGemmaProcessor.__call__`]
for more information.

Args:
    image_processor ([`SiglipImageProcessor`], *optional*):
        The image processor is a required input.
    tokenizer ([`LlamaTokenizerFast`], *optional*):
        The tokenizer is a required input.
    chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
        in a chat into a tokenizable string.
    visual_prompt_prefix (`str`, *optional*, defaults to `"Describe the image."`):
        A string that gets tokenized and prepended to the image tokens.
    query_prefix (`str`, *optional*, defaults to `"Question: "`):
        A prefix to be used for the query.
image_processor	tokenizer)SiglipImageProcessorSiglipImageProcessorFast)GemmaTokenizerGemmaTokenizerFastvisual_prompt_prefixquery_prefixc                   > [         TU ]  XUS9  Uc  [        S5      eUc  [        S5      e[        US5      (       d  [        S5      eUR                  U l        [        US5      (       dK  [        [        SSS	9nS
U/0nUR                  U5        UR                  [        5      U l	        [        U l
        O"UR                  U l	        UR                  U l
        UR                  [        5        SUl        SUl        X@l        XPl        g )N)chat_templatez)You need to specify an `image_processor`.z"You need to specify a `tokenizer`.image_seq_lengthz;Image processor is missing an `image_seq_length` attribute.r1   FT)
normalizedspecialadditional_special_tokens)super__init__
ValueErrorhasattrrA   r   IMAGE_TOKENadd_special_tokensconvert_tokens_to_idsimage_token_idr1   
add_tokensEXTRA_TOKENSadd_bos_tokenadd_eos_tokenr=   r>   )	selfr7   r8   r@   r=   r>   r1   tokens_to_add	__class__s	           r'   rF   ColPaliProcessor.__init__d   s     	=Q"HIIABB(:;;Z[[ / @ @y-00$[UDQK8;-HM((7"+"A"A+"ND*D"+":":D(44D\*"'	"'	$8!(r&   imagestextkwargsreturnc                 x   U R                   " [        4SU R                  R                  0UD6nUS   R	                  SS5      nUSLnUc  Uc  [        S5      eUb  Ub  [        S5      eUGb  [        U5      (       a  U/nOw[        U[        5      (       a  [        US   5      (       a  ON[        U[        5      (       a.  [        US   [        5      (       a  [        US   S   5      (       d  [        S5      eU R                  /[        U5      -  n	U V
s/ sH  oR                  S	5      PM     nn
[        X5       VVs/ sHT  u  p[        UU R                  R                  U R                  [         [        U[        5      (       a  [        U5      OS
S9PMV     nnn[#        U5      nU R$                  " U40 US   D6S   nUS   R'                  SS5      b  US   S==   U R                  -  ss'   U R                  " U4SS0US   D6n0 UESU0EnU(       a.  US   R)                  US   S:H  S5      nUR+                  SU05        [-        US9$ Ub  [        U[.        5      (       a  U/nO8[        U[        5      (       a  [        US   [.        5      (       d  [        S5      eUc  U R0                  S-  n/ nU H@  nU R                  R                  U R2                  -   U-   U-   S-   nUR5                  U5        MB     US   R'                  SS5      US   S'   U R                  " U4SS0US   D6nU$ gs  sn
f s  snnf )a  
Main method to prepare for the model either (1) one or several texts, either (2) one or several image(s). This method is a custom
wrapper around the PaliGemmaProcessor's [`~PaliGemmaProcessor.__call__`] method adapted for the ColPali model. It cannot process
both text and images at the same time.

When preparing the text(s), this method forwards the `text` and `kwargs` arguments to LlamaTokenizerFast's
[`~LlamaTokenizerFast.__call__`].
When preparing the image(s), this method forwards the `images` and `kwargs` arguments to SiglipImageProcessor's
[`~SiglipImageProcessor.__call__`].
Please refer to the docstring of the above two methods for more information.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
        number of channels, H and W are image height and width.
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
tokenizer_init_kwargsr   suffixNz&Either text or images must be providedz5Only one of text or images can be processed at a timer   zAimages must be an image, list of images or list of list of imagesRGB   r-   r   pixel_values
max_lengthreturn_token_type_idsF	input_idstoken_type_idsilabels)dataz*Text must be a string or a list of strings
   r,   2   )_merge_kwargsr   r8   init_kwargspoprG   r   
isinstancelistr=   lenconvertzipr3   r/   rA   rI   r	   r7   getmasked_fillupdater   strquery_augmentation_tokenr>   append)rQ   rU   rV   audiovideosrW   output_kwargsr[   r`   	texts_docimager.   
image_listinput_stringsr^   inputsreturn_datarc   texts_queryquerybatch_querys                        r'   __call__ColPaliProcessor.__call__   s|   Z **"
"&.."<"<
 

 }-11(DA &d 2<FNEFF 2TUUf%% FD))nVAY.G.G ..:fQi3N3NSabhijbklmbnSoSo !dee223c&kAI8>?ummE*F? +.i*@	 +A&F (!"nn66"&"7"7 +2<Z2N2Ns:TU +A  	 .f5F//Y-:XYZhiL ]+//dCOm,\:d>S>SS:^^&+  .F CVB^\BK$,88@P9QUV9VX\]""Hf#56[11$$$v t,,DGS1I1I !MNN~66;%'K0043D3DDuLvUX\\""5)  :G}9U9Y9YZfhj9kM-(6..&+  .K - C @	s   L1>AL6c                     0 nUb;  U R                   /[        U5      -  nS/[        U5      -  nUR                  XES.5        [        S0 UD6$ )ax  
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.

Args:
    image_sizes (list[list[str]], *optional*):
        The input sizes formatted as (height, width) per each image.
Returns:
    `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
    input modalities, along with other useful data.
r]   )num_image_tokensnum_image_patchesr   )rA   rl   rq   r
   )rQ   image_sizesrW   vision_datar   r   s         r'   _get_num_multimodal_tokens+ColPaliProcessor._get_num_multimodal_tokens  sZ     " $ 5 56[9II!"c+&6 64Dmn,,,r&   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
)r8   batch_decoderQ   argsrW   s      r'   r   ColPaliProcessor.batch_decode  s    
 ~~**D;F;;r&   c                 :    U R                   R                  " U0 UD6$ )z
This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
)r8   decoder   s      r'   r   ColPaliProcessor.decode  s    
 ~~$$d5f55r&   c                     U R                   R                  nU R                  R                  n[        [        R                  X-   5      5      $ N)r8   model_input_namesr7   rk   dictfromkeys)rQ   tokenizer_input_namesimage_processor_input_namess      r'   r   "ColPaliProcessor.model_input_names#  s<     $ @ @&*&:&:&L&L#DMM"7"UVWWr&   c                 .    U R                   R                  $ )zr
Return the query augmentation token.

Query augmentation buffers are used as reasoning buffers during inference.
)r8   	pad_token)rQ   s    r'   rs   )ColPaliProcessor.query_augmentation_token)  s     ~~'''r&   c                 *    U R                   " SSU0UD6$ )a  
Prepare for the model one or several image(s). This method is a wrapper around the `__call__` method of the ColPaliProcessor's
[`ColPaliProcessor.__call__`].

This method forwards the `images` and `kwargs` arguments to the image processor.

Args:
    images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
        The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
        tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
        number of channels, H and W are image height and width.
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
    - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
rU   r   r   )rQ   rU   rW   s      r'   process_imagesColPaliProcessor.process_images2  s    B }}5F5f55r&   c                 *    U R                   " SSU0UD6$ )a?  
Prepare for the model one or several texts. This method is a wrapper around the `__call__` method of the ColPaliProcessor's
[`ColPaliProcessor.__call__`].

This method forwards the `text` and `kwargs` arguments to the tokenizer.

Args:
    text (`str`, `list[str]`, `list[list[str]]`):
        The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
        (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
        `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
    return_tensors (`str` or [`~utils.TensorType`], *optional*):
        If set, will return tensors of a particular framework. Acceptable values are:

        - `'tf'`: Return TensorFlow `tf.constant` objects.
        - `'pt'`: Return PyTorch `torch.Tensor` objects.
        - `'np'`: Return NumPy `np.ndarray` objects.
        - `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
    [`BatchFeature`]: A [`BatchFeature`] with the following fields:

    - **input_ids** -- List of token ids to be fed to a model.
    - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
      `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
      `None`).
rV   r   r   )rQ   rV   rW   s      r'   process_queries ColPaliProcessor.process_queriesU  s    @ }}1$1&11r&   query_embeddingsztorch.Tensorpassage_embeddings
batch_sizeoutput_dtypeztorch.dtypeoutput_deviceztorch.devicec           	         [        U5      S:X  a  [        S5      e[        U5      S:X  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUS   R                  US   R                  :w  a  [        S5      eUc  US   R                  n/ n[	        S[        U5      U5       GH  n/ n[
        R                  R                  R                  R                  XXs-    SSS9n	[	        S[        U5      U5       H}  n
[
        R                  R                  R                  R                  X*X-    SSS9nUR                  [
        R                  " SX5      R                  S	S
9S   R                  SS
95        M     UR                  [
        R                  " USS
9R                  U5      R                  U5      5        GM     [
        R                  " USS
9$ )a  
Compute the late-interaction/MaxSim score (ColBERT-like) for the given multi-vector
query embeddings (`qs`) and passage embeddings (`ps`). For ColPali, a passage is the
image of a document page.

Because the embedding tensors are multi-vector and can thus have different shapes, they
should be fed as:
(1) a list of tensors, where the i-th tensor is of shape (sequence_length_i, embedding_dim)
(2) a single tensor of shape (n_passages, max_sequence_length, embedding_dim) -> usually
    obtained by padding the list of tensors.

Args:
    query_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Query embeddings.
    passage_embeddings (`Union[torch.Tensor, list[torch.Tensor]`): Passage embeddings.
    batch_size (`int`, *optional*, defaults to 128): Batch size for computing scores.
    output_dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): The dtype of the output tensor.
        If `None`, the dtype of the input embeddings is used.
    output_device (`torch.device` or `str`, *optional*, defaults to "cpu"): The device of the output tensor.

Returns:
    `torch.Tensor`: A tensor of shape `(n_queries, n_passages)` containing the scores. The score
    tensor is saved on the "cpu" device.
r   zNo queries providedzNo passages providedz/Queries and passages must be on the same devicez-Queries and passages must have the same dtypeT)batch_firstpadding_valuezbnd,csd->bcnsr   )dim   r]   )rl   rG   devicedtyperangetorchnnutilsrnnpad_sequencert   einsummaxsumcatto)rQ   r   r   r   r   r   scoresibatch_scoresbatch_queriesjbatch_passagess               r'   score_retrieval ColPaliProcessor.score_retrievalw  s   @  A%233!"a'344A%%);A)>)E)EENOOA$$(:1(=(C(CCLMM+A.44L%'q#./<A/1L!HHNN..;; Q^4$VW < M 1c"45zB!&!3!3!@!@&1>:\] "A " ##LL-PTTYZT[\]^bbghbi	 C MM%))La8;;LILL][\ = yyQ''r&   )rA   r1   rL   r>   r=   )NNNzDescribe the image.z
Question: )NNNNr   )r*   Ncpu) r    r!   r"   r#   __doc__
attributesimage_processor_classtokenizer_classrr   rF   r   r   r   r   rk   r   r   r   r   r   r   r   propertyr   rs   r   r   intr   r   r%   __classcell__)rS   s   @r'   r5   r5   K   s   ( $[1JP>O $9( )
 " )  )  )H "^b{{ I0$y/4HYCZZ[{ /0{ 
{z-$<6 X X
 (# ( ( "!6!6 /0!6 
	!6F 2ItI./ 2 /0 2 
	 2L 0449>(^0D DE>( ".$~2F"FG>( 	>(
 }->( ^S01>( 
>( >(r&   r5   )typingr   r   feature_extraction_utilsr   image_utilsr   r   r	   processing_utilsr
   r   r   r   tokenization_utils_baser   r   r   r   r   r   r   rI   r   rN   r3   r5   __all__)r   s   0r'   <module>r      s   . # 4 O O X X O O ' 
-U 
 ).t5A$qgQ5RWX[R\8]R\Q4#waR\8]]N,j(~ j(Z 
M 68]s   B7B#