
    hx              
          d Z ddlZddlZddlZddlmZmZ ddlmZ ddl	m
Z
 ddlmZmZmZmZmZ ddlmZ dd	lmZmZ eeeeef      eeeeeef      eeeeef         eeeeeef         f   Z G d
 ded      Z G d ded      Z G d ded      Z G d de      Zdeeeeef   dedeeef   fdZdededefdZ d Z!d Z"d Z#ddZ$dgZ%y) zProcessor class for KOSMOS-2.    N)OptionalUnion   )BatchFeature)
ImageInput)ImagesKwargsProcessingKwargsProcessorMixin
TextKwargsUnpack)
AddedToken)BatchEncoding	TextInputc                   D    e Zd ZU eee      ed<   ee   ed<   ee   ed<   y)Kosmos2ImagesKwargsbboxesnum_image_tokensfirst_image_token_idN)__name__
__module____qualname__r   listfloat__annotations__int     m/var/www/html/aiagenthome/venv/lib/python3.12/site-packages/transformers/models/kosmos2/processing_kosmos2.pyr   r   %   s%    T%[!!sm#"3-'r   r   F)totalc                       e Zd ZU ee   ed<   y)Kosmos2TextKwargsadd_eos_tokenN)r   r   r   r   boolr   r   r   r   r!   r!   +   s    D>!r   r!   c            
       D    e Zd ZU eed<   eed<   dddddddddd	ddid	Zy
)Kosmos2ProcessorKwargstext_kwargsimages_kwargsTFr   )	add_special_tokenspaddingstridereturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_token_type_idsverboser"   r   @   )r&   r'   N)r   r   r   r!   r   r   	_defaultsr   r   r   r%   r%   /   sC    ""&& #').*/&+%*"

 
Ir   r%   c                   `    e Zd ZdZddgZdZdZd fd	Z	 	 	 	 ddee	   de
eee   f   d	ee   d
efdZd Zd Z	 	 	 dde
eee   f   dee	   dedee   d
e
eee   f   f
dZddZddZed        Zdede
eee      eee      f   d
efdZde
eeef   eeeeef   f   d
eeef   fdZ xZS )Kosmos2Processora,  
    Constructs an KOSMOS-2 processor which wraps a KOSMOS-2 image processor and a KOSMOS-2 tokenizer into a single
    processor.

    [`Kosmos2Processor`] offers all the functionalities of [`CLIPImageProcessor`] and some functionalities of
    [`XLMRobertaTokenizerFast`]. See the docstring of [`~Kosmos2Processor.__call__`] and [`~Kosmos2Processor.decode`]
    for more information.

    Args:
        image_processor (`CLIPImageProcessor`):
            An instance of [`CLIPImageProcessor`]. The image processor is a required input.
        tokenizer (`XLMRobertaTokenizerFast`):
            An instance of ['XLMRobertaTokenizerFast`]. The tokenizer is a required input.
        num_patch_index_tokens (`int`, *optional*, defaults to 1024):
            The number of tokens that represent patch indices.
    image_processor	tokenizer)CLIPImageProcessorCLIPImageProcessorFastAutoTokenizerc                    d|_         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d	| _        d
| _	        d| _
        d| _        | j                  | j                  | j                  | j                  | j
                  | j                  | j                  | j                  | j                  | j                  | j                  g| _        || _        t        | j                        D cg c]   }dt        |      j!                  d       d" }}g }| j                  |z   D ]   }|j#                  t%        |ddd             " |j'                  |       t(        	| U  ||       y c c}w )NFz</doc>z<image>z</image>z</chunk>z</line>z<phrase>z	</phrase>z<object>z	</object></delimiter_of_multi_objects/>z<grounding><patch_index_   >T)lstriprstrip
normalized)r.   	eod_token	boi_token	eoi_token	eoc_token	eol_token	bop_token	eop_token	boo_token	eoo_token	dom_token	grd_token
tag_tokensnum_patch_index_tokensrangestrzfillappendr   
add_tokenssuper__init__)
selfr4   r5   rM   kwargsxpatch_index_tokenstokens_to_addtoken	__class__s
            r   rT   zKosmos2Processor.__init__Z   sI   */	'!"##"#$#$9& NNNNNNNNNNNNNNNNNNNNNN
 '=#JOPTPkPkJlmJlQc!fll1o->a@Jlm__'99E  E$uY^!_` :]+)4 ns   4%E1imagestextrV   returnc           
      <
   ||t        d       | j                  t        fd| j                  j                  i|}|d   j                  dd      }|d   j                  dd      }|d   j                  dd      }	|d	   j                  d
d      }
|d	   d   }|d	   d   }|d	   j                  dd      }t               }|' | j                  |fi |d   }|j                  |       || j                  ||||      }|rd|
sbt        |t              r| j                  j                   | }n7t        |t              r'|D cg c]  }| j                  j                   |  }}|d	   d   xr |
|d	   d<   ||nd|d	   d<   ||nd|d	   d<    | j                  dd|i|d	   }|j                  |       ||d	   d<   ||d	   d<   ||d	   d<   | ||	| j                  j                  dz   }	|}t!        |      dz   }t        t#        |	|	|z               }dgdg|z  z   dgz   }g }g }|d   }t        |t              r|g}|d   g|d<   |D ]p  }|d| |z   |||z   d z   }|j%                  |       t'        j&                  |      }|rdg|z   }|dgt)        |      t)        |      z
  z  z  }|j%                  |       r t        |t              rt+        t-        j.                        D cg c]  \  }}|t)        |      f c}}d       }|d   \  }}|d   \  }}|d	   d   xr |
|d	   d<   d|d	   d<    | j                  dd||   gi|d	   }t)        |j.                  d         } || k7  r5| j                  j0                  dk(  r|D cg c]+  }|| j                  j2                  g| t)        |      z
  z  z   - }}|D cg c]  }|dg| t)        |      z
  z  z    }}|d   D cg c]  }|dg| t)        |      z
  z  z    c}|d<   n| j                  j0                  dk(  r|D cg c]+  }| j                  j2                  g| t)        |      z
  z  |z   - }}|D cg c]  }dg| t)        |      z
  z  |z    }}|d   D cg c]  }dg| t)        |      z
  z  |z    c}|d<   t        |t              r||d   }|d   d   |d<   |d   }|j                  t5        ||d   |d|             |S c c}w c c}}w c c}w c c}w c c}w c c}w c c}w c c}w )a	  
        This method uses [`CLIPImageProcessor.__call__`] method to prepare image(s) for the model, and
        [`XLMRobertaTokenizerFast.__call__`] to prepare text for the model.

        Please refer to the docstring of the above two methods for more information.

        The rest of this documentation shows the arguments specific to `Kosmos2Processor`.

        Args:
            bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional* defaults to 64):
                The number of (consecutive) places that are used to mark the placeholders to store image information.
                This should be the same as `latent_query_num` in the instance of `Kosmos2Config` you are using.
            first_image_token_id (`int`, *optional*):
                The token id that will be used for the first place of the subsequence that is reserved to store image
                information. If unset, will default to `self.tokenizer.unk_token_id + 1`.
            add_eos_token (`bool`, defaults to `False`):
                Whether or not to include `EOS` token id in the encoding when `add_special_tokens=True`.
        Nz*You have to specify either images or text.tokenizer_init_kwargsr'   r   r   r0   r   r&   r"   Fr(   r)   return_tensors)r   r]      r   	input_idsattention_maskc                     | d   S Nr   )rW   s    r   <lambda>z+Kosmos2Processor.__call__.<locals>.<lambda>   s    defhdir   )keyrg   rightleft)rc   rd   image_embeds_position_mask)datatensor_typer   )
ValueError_merge_kwargsr%   r5   init_kwargspop
setdefaultr   r4   updatepreprocess_examples
isinstancerO   	bos_tokenr   unk_token_idr   rN   rQ   copylensorted	enumeraterc   padding_sidepad_token_idr   )!rU   r\   r]   audiovideosrV   output_kwargsr   r   r   r"   r(   r)   ra   encodingimage_encodingstext_encodingwith_bosstart_indeximage_token_idsbase_image_embeds_position_maskrc   rl   all_input_idstext_idsmaskidxrW   sorted_length_min_len_not_paddedmax_len_paddeds!                                    r   __call__zKosmos2Processor.__call__   so   8 >dlIJJ***"
"&.."<"<
 
 /33HdC(9==>PRTU,_=AABXZ^_%m488%P*=9:NO.y9&}5@@AQSWX>1T11&[M/<Z[NOON+++D&&Sc+dD!-dC("nn667v>Dd+FJKdt~~778<dDKm,-ABT} -()=> BHgUZM-(3OU~^cgM-()9:*DNNUUm8TUMOOM*=Om$%9:29m$Y/9Gm$%56 2#+'+~~'B'BQ'F$ *H h-!+K #5)=?SVf?f#ghO/0cQC:J4J.JaS.P+ I)+&$[1M$$!..67G.H-I)*)#L[1OCh{]mOmOoFpp  *yy!@A3:Ds8}s4y899*11$7 * $% &1:=;R;R1ST1SvsAc3q6]1STZi! )6a(8%%&r*Q!-01EFX= m,-AB BFm,-=> . `T#YK `=Q^C_ `!$]%<%<Q%?!@%7~~22g=lu$vlughQ$..*E*E)F.[^_`[aJa)b%blu	$vIc6IcAA~A'> ??Ic 3 6 JRRbIc6IcAA~A'> ??Ic6!12 44>lu$vlughdnn&A&A%BnWZ[\W]F]%^ab%blu	$vIc6IcAQC>CF#:;a?Ic 3 6 JRRbIc6IcAQC>CF#:;a?Ic6!12
 $$)?%aL	-56F-G-J)*-G-J* OO%.*23C*D6P
 !/	 K Lj U %w66 %w66s0   !S5/S:
0T T,T
,0T"TTc                 @   |yt        |t              st        d      |D ]{  }|t        |t              s|g}|D ]^  }t        |t              rBt	        |      dk(  rt        d |D              r4t	        |      dk(  rt        d |D              rUt        d       } y)a  
        Check `bboxes` for a single text example. It could be
            - `None`: no bounding box associated to a text.
            - A list with each element being the bounding boxes associated to one `<phrase> ... </phrase>` pair found
              in a text. This could be:
                  - `None`: no bounding box associated to a `<phrase> ... </phrase>` pair.
                  - A tuple of 2 integers: A single bounding box specified by patch indices.
                  - A tuple of 4 float point number: A single bounding box specified by (normalized) coordinates.
                  - A list containing the above 2 tuple types: Multiple bounding boxes for a
                   `<phrase> ... </phrase>` pair.
        Nz@`bboxes` (for a single text example) should be `None` or a list.   c              3   <   K   | ]  }t        |t                y wN)rv   r   .0rW   s     r   	<genexpr>zAKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>B  s     .S7az!S/A7   r<   c              3   <   K   | ]  }t        |t                y wr   )rv   r   r   s     r   r   zAKosmos2Processor._check_bboxes_for_single_text.<locals>.<genexpr>C  s     1XPW1*Q2FPWr   a'  Each element in `bboxes` (for a single text example) should be either `None`, a tuple containing 2 integers or 4 float point numbers, or a list containing such tuples. Also make sure the arguments `texts` and `bboxes` passed to `preprocess_text` are both in batches or both for a single example.)rv   r   ro   tuplerz   all)rU   r   bboxelements       r   _check_bboxes_for_single_textz.Kosmos2Processor._check_bboxes_for_single_text)  s     >FD)_`` D|d+v!'51\Q&3.S7.S+SG)c1XPW1X.X$@    r   c                 \    |j                         }|| d| }| j                  ||      }|S )N )strip_insert_patch_index_tokens)rU   r]   imager   img_info_tokenss        r   _preprocess_single_examplez+Kosmos2Processor._preprocess_single_exampleL  s=    zz|%&av.D ..tV<r   textsr   r   c                     | j                   g|z  }dj                  | j                   g|z   | j                  gz         }d}t        |t              rd}|g}|dgt        |      z  }nt        |t              s|g}t        |      t        |      k7  r$t        dt        |       dt        |       d      |s| j                  |       |g}nE|4t        |t              st        d      |D ]  }| j                  |        ndgt        |      z  }t        |      t        |      k7  r$t        d	t        |       dt        |       d      t        |||      D 	
cg c]  \  }	}
}| j                  |	|
||       }}
}	}|s|d
   }|S c c}}
}	w )a-  Add image and bounding box information to `texts` as image and patch index tokens.

        Args:
            texts (`Union[TextInput, list[TextInput]]`): The texts to be processed.
            images (`ImageInput`, *optional*): The images associated to `texts`.
            bboxes (`Union[list[tuple[int]], list[tuple[float]], list[list[tuple[int]]], list[list[tuple[float]]]]`, *optional*):
                The bounding bboxes associated to `texts`.
            num_image_tokens (`int`, *optional*, defaults to 64):
                The number of image tokens (used as latent queries). This should corresponds to the `latent_query_num`
                attribute in `Kosmos2Config`.

        Returns:
            `Union[TextInput, list[TextInput]]`: The processed texts with image and patch index tokens.
        r   TFNzGThe number of examples in `texts` and `images` should be the same. Got  v.s. 	 instead.zS`bboxes` should be `None` or a list (as a batch) when `texts` is passed as a batch.zGThe number of examples in `texts` and `bboxes` should be the same. Got r   )rB   joinrC   rv   rO   rz   r   ro   r   zipr   )rU   r   r\   r   r   
img_tokensr   batchedrW   r]   r   r   results                r   ru   z$Kosmos2Processor.preprocess_examplesV  s   , nn%(88
((DNN#3j#@DNNCS#ST eS!GGE>Vc%j(FFD)XFu:V$YZ]^cZdYeeklopvlwkx  yB  C  ..v6XFfd+ !vww2215  Vc%j(Fv;#e*$YZ]^cZdYeeklopvlwkx  yB  C  &)%?
%?!eT ++D%O%? 	 

 AYF
s   F	c                 \    |j                  | j                        d   }|rt        |      S |S rf   )splitrC   +clean_text_and_extract_entities_with_bboxes)rU   r]   cleanup_and_extractcaptions       r   post_process_generationz(Kosmos2Processor.post_process_generation  s,    **T^^,R0>wGGr   c                 x     | j                   |fd|i|}|D cg c]  }| j                  |d       c}S c c}w )a  
        Post-process the output of the model to decode the text.

        Args:
            generated_outputs (`torch.Tensor` or `np.ndarray`):
                The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
                or `(sequence_length,)`.
            skip_special_tokens (`bool`, *optional*, defaults to `True`):
                Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
            **kwargs:
                Additional arguments to be passed to the tokenizer's `batch_decode method`.

        Returns:
            `list[str]`: The decoded text.
        skip_special_tokensF)r   )batch_decoder   )rU   generated_outputsr   rV   generated_textsr]   s         r   post_process_image_text_to_textz0Kosmos2Processor.post_process_image_text_to_text  sN      ,$++,=qSfqjpqZijZiRV,,Tu,MZijjjs   7c                 l    | j                   j                  }| j                  j                  }||z   dgz   S )Nrl   )r5   model_input_namesr4   )rU   tokenizer_input_namesimage_processor_input_namess      r   r   z"Kosmos2Processor.model_input_names  s9     $ @ @&*&:&:&L&L#$'BBFbEcccr   c                    |t        |      dk(  r|S t        t        j                  d|            }t        |      t        |      k7  r$t	        dt        |       dt        |       d      d}g }t        ||      D ]  \  }}|j                         \  }}	|j                  |||	        |	}|2t        |t              r|g}g }
t        d |D              st	        d      |D ],  }| j                  |      \  }}|
j                  | d	|        . t        |
      dk(  rd
j                  |
      }|j                  d| d        |t        |      k  r|j                  ||d         dj                  |      }|S )Nr   z<phrase>.+?</phrase>)stringzuThe number of elements in `bboxes` should be the same as the number of `<phrase> ... </phrase>` pairs in `text`. Got r   r   c              3   $   K   | ]  }|d u 
 y wr   r   )r   boxs     r   r   z>Kosmos2Processor._insert_patch_index_tokens.<locals>.<genexpr>  s     7$3s$$s   zTThe multiple bounding boxes for a single phrase should not contain any `None` value.r   z  </delimiter_of_multi_objects/> z	<object> z
 </object> )rz   r   refinditerro   r   spanrQ   rv   r   r   #_convert_bbox_to_patch_index_tokensr   )rU   r]   r   matched_phrasescurr_posbuffermatchedr   r   endpatch_index_stringsr   patch_index_1patch_index_2position_strs                  r   r   z+Kosmos2Processor._insert_patch_index_tokens  s   >S[A-Kr{{+B4PQ3v;. H  IL  M\  I]  H^  ^d  eh  io  ep  dq  qz  {   &9MGT\\^FAsMM$x,-H|$&v"$7$77 j  /3/W/WX[/\,}#**m_Am_+MN  &'1,=BBCVWLMMIl^:>?/ :2 c$iMM$xy/*wwvr   r   c                    t        |      dk(  r|\  }}n7t        t        j                  | j                              }t        ||      \  }}dt        |      j                  d       d}dt        |      j                  d       d}||fS )Nr   r;   r<   r=   )rz   r   mathsqrtrM   coordinate_to_patch_indexrO   rP   )rU   r   idx_1idx_2num_patches_per_sidetoken_1token_2s          r   r   z4Kosmos2Processor._convert_bbox_to_patch_index_tokens  s     t9>LE5 $'tyy1L1L'M#N 4T;OPLE5!#e*"2"21"5!6a8!#e*"2"21"5!6a8r   )i   )NNNN)NNr0   )T) r   r   r   __doc__
attributesimage_processor_classtokenizer_classrT   r   r   r   r   r   r   r%   r   r   r   r   	BboxInputr   rO   ru   r   r   propertyr   r   r   r   r   __classcell__)r[   s   @r   r3   r3   D   s   " $[1JL%O+5^ (,26`$` ItI./` /0` 
`D!F (, *,@YY/0@ $@ 	@
 #3-@ 
sDI~	@Dk& d d
+s +E$uSzBRTXY^_dYeTfBf<g +lo +Z %S/5ue1K+LLM 	sCx r   r3   r   r   r^   c                 .   | \  }}}}||kD  r||kD  st        d      t        j                  ||z        }t        j                  ||z        }t        j                  ||z  dz
        }t        j                  ||z  dz
        }	||z  |z   }
|	|z  |z   }|
|fS )a  Convert a bounding box to a pair of patch indices.

    Args:
        bbox (`tuple[float, float, float, float]`):
            The 4 coordinates of the bounding box, with the format being (x1, y1, x2, y2) specifying the upper-left and
            lower-right corners of the box. It should have x2 > x1 and y2 > y1.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `tuple[int, int]`: A pair of patch indices representing the upper-left patch and lower-right patch.
    zTThe coordinates in `bbox` should be `(x1, y1, x2, y2)` with `x2 > x1` and `y2 > y1`.rb   )ro   r   floorceil)r   r   x1y1x2y2ul_xul_ylr_xlr_yul_idxlr_idxs               r   r   r     s     RRGRopp::b//0D::b//0D99R..23D99R..23D((4/F((4/F6>r   r   r   c                 "   d|z  }| |z  }| |z  }||z  }||z  }| |k(  r||z  }||z  }	||z  |z   }
||z  |z   }nQ||k(  s||k(  r||z  }||z  }	||z  |z   }
||z  |z   }n,||z  |dz  z   }||z  |dz  z   }	||z  |dz  z   }
||z  |dz  z   }||	|
|fS )a  
    Given a grid of length `num_patches_per_side` and the indices of the upper-left and lower-right corners of a
    bounding box, returns the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).

    Args:
        ul_idx (`int`): the index of the grid cell that corresponds to the upper-left corner of the bounding box.
        lr_idx (`int`): the index of the grid cell that corresponds to the lower-right corner of the bounding box.
        num_patches_per_side (`int`): the number of patches along each side.

    Returns:
        `tuple[float]`: the normalized coordinates of the bounding box, in the form (x1, y1, x2, y2).
    g      ?r   r   )r   r   r   	cell_sizer   r   r   r   r   r   r   r   s               r   patch_index_to_coordinater     s    **I ((D))D((D))D III	)I	)	III	)I	)I	A-I	A-I	A-I	A-r2r>r   c           
      $   d}t        j                  ||       }g }|D ]o  }|j                  d      }|j                         \  }}}|s*d}|j                  d      d   |j                  d      d   f}|j	                  d      }	g }
|	D ]  }t        j
                  d|      }t        j
                  d|dd       }|s5|s8|rD|
j                  t        |j                  d            t        |j                  d            f       ~|
j                  t        |j                  d            t        |j                  d            f        |r|j                  |||
f       E|
D ]&  }d|d    d	|d    d
}|j                  |||gf       ( r |S )a  Extract entities contained in `text`. The bounding bboxes is given in the form of patch indices.

    This function is only intended to be used within `clean_text_and_extract_entities_with_bboxes` where further
    processing happens, including converting to normalized coordinates and whitespace character cleaning up.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> entities = extract_entities_with_patch_indices(text)
    >>> entities
    [(' a snowman', (31, 41), [(44, 863)]), (' a fire', (130, 137), [(5, 911)])]
    ```z(?:(<phrase>([^<]+)</phrase>))?<object>((?:<patch_index_\d+><patch_index_\d+></delimiter_of_multi_objects/>)*<patch_index_\d+><patch_index_\d+>)</object>r   Nr   r:   z<patch_index_(\d+)>rb   r;   z><patch_index_r=   )	r   r   r   groupsr   searchrQ   r   group)r]   patternmatchesentities_with_patch_indicesmatchr   
phrase_tagphrasematch_contentpatch_index_pairsentity_bboxespairrW   yr   entitys                   r   #extract_entities_with_patch_indicesr  B  s    kG kk'4(G #%zz!},1LLN)
FMFJJqM!$ejjmA&67D *//0PQ%D		0$7A		0$qr(;AQ!((#aggaj/3qwwqz?)KL!((#aggaj/3qwwqz?)KL & '..m/LM%(a	QyJ+22FD4&3IJ &7 @ '&r   c           	          | \  }\  }}t        t        j                  dd|d|             }t        t        j                  dd|d|             }|||ff}|S )zfAdjust the positions of the entities in `text` to be relative to the text with special fields removed.<.*?>r   N)rz   r   sub)r  r]   entity_namestartr   adjusted_startadjusted_endadjusted_entitys           r   adjust_entity_positionsr  |  s_     &K%T&5\:;Nrvvgr4:67L"^\$BCOr   c                    | j                         }t        |       t        | j                               z
  }g }|D ]  \  }\  }}}t        |      t        |j                               z
  }	t        |      t        |j                               z
  }
||z
  |	z   }||z
  |
z
  }|j                         }|j	                  |||f|f        ||fS )z9Remove the spaces around the text and the entities in it.)r   rz   r>   r?   rQ   )r]   entitiesnew_textleading_spacesnew_entitiesr  r  r   r   entity_name_leading_spacesentity_name_trailing_spacess              r   _cleanup_spacesr    s    zz|HYT[[]!33NL-5)\eS6%(%5K<N<N<P8Q%Q"&)+&6[=O=O=Q9R&R#&)CCN"%@@!'')[5#,?@ .6 \!!r   c           	         t        j                  dd|       }t        |       }g }|D ]M  }|dd |d   }}t        ||       }|D 	cg c]  }	t	        |	d   |	d   |       }
}	|j                  ||
fz          O t        ||      S c c}	w )a  Remove the tag tokens from `text`, extract entities in it with some cleaning up of white characters.

    Examples:

    ```python
    >>> text = "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>."
    >>> clean_text, entities = clean_text_and_extract_entities_with_bboxes(text)
    >>> clean_text
    'An image of a snowman warming himself by a fire.'

    >>> entities
    [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
    ```r  r   r   r   rb   )r   r  r  r  r   rQ   r  )r]   r   processed_textr   r  itemr  r   r  r   bboxes_in_coordss              r   r   r     s     VVGR.N"Ed"KH+aDG1&$?jpqjpbf5d1gtAwH\]jpq+;*==> , >844	 rs   B)    )&r   ry   r   r   typingr   r   image_processing_utilsr   image_utilsr   processing_utilsr   r	   r
   r   r   tokenization_utilsr   tokenization_utils_baser   r   r   r   r   r   r   r   r!   r%   r3   r   r   r  r  r  r   __all__r   r   r   <module>r!     s7   $   	 " 2 % b b , ? sCxueUE)	*+eCHo	eE5%'(	)*,	(,e ("
% "-U *o ~ o dE%u*D$E ]` ejknpskset >(c (3 (c (Z7't"*5: 
r   