
from typing import Any, Union, overload

from ..generation import GenerationConfig
from ..utils import (
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import Pipeline, build_pipeline_init_args


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
class ImageToTextPipeline(Pipeline):
    """
    Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```
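
    Generation parameters such as `max_new_tokens` can also be overridden per call (illustrative; the exact
    caption depends on the checkpoint):

    ```python
    >>> captioner(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", max_new_tokens=10
    ... )  # doctest: +SKIP
    ```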

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial).

    This image to text pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    "image-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    """

    _pipeline_calls_generate = True
    _load_processor = False
    _load_image_processor = True
    _load_feature_extractor = False
    _load_tokenizer = True

    # Make sure the class docstring above stays in sync when this default is changed.
    _default_generation_config = GenerationConfig(
        max_new_tokens=256,
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        self.check_model_type(
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
        )

    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
        forward_params = {}
        preprocess_params = {}

        if prompt is not None:
            preprocess_params["prompt"] = prompt
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if max_new_tokens is not None:
            forward_params["max_new_tokens"] = max_new_tokens
        if generate_kwargs is not None:
            if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
                raise ValueError(
                    "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
                    " only 1 version"
                )
            forward_params.update(generate_kwargs)

        if self.assistant_model is not None:
            forward_params["assistant_model"] = self.assistant_model
        if self.assistant_tokenizer is not None:
            forward_params["tokenizer"] = self.tokenizer
            forward_params["assistant_tokenizer"] = self.assistant_tokenizer

        return preprocess_params, forward_params, {}

    @overload
    def __call__(self, inputs: Union[str, "Image.Image"], **kwargs: Any) -> list[dict[str, str]]: ...

    @overload
    def __call__(
        self, inputs: Union[list[str], list["Image.Image"]], **kwargs: Any
    ) -> list[list[dict[str, str]]]: ...

    def __call__(self, inputs: Union[str, list[str], "Image.Image", list["Image.Image"]] = None, **kwargs):
        """
        Assign labels to the image(s) passed as inputs.

        Args:
            inputs (`str`, `list[str]`, `PIL.Image` or `list[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The maximum number of tokens to generate. By default, the `generate` default is used.

            generate_kwargs (`dict`, *optional*):
                Arguments forwarded directly to `generate`, allowing full control over that function.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of lists of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
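
        Example (illustrative; reuses the checkpoint from the class docstring, and the exact captions depend
        on the model):

        ```python
        >>> from transformers import pipeline

        >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
        >>> url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
        >>> captioner(url)  # a single image returns a list of dicts
        [{'generated_text': 'two birds are standing next to each other '}]
        >>> captioner([url, url])  # a batch returns one list of dicts per image
        [[{'generated_text': 'two birds are standing next to each other '}], [{'generated_text': 'two birds are standing next to each other '}]]
        ```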
        """
        # After the deprecation of `images` is complete, remove the default `None` value for `inputs`.
        if "images" in kwargs:
            inputs = kwargs.pop("images")
        if inputs is None:
            raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!")
        return super().__call__(inputs, **kwargs)

    def preprocess(self, image, prompt=None, timeout=None):
        image = load_image(image, timeout=timeout)

        if prompt is not None:
            logger.warning_once(
                "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48"
                " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead",
            )
            if not isinstance(prompt, str):
                raise ValueError(
                    f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                    "Note also that one single text can be provided for conditional image to text generation."
                )

            model_type = self.model.config.model_type

            if model_type == "git":
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)
                input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                input_ids = [self.tokenizer.cls_token_id] + input_ids
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                model_inputs.update({"input_ids": input_ids})

            elif model_type == "pix2struct":
                model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)

            elif model_type != "vision-encoder-decoder":
                # vision-encoder-decoder does not support conditional generation
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)
                text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                model_inputs.update(text_inputs)

            else:
                raise ValueError(f"Model type {model_type} does not support conditional text generation")

        else:
            model_inputs = self.image_processor(images=image, return_tensors=self.framework)
            if self.framework == "pt":
                model_inputs = model_inputs.to(self.torch_dtype)

        if self.model.config.model_type == "git" and prompt is None:
            model_inputs["input_ids"] = None

        return model_inputs

    def _forward(self, model_inputs, **generate_kwargs):
        # GIT sets `input_ids` to `None` in `preprocess` when no prompt is given; batching turns that into a
        # list of `None`s, which `generate` cannot handle, so normalize it back to `None` here.
        if (
            "input_ids" in model_inputs
            and isinstance(model_inputs["input_ids"], list)
            and all(x is None for x in model_inputs["input_ids"])
        ):
            model_inputs["input_ids"] = None

        # A user-defined `generation_config` passed to the pipeline call takes precedence.
        if "generation_config" not in generate_kwargs:
            generate_kwargs["generation_config"] = self.generation_config

        # FIXME: We need to pop here due to a difference in how the PyTorch and TensorFlow generation code
        # parse their inputs, which disagree on whether the main input may be passed positionally.
        inputs = model_inputs.pop(self.model.main_input_name)
        model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
        return model_outputs

    def postprocess(self, model_outputs):
        records = []
        for output_ids in model_outputs:
            record = {
                "generated_text": self.tokenizer.decode(
                    output_ids,
                    skip_special_tokens=True,
                )
            }
            records.append(record)
        return records
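

# Illustrative sketch: the deprecated `prompt` argument exercises the conditional-generation branches in
# `preprocess` above, which only support a few model types ("git", "pix2struct", and other
# non-"vision-encoder-decoder" models with a tokenizer). The checkpoint name below is an assumption for
# the sketch; prefer the `image-text-to-text` pipeline for prompted generation.
#
#   from transformers import pipeline
#
#   captioner = pipeline(model="microsoft/git-base-coco")  # hypothetical choice of GIT checkpoint
#   captioner(
#       "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
#       prompt="a photography of",
#   )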