
from typing import Any, Union, overload

from ..generation import GenerationConfig
from ..utils import (
    add_end_docstrings,
    is_tf_available,
    is_torch_available,
    is_vision_available,
    logging,
    requires_backends,
)
from .base import Pipeline, build_pipeline_init_args


if is_vision_available():
    from PIL import Image

    from ..image_utils import load_image

if is_tf_available():
    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES

logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
class ImageToTextPipeline(Pipeline):
    """
    Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```
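
    Generation parameters such as `max_new_tokens` can also be overridden per call (illustrative; the exact
    caption depends on the checkpoint):

    ```python
    >>> captioner(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", max_new_tokens=10
    ... )  # doctest: +SKIP
    ```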

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial).

    This image to text pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    "image-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    """

    _pipeline_calls_generate = True
    _load_processor = False
    _load_image_processor = True
    _load_feature_extractor = False
    _load_tokenizer = True

    # Make sure the class docstring above stays in sync when this default is changed.
    _default_generation_config = GenerationConfig(
        max_new_tokens=256,
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        requires_backends(self, "vision")
        self.check_model_type(
            TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
        )

    def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
        forward_params = {}
        preprocess_params = {}

        if prompt is not None:
            preprocess_params["prompt"] = prompt
        if timeout is not None:
            preprocess_params["timeout"] = timeout

        if max_new_tokens is not None:
            forward_params["max_new_tokens"] = max_new_tokens
        if generate_kwargs is not None:
            if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
                raise ValueError(
                    "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
                    " only 1 version"
                )
            forward_params.update(generate_kwargs)

        if self.assistant_model is not None:
            forward_params["assistant_model"] = self.assistant_model
        if self.assistant_tokenizer is not None:
            forward_params["tokenizer"] = self.tokenizer
            forward_params["assistant_tokenizer"] = self.assistant_tokenizer

        return preprocess_params, forward_params, {}

    @overload
    def __call__(self, inputs: Union[str, "Image.Image"], **kwargs: Any) -> list[dict[str, str]]: ...

    @overload
    def __call__(
        self, inputs: Union[list[str], list["Image.Image"]], **kwargs: Any
    ) -> list[list[dict[str, str]]]: ...

    def __call__(self, inputs: Union[str, list[str], "Image.Image", list["Image.Image"]] = None, **kwargs):
        """
        Assign labels to the image(s) passed as inputs.

        Args:
            inputs (`str`, `list[str]`, `PIL.Image` or `list[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The maximum number of tokens to generate. By default, the `generate` default is used.

            generate_kwargs (`dict`, *optional*):
                Arguments forwarded directly to `generate`, allowing full control over that function.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of lists of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
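
        Example (illustrative; reuses the checkpoint from the class docstring, and the exact captions depend
        on the model):

        ```python
        >>> from transformers import pipeline

        >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
        >>> url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
        >>> captioner(url)  # a single image returns a list of dicts
        [{'generated_text': 'two birds are standing next to each other '}]
        >>> captioner([url, url])  # a batch returns one list of dicts per image
        [[{'generated_text': 'two birds are standing next to each other '}], [{'generated_text': 'two birds are standing next to each other '}]]
        ```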
        """
        # After the deprecation of `images` is complete, remove the default `None` value for `inputs`.
        if "images" in kwargs:
            inputs = kwargs.pop("images")
        if inputs is None:
            raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!")
        return super().__call__(inputs, **kwargs)

    def preprocess(self, image, prompt=None, timeout=None):
        image = load_image(image, timeout=timeout)

        if prompt is not None:
            logger.warning_once(
                "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48"
                " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead",
            )
            if not isinstance(prompt, str):
                raise ValueError(
                    f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
                    "Note also that one single text can be provided for conditional image to text generation."
                )

            model_type = self.model.config.model_type

            if model_type == "git":
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)
                input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
                input_ids = [self.tokenizer.cls_token_id] + input_ids
                input_ids = torch.tensor(input_ids).unsqueeze(0)
                model_inputs.update({"input_ids": input_ids})

            elif model_type == "pix2struct":
                model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)

            elif model_type != "vision-encoder-decoder":
                # vision-encoder-decoder does not support conditional generation
                model_inputs = self.image_processor(images=image, return_tensors=self.framework)
                if self.framework == "pt":
                    model_inputs = model_inputs.to(self.torch_dtype)
                text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
                model_inputs.update(text_inputs)

            else:
                raise ValueError(f"Model type {model_type} does not support conditional text generation")

        else:
            model_inputs = self.image_processor(images=image, return_tensors=self.framework)
            if self.framework == "pt":
                model_inputs = model_inputs.to(self.torch_dtype)

        if self.model.config.model_type == "git" and prompt is None:
            model_inputs["input_ids"] = None

        return model_inputs

    def _forward(self, model_inputs, **generate_kwargs):
        # GIT sets `input_ids` to `None` in `preprocess` when no prompt is given; batching turns that into a
        # list of `None`s, which `generate` cannot handle, so normalize it back to `None` here.
        if (
            "input_ids" in model_inputs
            and isinstance(model_inputs["input_ids"], list)
            and all(x is None for x in model_inputs["input_ids"])
        ):
            model_inputs["input_ids"] = None

        # A user-defined `generation_config` passed to the pipeline call takes precedence.
        if "generation_config" not in generate_kwargs:
            generate_kwargs["generation_config"] = self.generation_config

        # FIXME: We need to pop here due to a difference in how the PyTorch and TensorFlow generation code
        # parse their inputs, which disagree on whether the main input may be passed positionally.
        inputs = model_inputs.pop(self.model.main_input_name)
        model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
        return model_outputs

    def postprocess(self, model_outputs):
        records = []
        for output_ids in model_outputs:
            record = {
                "generated_text": self.tokenizer.decode(
                    output_ids,
                    skip_special_tokens=True,
                )
            }
            records.append(record)
        return records
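

# Illustrative sketch: the deprecated `prompt` argument exercises the conditional-generation branches in
# `preprocess` above, which only support a few model types ("git", "pix2struct", and other
# non-"vision-encoder-decoder" models with a tokenizer). The checkpoint name below is an assumption for
# the sketch; prefer the `image-text-to-text` pipeline for prompted generation.
#
#   from transformers import pipeline
#
#   captioner = pipeline(model="microsoft/git-base-coco")  # hypothetical choice of GIT checkpoint
#   captioner(
#       "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
#       prompt="a photography of",
#   )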