o
    
shI                     @   s  d dl Z d dlZd dlmZmZ ddlmZ ddlmZ ddl	m
Z
mZmZmZ ddlmZmZ e r=d dlZdd	lmZ e rFdd
lmZ eeZG dd de jZe
eddG dd deZe
eddG dd deZe
eddG dd deZdS )    N)AnyUnion   )GenerationConfig)TruncationStrategy)add_end_docstringsis_tf_availableis_torch_availablelogging   )Pipelinebuild_pipeline_init_args)/TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES),MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMESc                   @   s   e Zd ZdZdZdS )
ReturnTyper   r   N)__name__
__module____qualname__TENSORSTEXT r   r   i/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/transformers/pipelines/text2text_generation.pyr      s    r   T)has_tokenizerc                	       s   e Zd ZdZdZdZdZdZdZe	dddZ
dZ fdd	Z	
	
	
	
	
	
dddZdededefddZdd Zdeeee f dedeeeef  f fddZejfddZdd ZejdfddZ  ZS ) Text2TextGenerationPipelinea9  
    Pipeline for text to text generation using seq2seq models.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256
    - num_beams: 4

    Example:

    ```python
    >>> from transformers import pipeline

    >>> generator = pipeline(model="mrm8488/t5-base-finetuned-question-generation-ap")
    >>> generator(
    ...     "answer: Manuel context: Manuel has created RuPERTa-base with the support of HF-Transformers and Google"
    ... )
    [{'generated_text': 'question: Who created the RuPERTa-base?'}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text
    generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about
    text generation parameters in [Text generation strategies](../generation_strategies) and [Text
    generation](text_generation).

    This Text2TextGenerationPipeline pipeline can currently be loaded from [`pipeline`] using the following task
    identifier: `"text2text-generation"`.

    The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
    up-to-date list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=text2text-generation). For a list of available
    parameters, see the [following
    documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)

    Usage:

    ```python
    text2text_generator = pipeline("text2text-generation")
    text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")
    ```TF      )max_new_tokens	num_beams	generatedc                    s4   t  j|i | | | jdkrt d S t d S )Ntf)super__init__check_model_type	frameworkr   r   selfargskwargs	__class__r   r   r!   T   s   
z$Text2TextGenerationPipeline.__init__Nc                 K   s   i }|d ur
||d< |}	i }
|d ur|d u r|rt jnt j}|d ur&||
d< |d ur.||
d< |d urK| jj|dd}t|dkrEtd |d |d	< | jd urU| j|	d
< | j	d urd| j|	d< | j	|	d< ||	|
fS )N
truncationreturn_typeclean_up_tokenization_spacesF)add_special_tokensr   zStopping on a multiple token sequence is not yet supported on transformers. The first token of the stop sequence will be used as the stop sequence string in the interim.r   eos_token_idassistant_model	tokenizerassistant_tokenizer)
r   r   r   r0   encodelenwarningswarnr/   r1   )r%   return_tensorsreturn_textr+   r,   r*   stop_sequencegenerate_kwargspreprocess_paramsforward_paramspostprocess_paramsstop_sequence_idsr   r   r   _sanitize_parameters]   s0   






z0Text2TextGenerationPipeline._sanitize_parametersinput_length
min_length
max_lengthc                 C   s   dS )j
        Checks whether there might be something wrong with given input with regard to the model.
        Tr   r%   r?   r@   rA   r   r   r   check_inputs   s   z(Text2TextGenerationPipeline.check_inputsc                   s   | j d ur| j nd t|d tr*| jjd u rtd fdd|d D f}d}nt|d tr; |d  f}d}n
td|d  d	| j|||| jd
}d|v rW|d= |S )N r   zOPlease make sure that the tokenizer has a pad_token_id when using a batch inputc                    s   g | ]} | qS r   r   ).0argprefixr   r   
<listcomp>       zCText2TextGenerationPipeline._parse_and_tokenize.<locals>.<listcomp>TFz `args[0]`: zI have the wrong format. The should be either of type `str` or type `list`)paddingr*   r6   token_type_ids)	rI   
isinstancelistr0   pad_token_id
ValueErrorstr	TypeErrorr#   )r%   r*   r&   rL   inputsr   rH   r   _parse_and_tokenize   s    z/Text2TextGenerationPipeline._parse_and_tokenizer&   r'   returnc                    sZ   t  j|i |}t|d tr+tdd |d D r+tdd |D r+dd |D S |S )a  
        Generate the output text(s) using text(s) given as inputs.

        Args:
            args (`str` or `list[str]`):
                Input text for the encoder.
            return_tensors (`bool`, *optional*, defaults to `False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            return_text (`bool`, *optional*, defaults to `True`):
                Whether or not to include the decoded texts in the outputs.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the potential extra spaces in the text output.
            truncation (`TruncationStrategy`, *optional*, defaults to `TruncationStrategy.DO_NOT_TRUNCATE`):
                The truncation strategy for the tokenization within the pipeline. `TruncationStrategy.DO_NOT_TRUNCATE`
                (default) will never truncate, but it is sometimes desirable to truncate the input to fit the model's
                max_length instead of throwing an error down the line.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework [here](./text_generation)).

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:

            - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
            - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
              ids of the generated text.
        r   c                 s   s    | ]}t |tV  qd S )N)rN   rR   )rF   elr   r   r   	<genexpr>   s    z7Text2TextGenerationPipeline.__call__.<locals>.<genexpr>c                 s   s    | ]	}t |d kV  qdS )r   N)r3   rF   resr   r   r   rX      s    c                 S   s   g | ]}|d  qS )r   r   rY   r   r   r   rJ      rK   z8Text2TextGenerationPipeline.__call__.<locals>.<listcomp>)r    __call__rN   rO   all)r%   r&   r'   resultr(   r   r   r[      s   z$Text2TextGenerationPipeline.__call__c                 K   s   | j |fd|i|}|S )Nr*   )rU   )r%   rT   r*   r'   r   r   r   
preprocess   s   z&Text2TextGenerationPipeline.preprocessc                 K   s   | j dkr|d j\}}n| j dkrt|d  \}}| ||d| jj|d| jj d|vr9| j|d< | j	j
d
i ||}|jd }| j dkrc|j||| g|jdd  R  }d	|iS | j dkrzt|||| g|jdd  R }d	|iS )Npt	input_idsr   r@   rA   generation_configr   r   
output_idsr   )r#   shaper   numpyrD   getra   r@   rA   modelgeneratereshape)r%   model_inputsr9   in_br?   rb   out_br   r   r   _forward   s&   




"
$z$Text2TextGenerationPipeline._forwardc                 C   sh   g }|d d D ])}|t jkr| j d|i}n|t jkr,| j d| jj|d|di}|| q|S )Nrb   r   
_token_ids_textT)skip_special_tokensr,   )r   r   return_namer   r0   decodeappend)r%   model_outputsr+   r,   recordsrb   recordr   r   r   postprocess   s   

z'Text2TextGenerationPipeline.postprocess)NNNNNN)r   r   r   __doc___pipeline_calls_generate_load_processor_load_image_processor_load_feature_extractor_load_tokenizerr   _default_generation_configrp   r!   r>   intrD   rU   r   rR   rO   r   dictr[   r   DO_NOT_TRUNCATEr^   rl   r   r   rv   __classcell__r   r   r(   r   r      s4    )
*2&r   c                       s>   e Zd ZdZdZ fddZdedededefd	d
Z  Z	S )SummarizationPipelinea  
    Summarize news articles and other documents.

    This summarizing pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"summarization"`.

    The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is
    currently, '*bart-large-cnn*', '*google-t5/t5-small*', '*google-t5/t5-base*', '*google-t5/t5-large*', '*google-t5/t5-3b*', '*google-t5/t5-11b*'. See the up-to-date
    list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization). For a list
    of available parameters, see the [following
    documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256
    - num_beams: 4

    Usage:

    ```python
    # use bart in pytorch
    summarizer = pipeline("summarization")
    summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)

    # use t5 in tf
    summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf")
    summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
    ```summaryc                       t  j|i |S )a  
        Summarize the text(s) given as inputs.

        Args:
            documents (*str* or `list[str]`):
                One or several articles (or one list of articles) to summarize.
            return_text (`bool`, *optional*, defaults to `True`):
                Whether or not to include the decoded texts in the outputs
            return_tensors (`bool`, *optional*, defaults to `False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the potential extra spaces in the text output.
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework [here](./text_generation)).

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:

            - **summary_text** (`str`, present when `return_text=True`) -- The summary of the corresponding input.
            - **summary_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
              ids of the summary.
        r    r[   r$   r(   r   r   r[     s   zSummarizationPipeline.__call__r?   r@   rA   rV   c              	   C   sR   ||k rt d| d| d ||k r't d| d| d|d  d d	S d	S )
rB   zYour min_length=z' must be inferior than your max_length=.zYour max_length is set to z , but your input_length is only z. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=r   )NloggerwarningrC   r   r   r   rD   1  s   z"SummarizationPipeline.check_inputs)
r   r   r   rw   rp   r[   r~   boolrD   r   r   r   r(   r   r      s
    "r   c                       s`   e Zd ZdZdZdededefddZejddd	 fd
d
Z	d fdd	Z
 fddZ  ZS )TranslationPipelinea  
    Translates from one language to another.

    This translation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"translation_xx_to_yy"`.

    The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
    up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=translation).
    For a list of available parameters, see the [following
    documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256
    - num_beams: 4

    Usage:

    ```python
    en_fr_translator = pipeline("translation_en_to_fr")
    en_fr_translator("How old are you?")
    ```translationr?   r@   rA   c                 C   s(   |d| krt d| d| d dS )Ng?zYour input_length: z" is bigger than 0.9 * max_length: z`. You might consider increasing your max_length manually, e.g. translator('...', max_length=400)Tr   rC   r   r   r   rD   \  s
   z TranslationPipeline.check_inputsN)r*   src_langtgt_langc                   s:   t | jdd r| jj|| j|||dS t j|d|iS )N_build_translation_inputs)r6   r*   r   r   r*   )getattrr0   r   r#   r    rU   )r%   r*   r   r   r&   r(   r   r   r^   d  s
   zTranslationPipeline.preprocessc           	         s   t  jdi |\}}}|d ur||d< |d ur||d< |d u rD|d u rD|d| j}|d}|rDt|dkrD|d |d< |d |d< |||fS )	Nr   r   task_r   r      r   )r    r>   re   r   splitr3   )	r%   r   r   r'   r:   r;   r<   r   itemsr(   r   r   r>   l  s   

z(TranslationPipeline._sanitize_parametersc                    r   )a  
        Translate the text(s) given as inputs.

        Args:
            args (`str` or `list[str]`):
                Texts to be translated.
            return_tensors (`bool`, *optional*, defaults to `False`):
                Whether or not to include the tensors of predictions (as token indices) in the outputs.
            return_text (`bool`, *optional*, defaults to `True`):
                Whether or not to include the decoded texts in the outputs.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not to clean up the potential extra spaces in the text output.
            src_lang (`str`, *optional*):
                The language of the input. Might be required for multilingual models. Will not have any effect for
                single pair translation models
            tgt_lang (`str`, *optional*):
                The language of the desired output. Might be required for multilingual models. Will not have any effect
                for single pair translation models
            generate_kwargs:
                Additional keyword arguments to pass along to the generate method of the model (see the generate method
                corresponding to your framework [here](./text_generation)).

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:

            - **translation_text** (`str`, present when `return_text=True`) -- The translation.
            - **translation_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The
              token ids of the translation.
        r   r$   r(   r   r   r[   |  s   zTranslationPipeline.__call__)NN)r   r   r   rw   rp   r~   rD   r   r   r^   r>   r[   r   r   r   r(   r   r   @  s    r   )enumr4   typingr   r   
generationr   tokenization_utilsr   utilsr   r   r	   r
   baser   r   
tensorflowr   models.auto.modeling_tf_autor   models.auto.modeling_autor   
get_loggerr   r   Enumr   r   r   r   r   r   r   r   <module>   s*    
 ZJ