
    Ph4                    z   d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZ d dlmZ ddlmZmZmZmZmZ ddlmZmZmZmZ dd	lmZ dd
l m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z,m-Z-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3m4Z4m5Z5m6Z6m7Z7m8Z8m9Z9 ddl:m;Z;m<Z<m=Z=m>Z>m?Z? ddl@mAZA ddlBmCZCmDZDmEZEmFZFmGZGmHZHmIZImJZJmKZKmLZLmMZMmNZNmOZOmPZPmQZQmRZRmSZSmTZTmUZUmVZVmWZWmXZXmYZYmZZZm[Z[ ddl\m]Z]m^Z^m_Z_m`Z`maZambZbmcZc erddldmeZe ddlfmgZg ddlhmiZi  e/j                  ek      Zl e+       rd dlmmnZnmoZo g dZpe?j                  de?j                  de?j                  de?j                  de?j                  de?j                  de?j                  de?j                  de?j                  d i	Zze G d! d"e)             Z{e G d# d$e)             Z|e G d% d&e)             Z}e G d' d(e)             Z~e{Ze{Ze{Ze|Ze|Ze|Ze}Ze}Ze~Ze~Zeeef   Zeeef   Zeeef   Zeeef   Zeeef   Zee{e|f   Zee}e~f   Zeeef   Z G d) d*eA      Zd+ Zd-d,Zy).    N)	dataclass)TYPE_CHECKINGAnyCallableOptionalUnion)version)nn   )CacheDynamicCacheEncoderDecoderCacheQuantizedCacheStaticCache)check_python_requirementsget_cached_module_fileget_class_in_moduleresolve_trust_remote_code)is_deepspeed_zero3_enabled)is_fsdp_managed_module)create_masks_for_generate)isin_mps_friendly)ExtensionsTrie)ModelOutputTransformersKwargsis_accelerate_availableis_hqq_availableis_optimum_quanto_availableis_torchdynamo_exportinglogging   )	AssistantVocabTranslatorCacheAssistedCandidateGenerator-AssistedCandidateGeneratorDifferentTokenizersCandidateGeneratorEarlyExitCandidateGeneratorPromptLookupCandidateGenerator%UniversalSpeculativeDecodingGenerator_prepare_attention_mask_prepare_token_type_ids) ALL_STATIC_CACHE_IMPLEMENTATIONS'DEPRECATED_STATIC_CACHE_IMPLEMENTATIONSSTATIC_CACHE_IMPLEMENTATIONSGenerationConfigGenerationMode)ContinuousMixin)#EncoderNoRepeatNGramLogitsProcessor'EncoderRepetitionPenaltyLogitsProcessorEpsilonLogitsWarperEtaLogitsWarperExponentialDecayLengthPenaltyForcedBOSTokenLogitsProcessorForcedEOSTokenLogitsProcessorInfNanRemoveLogitsProcessorLogitNormalizationLogitsProcessorListMinLengthLogitsProcessor!MinNewTokensLengthLogitsProcessorMinPLogitsWarperNoBadWordsLogitsProcessorNoRepeatNGramLogitsProcessor PrefixConstrainedLogitsProcessor RepetitionPenaltyLogitsProcessorSequenceBiasLogitsProcessor$SuppressTokensAtBeginLogitsProcessorSuppressTokensLogitsProcessorTemperatureLogitsWarperTopKLogitsWarperTopPLogitsWarperTypicalLogitsWarper.UnbatchedClassifierFreeGuidanceLogitsProcessor)ConfidenceCriteriaEosTokenCriteriaMaxLengthCriteriaMaxTimeCriteriaStoppingCriteriaStoppingCriteriaListStopStringCriteria)PreTrainedModel)PreTrainedTokenizerBase)BaseStreamer)AlignDevicesHookadd_hook_to_module)past_key_valuescache_paramsstatememspast_buckets_states_sample_beam_search_assisted_decodingztransformers-community/dolaz)transformers-community/contrastive-searchz(transformers-community/group-beam-searchz.transformers-community/constrained-beam-searchc                      e Zd ZU dZej
                  ed<   dZee	ej                        ed<   dZee	ej                        ed<   dZee	e	ej                           ed<   dZee	e	ej                           ed<   dZee   ed<   y)	GenerateDecoderOnlyOutputa\  
    Outputs of decoder-only generation models, when using non-beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    	sequencesNscoreslogits
attentionshidden_statesrV   )__name__
__module____qualname____doc__torch
LongTensor__annotations__ra   r   tupleFloatTensorrb   rc   rd   rV   r        W/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/generation/utils.pyr_   r_      s    4 15FHU5,,-.515FHU5,,-.5<@JuU%6%6789@?CM8E%(9(9":;<C'+OXe_+ro   r_   c                      e Zd ZU dZej
                  ed<   dZee	ej                        ed<   dZee	ej                        ed<   dZee	ej                        ed<   dZee	ej                        ed<   dZee	e	ej                           ed<   dZee	e	ej                           ed	<   dZee	e	ej                           ed
<   dZee   ed<   y)GenerateEncoderDecoderOutputa  
    Outputs of encoder-decoder generation models, when using non-beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Processed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size, config.vocab_size)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, generated_length, hidden_size)`.
        past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    r`   Nra   rb   encoder_attentionsencoder_hidden_statesdecoder_attentionscross_attentionsdecoder_hidden_statesrV   )re   rf   rg   rh   ri   rj   rk   ra   r   rl   rm   rb   rs   rt   ru   rv   rw   rV   r   rn   ro   rp   rr   rr      s    !F 15FHU5,,-.515FHU5,,-.5=Au'8'8!9:A@D8E%*;*;$<=DDHuU->->'?!@AHBFhuU5+<+<%=>?FGK8E%0A0A*B$CDK'+OXe_+ro   rr   c                   X   e Zd ZU dZej
                  ed<   dZeej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeej
                     ed<   dZeeeej                           ed<   dZeeeej                           ed	<   dZee   ed
<   y)GenerateBeamDecoderOnlyOutputa
  
    Outputs of decoder-only generation models, when using beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams, num_heads, generated_length, sequence_length)`.
        hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
        past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    r`   Nsequences_scoresra   rb   beam_indicesrc   rd   rV   )re   rf   rg   rh   ri   rj   rk   rz   r   rm   ra   rl   rb   r{   rc   rd   rV   r   rn   ro   rp   ry   ry      s    @ 48hu001815FHU5,,-.515FHU5,,-.5/3L(5++,3<@JuU%6%6789@?CM8E%(9(9":;<C'+OXe_+ro   ry   c                      e Zd ZU dZej
                  ed<   dZeej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeej
                     ed<   dZeeej                        ed<   dZeeej                        ed	<   dZeeeej                           ed
<   dZeeeej                           ed<   dZeeeej                           ed<   dZee   ed<   y) GenerateBeamEncoderDecoderOutputa  
    Outputs of encoder-decoder generation models, when using beam methods.

    Args:
        sequences (`torch.LongTensor` of shape `(batch_size*num_return_sequences, sequence_length)`):
            The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or shorter
            if all batches finished early due to the `eos_token_id`.
        sequences_scores (`torch.FloatTensor` of shape `(batch_size*num_return_sequences)`, *optional*, returned when `output_scores=True`):
            Final beam scores of the generated `sequences`.
        scores (`tuple(torch.FloatTensor)` *optional*, returned when `output_scores=True`):
            Beam transition scores for each vocabulary token at each generation step. Beam transition scores consisting
            of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
            Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
            with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        logits (`tuple(torch.FloatTensor)` *optional*, returned when `output_logits=True`):
            Unprocessed prediction scores of the language modeling head (scores for each vocabulary token before SoftMax)
            at each generation step. Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for
            each generated token), with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
        beam_indices (`torch.LongTensor`, *optional*, returned when `output_scores=True`):
            Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
            `(batch_size*num_return_sequences, sequence_length)`.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer of the decoder) of shape `(batch_size, num_heads,
            sequence_length, sequence_length)`.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size*num_beams*num_return_sequences, sequence_length, hidden_size)`.
        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, num_heads, generated_length,
            sequence_length)`.
        cross_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size, num_heads, generated_length, sequence_length)`.
        decoder_hidden_states (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_hidden_states=True`):
            Tuple (one element for each generated token) of tuples (one element for each layer of the decoder) of
            `torch.FloatTensor` of shape `(batch_size*num_beams*num_return_sequences, generated_length, hidden_size)`.
        past_key_values (`Cache`, *optional*, returned when `use_cache=True`):
            Returns the model cache, used to speed up decoding. Different models have a different cache format, check
            the model's documentation. Usually, a [`~cache_utils.Cache`] instance.
    r`   Nrz   ra   rb   r{   rs   rt   ru   rv   rw   rV   )re   rf   rg   rh   ri   rj   rk   rz   r   rm   ra   rl   rb   r{   rs   rt   ru   rv   rw   rV   r   rn   ro   rp   r}   r}     s   (T 48hu001815FHU5,,-.515FHU5,,-.5/3L(5++,3=Au'8'8!9:A@D8E%*;*;$<=DDHuU->->'?!@AHBFhuU5+<+<%=>?FGK8E%0A0A*B$CDK'+OXe_+ro   r}   c            $          e Zd ZdZ	 	 dzdeeeej                  f      dee	   de
fdZdej                  deej                     d	eej                     deej                  ej                  f   fd
Zdej                  deej                     d	eej                     deej                  ej                  f   fdZ	 	 	 	 d{dej                  dee   deej                     deej                     d	eej                     f
dZ	 	 	 d|deej(                     deej(                     deeeej(                  f      deej(                  ee   eeej(                  f   f   fdZ	 	 	 d|deej(                     deej(                     deeeej(                  f      dej                  fdZdej(                  dedeeef   dej                  fdZdej(                  dee   dedeeef   fdZ	 d}dededeeej(                  f   dej(                  deej:                     deej                  eeej(                  f   f   fdZe	 	 	 d~dede	deej                     deej                  eeef   f   fd       Z 	 	 dd e!deeef   de	d!edeeef   f
d"Z"	 	 	 d|dedej                  dej(                  d#e#deeef   d$ed%   d&ed'   d(ed'   de$fd)Z%	 	 	 	 	 	 	 	 dded*ee   d+eej                     d,ee
eej(                  ge&e   f      d#ee#   dee   deeeef      d-eej(                     d.eej(                     de#fd/Z'	 d}ded0ee(   d1ed'   de(fd2Z)d3ee#e(f   d4ee#e(f   dee#e(f   fd5Z*	 	 dd6ej(                  d7eej(                     d8eej(                     d9e	dej(                  f
d:Z+d; Z,deeef   fd<Z-d= Z.d> Z/	 d}dee   d?ee	   d@edeeef   fdAZ0dB Z1dCededDedefdEZ2e3de	fdF       Z4dededGe5dedHede	fdIZ6de	fdJZ7	 	 dzdedKee	   deeej:                  ef      fdLZ8deeef   dede	fdMZ9	 d}dGe5de	dNee   dee   fdOZ:deeef   fdPZ; ejx                         	 	 	 	 	 	 	 	 	 	 	 	 ddeej(                     dee   d#ee#   d0ee(   d,ee
eej(                  ge&e   f      dQee	   d$ed%   dRedS   d-eej(                     d.eej(                     d?ee	   dNeeee
f      dee=ej                  f   fdT       Z>dUe	dQe	dej:                  de	fdVZ?	 d}dej                  d1ed'   dej                  fdWZ@	 	 ddej                  d#e#d0e(dedQe	dRedS   deeAej                  f   fdXZBedYej(                  dej(                  fdZ       ZCedYej(                  ded[edej(                  fd\       ZDedYej(                  d8ej(                  dej(                  fd]       ZEed^ej(                  d_ej(                  d`ej(                  daej(                  dbedceddedeee	ef   dfeFfdg       ZGed^ej(                  daej(                  dhej(                  deee	ef   fdi       ZHdjej(                  dkej(                  dlej(                  dbeddedme	dned[edoededeej(                  ej(                  ej(                  f   fdpZIdqej(                  drej(                  dsej(                  dhej(                  d[edeej(                  ej(                  ej(                  f   fdtZJd6ej(                  drej(                  d`ej(                  dqej(                  d8ej(                  dsej(                  d^ej(                  daej(                  dhej(                  duej(                  d[edbeddedfeFdeee	ef   deej(                  ej(                  ej(                  ej(                  f   f dvZK	 ddej                  d#e#d0e(dedQe	deeLej                  f   fdwZM	 	 	 	 	 	 ddej                  d#e#d0e(dedQe	dRedS   deej                     d$ed%   d(ed'   d1ed'   deeAej                  f   fdxZNdej                  defdyZOy)GenerationMixina  
    A class containing all functions for auto-regressive text generation, to be used as a mixin in model classes.
    Inheriting from this class causes the model to have special generation-related behavior, such as loading a
    `GenerationConfig` at initialization time or ensuring `generate`-related tests are run in `transformers` CI.

    A model class should inherit from `GenerationMixin` to enable calling methods like `generate`, or when it
    has defined a custom `generate` method that relies on `GenerationMixin`, directly or indirectly, which
    approximately shares the same interface to public methods like `generate`. Three examples:
        - `LlamaForCausalLM` should inherit from `GenerationMixin` to enable calling `generate` and other public
            methods in the mixin;
        - `BlipForQuestionAnswering` has a custom `generate` method that approximately shares the same interface as
           `GenerationMixin.generate` (it has a few extra arguments, and the same output). That function also calls
           `GenerationMixin.generate` indirectly, through an inner model. As such, `BlipForQuestionAnswering` should
           inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
        - `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
            However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
            `BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.

    The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
        - *greedy decoding* if `num_beams=1` and `do_sample=False`
        - *multinomial sampling* if `num_beams=1` and `do_sample=True`
        - *beam-search decoding* if `num_beams>1` and `do_sample=False`
        - *beam-search multinomial sampling* if `num_beams>1` and `do_sample=True`
        - *assisted decoding* if `assistant_model` or `prompt_lookup_num_tokens` is passed to `.generate()`

    To learn more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
    Npretrained_model_name_or_pathtrust_remote_codereturnc                     	 t        |fddi|}t        j                  j	                  |      }d| d}t        |||| |       t        |fdd	i| t        d
|      }|S # t        $ r t        d| d      w xY w)at  
        Loads and returns a custom generate function, given a model repo.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                 Can be either:
                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                    - A path to a *directory* containing model weights saved using
                      [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`.
            trust_remote_code (`bool`, *optional*):
                Whether or not to allow for custom models defined on the Hub in their own modeling files. This option
                should only be set to `True` for repositories you trust and in which you have read the code, as it will
                execute code present on the Hub on your local machine.
            **kwargs:
                Additional keyword arguments for remote code loading.

        Raises:
            OSError: If `pretrained_model_name_or_path` does not contain a `custom_generate` subdirectory.

        Returns:
            A callable that can be used to generate text.
        module_filezcustom_generate/generate.py`zw` does not contain a `custom_generate` subdirectory with a `generate.py` file, can't load the custom generate function.zThe repository `zS` contains custom generation code that will override the default `generate` method.)has_local_codehas_remote_codeerror_messagerequirements_filez custom_generate/requirements.txtgenerate)r   OSErrorospathexistsr   r   r   )selfr   r   kwargsmoduleis_local_coder   custom_generate_functions           rp   load_custom_generatez$GenerationMixin.load_custom_generate  s    <	+-;X\bF 'DE<= >- - 	 	")( --'	
 	")	
=_	
ci	
 $7z6#J ''3  	12 3O O 	s   A" "A;	input_idsinputs_embedscache_positionc                 ^   t               r| j                  |||      S |/|j                  d   dk(  r|dd|j                  d    df   }||fS ||d   |j                  d   k\  r|dd|j                  d    df   }||fS |j                  d   |j                  d   k7  r	|dd|f   }||fS )a  
        Generic cache-dependent input preparation
        The code is put in a separate function to allow granular unit testing
        as it needs a different implementation to be exportable.

        If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
        - Exception 1: when passing input_embeds, input_ids may be missing entries
        - Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
        - Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
        - Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
          generate the first token for each sequence. Later use the generated Input ids for continuation.

        The current implementation does not rely on ``self`` and could be
        a class method. It is left as a standard method to be easily rewritten.
        Nr!   r   )r   ,_cache_dependant_input_preparation_exportingshape)r   r   r   r   s       rp   "_cache_dependant_input_preparationz2GenerationMixin._cache_dependant_input_preparation  s    * $%DDYP]_mnn$);q)@)!n.B.B1.E-E-G*GHM i'' %r"iooa&88!!n&:&:1&=%=%?"?@I i'' __Q>#7#7#::!!^"34Ii''ro   c                     ||dd|f   }||fS d d d t        j                  |j                  d   dk(  fdfd|||g      \  }}||fS )	z
        This method implements method ``_cache_dependant_input_preparation``
        with :func:`torch.cond` to make it exportable with :func:`torch.export.export`.
        The code is put in a separate function to allow granular unit testing.
        Nc                 P    | d d |j                   d    d f   j                         S Nr   r   clone)r   r   s     rp   branch_1zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_1  s,    $Q)=)=a)@(@(B%BCIIKKro   c                 P    | d d |j                   d    d f   j                         S r   r   r   r   s     rp   branch_2zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_2  s,     ^%9%9!%<$<$>!>?EEGGro   c                 0    | d d |f   j                         S Nr   r   s     rp   branch_3zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.branch_3  s     N!2399;;ro   r!   r   c                 6     ||      | j                         fS r   r   )r   r   r   r   s      rp   <lambda>zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>  s     ?!)Fro   c                 h    |t        j                  |d   | j                  d   k\  fd| |g      fS )Nr   r!   c                 t    t        j                  | j                  d   |j                  d   k7  d | |g      S )Nr!   r   c                 "    | j                         S r   r   r   s     rp   r   zrGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>.<locals>.<lambda>.<locals>.<lambda>  s    yGXro   ri   condr   )r   r   r   s     rp   r   z`GenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>.<locals>.<lambda>  s9     %

$-OOA$6.:N:Nq:Q$Q$,%X%.$?	!"ro   r   )r   r   r   r   r   s      rp   r   zNGenerationMixin._cache_dependant_input_preparation_exporting.<locals>.<lambda>  s@    %

*2.)//!2DD$ '7Fro   r   )r   r   r   r   r   r   r   s       @@@rp   r   z<GenerationMixin._cache_dependant_input_preparation_exporting  s      !!^"34Ib i''KLH< (-zz"a'" M>:5($M98 i''ro   rV   attention_maskc           
         i }||d<   |||d<   | j                  |||      \  }}| j                  j                  rdnd}| j                  j                  sQ|&t        |      |j                  d   k(  rd||<   ||d<   nL|j                  t        j                        ||<   d|d<   n#|j                  t        j                        ||<   | j                  j                  r|nd}	| j                  j                  r|j                  d	d      n|}| j                  j                  rd	nd
}
| j                  j                  rdnd}||j                  |      ~|t        t        j                  | j                        j                  j                               v r<|j!                         j#                  d      dz
  }|j%                  |dk(  d       |||<   dD ]~  }|j                  |      }||a|j                  d      |d   j                  d   n||   j                  d   }|dd| df   }|j                  t        j                        }|||<    t'        |t(              rY|j*                  rL|I|j,                  dk(  r9| j                  j                  s|d   |d   j                  \  }}}n||   j                  dd \  }}t/        | | j0                  |       }t3        |d      r|j5                         nd}t/        |dd      }||t/        |dd      }|n|j                  d      }|j                  |      }t/        | dt6              } || j                  t        j8                  ||f| j:                        |||||      }n2 ||||j=                         | j:                  ||| j                  |      }||||
<   |	|	|d
<   |j?                         D ]  \  }}||vs|||<    |j                  dd       |S )a_  
        Prepare the model inputs for generation. Notable steps include selecting the correct input key and cloning when appropriate,
        creating position_ids from the attention_mask when missing, slicing inputs and converting 2D attention masks to 4D for
        compilable caches, and finally forwarding all additional keyword arguments unchanged to the model's forward pass.

        See the forward pass in the model documentation for expected arguments (different models might have different
        requirements for e.g. `past_key_values`). This function should work as is for most LLMs.
        r   NrV   decoder_input_idsr   r!   r   )memory_formatdecoder_attention_maskr   decoder_position_idsposition_idsr   r   )r   token_type_idsr   r   get_decoder5_prepare_4d_causal_attention_mask_with_cache_positionr   r   dtype)configinput_embedsr   r   rV   r   r   )sequence_lengthtarget_lengthr   r   
batch_sizer   rV   labels) r   r   is_encoder_decoderlenr   r   ri   contiguous_formatpopgetsetinspect	signatureforward
parameterskeyslongcumsummasked_fill_
isinstancer   is_compileablendimgetattrbase_model_prefixhasattrr   r   emptyr   get_max_cache_shapeitems)r   r   rV   r   r   r   r   model_inputsinput_ids_keyencoder_attention_maskattention_mask_keyposition_ids_keyr   model_input_namemodel_inputcurrent_input_lengthr   r   _
base_modeldecodercausal_mask_creation_functionr   keyvalues                            rp   prepare_inputs_for_generationz-GenerationMixin.prepare_inputs_for_generation"  sR   & )7%& &.=L*+ (,'N'N=.($M9
 04{{/M/M+S^{{--(S-@MDWDWXYDZ-Z.2]+0=_- /8ooELcLco.d]+04_-*3//H_H_/*`L' 48;;3Q3QW[:>++:X:XFJJ/6^l 	 :>9W9W5]m59[[5S5S1Yg&

+,4 C(9(9$,,(G(R(R(W(W(Y$ZZ)..077;a?L%%n&91='3F#$ ![ **%56K&". (++O<H %_5;;A>)-8>>qA )
 #.a2F1F1G.G"HK"-"3"3%BYBY"3"ZK1<-. ![  ...*##q(;;11l?6S6_1=o1N1T1T.
OQ.:=.I.O.OPRQR.S+
O !t'='=tDJ29*m2Tj,,.Z^G,3SUY-) -49L07TVZ1-
 -4!-!1!12B!C+//0@A07>Y[t0u-!>;;!&j/-JRVR\R\!]#1#1$3!-#1	" "?"$3"1"E"E"G**#1);;$3	" %/=L+,!--CL)* !,,.JC,&$)S! )
 	4(ro   inputsbos_token_idmodel_kwargsc                 h   | j                   j                  rFt        | d      r:| j                  j                  | j                  k7  r| j                  j                  }n| j                  }|j                         D ci c]  \  }}|||k7  s|| }}}|j                  |d      }||t        d| d| d| d| d	      ||}|dk(  rd	|v r|d	   |j                  d	       n| j                   j                  sd	t        t        j                  | j                        j                  j                               v }|s#t        d
| j                  j                   d      | j!                  |||      |d<   |d	   d	}}n|t        d      |d	   d	}}| j!                  |||      }|||fS c c}}w )zT
        This function extracts the model-specific `inputs` for generation.
        encoderNz
`inputs`: z` were passed alongside z0 which is not allowed. Make sure to either pass z or z=...r   r   zAYou passed `inputs_embeds` to `.generate()`, but the model class z doesn't have its forwarding implemented. See the GPT2 implementation for an example (https://github.com/huggingface/transformers/pull/21405), and feel free to open a PR with it!)r   zMYou passed `inputs_embeds` and `input_ids` to `.generate()`. Please pick one.)r   r   r   r   main_input_namer   r   
ValueErrorr   r   r   r   r   r   	__class__re   *_maybe_initialize_input_ids_for_generation)	r   r   r   r   
input_namekvinputs_kwarghas_inputs_embeds_forwardings	            rp   _prepare_model_inputsz%GenerationMixin._prepare_model_inputs  s    KK**i(,,0D0DD55J--J)5););)=b)=ARSWaRa1)=b $''
D9#(:VH$<ZL I,,284
|4I  %!F $L)HO,4  1[[33/>#%%d&H&HITTYY[C 0, 4$[\`\j\j\s\s[t ux x  -1,[,[L| -\ -[) &2/%BO
%$%tuu%1/%BO
 @@Wcdz<//[ cs   <F.F.c                 p   ||S |j                  d      }| j                  j                  rR|P|j                  j	                         dd }t        j                  |t
        j                  | j                        dz  S d}|j                         D ]-  }t        |t
        j                        s|j                  d   } n d|v r2t        j                  |dft
        j                  | j                        S |t        d	      t        j                  |dft
        j                  | j                        |z  S )
z3Initializes input ids for generation, if necessary.Nencoder_outputsr   r   deviceir!   r   r   zB`bos_token_id` has to be defined when no `input_ids` are provided.)r   r   r   last_hidden_statesizeri   onesr   r   valuesr   Tensorr   r   )r   r   r   r   r   r   r   r   s           rp   r   z:GenerationMixin._maybe_initialize_input_ids_for_generation  s    M&**+<=;;))o.I#55::<SbAE::e5::dkkJTQQ 
!((*E%."[[^
 +
 l*::z1oUZZTTabbzz:q/DKKPS___ro   inputs_tensorgeneration_configc                 n   |j                   }|j                  }d|v r|d   j                  d   dkD  r|d   }t        j                  |j                  d d t        j
                  |j                        }||S t        |j                        dk(  xr, |j                  t        j                  t        j
                  fv }|s|S |d uxr t        ||      j                         }|d u xs t        ||      j                          }	||	z  }
|j                  |      j                         }||
z  ||
 z  z   }|S )Nr   r!   r   r   r   elementstest_elements)_pad_token_tensor_eos_token_tensorr   ri   r   r   r   r   r   intr   anyne)r   r  r  r   pad_token_ideos_token_iddefault_attention_maskis_input_idsis_pad_token_in_inputs&is_pad_token_not_equal_to_eos_token_idcan_infer_attention_maskattention_mask_from_paddingr   s                rp   &_prepare_attention_mask_for_generationz6GenerationMixin._prepare_attention_mask_for_generation  s_    )::(:: ,&<+D+J+J1+MPQ+Q(5M "'M,?,?,C5::^k^r^r!s))=../14g9L9LQVQZQZ\a\f\fPg9g))".d": "
}LQUUW 	 3?$2F 2
|<PTTVL
. $:<b#b &3&6&6|&D&I&I&K# (*BBE[_w^wEww 	 ro   r   c                    | j                         }t        | d      r4t        |d      rd|j                  _        nt	        |t        d             g d}|j                         D ci c]  \  }t        fd|D              s| }	}}t        t        j                  |j                        j                        }
d|
v xs d|
v }|s(|	j                         D ci c]  \  }}||
v s|| }	}}|j                  |	d	<   |j                  |	d
<   ||n| j                  }d|	d<   ||	|<    |di |	|d<   |S c c}}w c c}}w )Nhf_device_map_hf_hookT)io_same_device)decoder_
cross_attn	use_cachec              3   @   K   | ]  }j                  |        y wr   )
startswith).0parguments     rp   	<genexpr>zQGenerationMixin._prepare_encoder_decoder_kwargs_for_generation.<locals>.<genexpr>N  s     I7H!x**1-7H   r   r   output_attentionsoutput_hidden_statesreturn_dictr   rn   )get_encoderr   r  r  rU   rT   r   r
  r   r   r   r   r   r#  r$  r   )r   r  r   r   r  r   irrelevant_prefixr   r   encoder_kwargsencoder_signatureencoder_accepts_wildcards          `    rp   ._prepare_encoder_decoder_kwargs_for_generationz>GenerationMixin._prepare_encoder_decoder_kwargs_for_generation8  sz    ""$ 4)w
+26  /"7,<D,QR D $0#5#5#7
#7%I7HII eO#7 	 

   1 1'// B M MN#+/@#@#gNVgDg '7E7K7K7M7MOHeQY]nQn%7M   /@.Q.Q*+1B1W1W-. 0@/K+QUQeQe(,}%+8'(7>7P7P&')
s   *!D9D?,D?r   decoder_start_token_idr   c                    |d|v r|j                  d      }nd|v r|dk7  r|j                  d      }nd}|| j                  }|j                  dk(  rC|j                  d   |k7  rt	        d| d|j                  d          |j                  dd      }n+t        j                  |dft        j                  |	      |z  }||}||fS d
| j                  j                  j                         v sI| j                  j                  dk(  r5d
| j                  j                  j                  j                         v r	 ||fS | j                  j                  dk(  r	 ||fS |dddf   |dddf   k7  j                         j!                         r\t        j"                  ||gd      }d|v r?|d   }t        j"                  t        j$                  |      ddddf   |fd      }||d<   ||fS )zGPrepares `decoder_input_ids` for generation with encoder-decoder modelsNr   r   r!   r   z1`decoder_start_token_id` expected to have length z	 but got r   r   donutzvision-encoder-decoderwhisperdimr   )r   r   r   r   r   viewri   r   r   r   re   lowerr   
model_typer   allitemcat	ones_like)r   r   r   r   r,  r   r   r   s           rp   )_prepare_decoder_input_ids_for_generationz9GenerationMixin._prepare_decoder_input_ids_for_generationa  s(    #(;|(K , 0 01D EL(-=-L , 0 0 = $ >[[F!&&!+%++A.*< G
|S\]s]y]yz{]|\}~  &<%@%@Q%G" 

J?%**VLOee # $ 6, !,..% //5577KK""&>>7dkkNaNaNlNlNrNrNtCt !,.. [[##y0 !,..  1%)?1)EEJJLQQS %		+ACT*U[] ^'<7)56N)O&).__%;<QUCE[\*& :P56 ,..ro   expand_sizer   c                       dk(  r||fS  fd}||j                   d      } ||      }|r*|j                  d      t        d       ||d         |d<   ||fS )zIExpands tensors from [batch_size, ...] to [batch_size * expand_size, ...]r!   c                     | D ]E  }|dk7  s	| |   t        | |   t        j                        s-| |   j                  d      | |<   G | S )Nr   r   r0  )r   ri   r   repeat_interleave)dict_to_expandr   r:  s     rp   _expand_dict_for_generationzRGenerationMixin._expand_inputs_for_generation.<locals>._expand_dict_for_generation  s]    %++&s+7">##6E*8*=*O*OP[ab*O*cN3' & "!ro   r   r0  r   zMIf `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.)r=  r   r   )r:  r   r   r   r?  s   `    rp   _expand_inputs_for_generationz-GenerationMixin._expand_inputs_for_generation  s     !l**	"  !33KQ3GI2<@ 12: !pqq.I,WhJi.jL*+,&&ro   outputsnum_new_tokensc                    t         D ]   }||v s|dv rd}n|}t        ||      ||<    n d|v r7|d   }t        j                  ||d d df   j	                  d      gd      |d<   |sDd|v r|d   }t        j                  ||j                  |j                  d   df      gd      |d<   nCd	|v r?|d	   }	t        j                  |	|	j                  |	j                  d   df      gd      |d	<   |j                  d
d      r|d   dd  |z   |d<   |S |j                  d      }
t        j                  |
d   dz   |
d   |z   dz   |
j                        j                  |
j                        }t        j                  |
|f      |d<   |S )N)rZ   rY   rV   r   r   r0  r   r   r!   r   r  Tr   r   )ALL_CACHE_NAMESr   ri   r7  	unsqueezenew_onesr   r   r   aranger   tor   )r   rA  r   r   rB  possible_cache_name
cache_namer   r   r   past_positionsnew_positionss               rp   #_update_model_kwargs_for_generationz3GenerationMixin._update_model_kwargs_for_generation  s    $3"g-&*II!2J!4J+27<O+PZ( $3 |+)*:;N-2YYWXZ\W\H]HgHghjHk7lrt-uL)*!</!-.>!?16#^%<%<n>R>RST>UWX=Y%Z[ac2-.
 (<7)56N)O&9>+-C-L-LNdNjNjklNmopMq-rs:56
 K.-9:J-KBC-PSa-aL)*  *--.>?N!LLr"Q&r(:^(Ka(OWeWkWkb&&'  .3YY7V-WL)*ro   logits_processorassistant_modelrQ   target_tokenizerrR   assistant_tokenizerc	                    t        d |||fD              }	|j                  t        || ||||      }
|
S |j                  at	        |j
                  |j                  |j                  xs d|j                  || j                  j                         j                        }
|
S |	r|j                  du rct        j                  ||| j                  j                         j                  |d      }d|j                  _        t!        |||||||||	      }
|
S |j                  d	u rt#        ||||||||
      }
|
S t%        dt'        |j                        j(                         t+        ||||||      }
|
S )zU
        Returns the candidate generator to be used in `assisted_generation`
        c              3   $   K   | ]  }|d u 
 y wr   rn   )r  r   s     rp   r!  z;GenerationMixin._get_candidate_generator.<locals>.<genexpr>  s     "s:rQ1D=:rs   N)r   rO  r  r   r  rN  r   )r  num_output_tokensmax_matching_ngram_size
max_lengthrN  
vocab_sizeT)rO  assistant_prune_lm_head)	r   rO  r  r   r  rN  rP  rQ  atm_translatorF)r   rO  r  r   r  rN  rP  rQ  z7Invalid value for `do_sample`: expected a boolean, got )r5  assistant_early_exitr&   prompt_lookup_num_tokensr'   r  rU  rV  r   get_text_configrW  	do_sampler"   get_translatorr  repetition_penaltyr(   r$   r   typere   r#   )r   r  r   r  rN  r   rO  rP  rQ  different_tokenizerscandidate_generatorrY  s               rp   _get_candidate_generatorz(GenerationMixin._get_candidate_generator  s     #"s?L\^q:r"ss11="=# $"3)+!1#~ #"o 77C"@.@@"3"L"L(9(Q(Q(VUV,77!1;;668CC#l #"] " **d2!>!M!M$'KK//1<<$3,0" HL11D&K'$3&7!-"/%5%5(;#1
'#F #"1 #,,5&S'$3&7!-"/%5%5(;	'#. #" !MdSdSnSnNoNxNxMyz  #=# /"3)+!1# #"ro   input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnnegative_prompt_idsnegative_prompt_attention_maskc
           	         t               }
|g }|j                  B|j                  dk7  r3|
j                  t        |j                  | ||	|j                               |j
                  %|
j                  t        |j
                               |j                  h|j                  dk7  rYt        |j                        dk(  r'|
j                  t        |j                  |             nt        j                  dt               |j                  4|j                  dk7  r%|
j                  t        |j                  	             |j                   3|j                   d
kD  r$|
j                  t#        |j                                |j$                  g|j$                  d
kD  rXt        |j                        dk(  r&|
j                  t'        |j$                  |             nt        j                  dt               |j(                  /|
j                  t+        |j(                  |j,                               |j.                  Mt1        |dd      @|j.                  d
kD  r1|
j                  t3        |j.                  |j,                  |             |j4                  Nt1        |dd      A|j4                  d
kD  r2|
j                  t7        ||j4                  |j,                  |             |%|
j                  t9        ||j:                               |j<                  $|
j                  t?        |j<                               |j@                  1|
j                  tC        |jD                  |j@                  |             |jF                  du r|
j                  tI                      |jJ                  0|
j                  tM        |jJ                  |j,                  |             |jN                  &|
j                  tQ        |jN                  |             |jR                  A|}|dkD  s|j<                  |n|dz   }|
j                  tU        |jR                  ||             | jW                  |
|      }
|jX                  rQ|j:                  dkD  rwt[        |j,                  t\              rt        |j,                        dz   }nFt[        |j,                  t^        j`                        r|j,                  j                  d
   dz   }nd}nd}|jb                  3|jb                  dk7  r$|
j                  te        |jb                               |jf                  5|jf                  d
k7  r&|
j                  ti        |jf                  |             |jj                  5|jj                  dk  r&|
j                  tm        |jj                  |             |jn                  &|
j                  tq        |jn                  |             |jr                  5|jr                  dk  r&|
j                  tu        |jr                  |             |jv                  >d|jv                  cxk  rdk  r)n n&|
j                  ty        |jv                  |             |jz                  ?d|jz                  cxk  rdk  r*n n'|
j                  t}        |jz                  ||             |j~                  M|
j                  |j~                  j                  | j                  j                         j                  |             |j                  du r|
j                  t                      |
S )z
        This class returns a [`LogitsProcessorList`] list object that contains all relevant [`LogitsProcessor`]
        instances used to modify the scores of the language model head.
        Nr!   )unconditional_idsunconditional_attention_maskr  sequence_bias      ?r   )penaltyre  zyPassing `encoder_repetition_penalty` requires some form of `input_ids` to be passed to `generate`, ignoring the argument.)ro  r   z{Passing `encoder_no_repeat_ngram_size` requires some form of `input_ids` to be passed to `generate`, ignoring the argument.r  r   T)top_kmin_tokens_to_keep)top_prr  )min_prr  )massrr          )epsilonrr  )rw  rr  r   )Fr:   guidance_scaleappendrI   r  rm  rB   encoder_repetition_penaltyr   r   r2   warningswarnUserWarningr_  rA   no_repeat_ngram_sizer?   encoder_no_repeat_ngram_sizer1   bad_words_idsr>   r  
min_lengthr   r;   min_new_tokensr<   r@   	num_beamsforced_bos_token_idr6   forced_eos_token_idr7   rV  remove_invalid_valuesr8    exponential_decay_length_penaltyr5   suppress_tokensrD   begin_suppress_tokensrC   _merge_criteria_processor_listr]  r   listri   r   temperaturerE   rq  rF   rs  rG   rt  r=   	typical_prH   epsilon_cutoffr3   
eta_cutoffr4   watermarking_configconstruct_processorr   r\  rW  renormalize_logitsr9   )r   r  rd  re  rf  rN  r   r   rg  rh  
processorsbegin_indexrr  s                rp   _get_logits_processorz%GenerationMixin._get_logits_processorB  s   " )*
#!++7<M<\<\`a<a>%44&91O/99 **69HYHgHghi 88D!<<C$**+q0!!; 1 L L*; 9
 //;@Q@d@dhk@k>GXGkGklm11=BSBhBhklBl:;L;a;abc::F!>>B$**+q0!!7)FF) 9
 **6)%33%77 ((4)+>EQ!,,q0(%00%77! ,,8)+>EQ!00141(%44%77!	 $/0,%// 00<-%99
 00<-%00%99! 22d:9;<==I-%FF%77( ,,8-%55! 22>.K )1,0A0U0U0]  1_ 
 4%;;! 88EUV
 && !**Q./AA4H),->-P-P)QTU)U& 1 C CU\\R):)L)L)R)RST)UXY)Y&)*&%&" !,,8=N=Z=Z^a=a!!"9:K:W:W"XY &&27H7N7NRS7S!!$+<+B+BWij !&&27H7N7NQT7T!!$+<+B+BWij !&&2!!$+<+B+BWij !**6;L;V;VY\;\!!'->-H-H]op !//;FWFfFf@lil@l!!' 1 @ @Ug
 !++7CBSB^B^<dad<d!!# 1 < <Qclr 00<!55IIKK//1<<f //47023ro   stopping_criteria	tokenizerc                    t               }|j                  =t        | j                  dd       }|j	                  t        |j                  |             |j                  %|j	                  t        |j                               |j                  3|t        d      |j	                  t        |j                  |             |j                  %|j	                  t        |j                               |j                  r@|j                  4|j                  dkD  r%|j	                  t        |j                               | j!                  ||      }|S )	Nmax_position_embeddings)rV  r  )max_timea  There are one or more stop strings, either in the arguments to `generate` or in the model's generation config, but we could not locate a tokenizer. When generating with stop strings, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.)stop_stringsr  )r  r   )assistant_confidence_threshold)rO   rV  r   r   ry  rL   r  rM   r  r   rP   r  rK   is_assistantr  rJ   r  )r   r  r  r  criteriar  s         rp   _get_stopping_criteriaz&GenerationMixin._get_stopping_criteria  s3    ()''3&-dkk;TVZ&[#OO!0;;,C %%1OOO5F5O5OPQ))5  s 
 OO.<M<Z<Zfopq..:OO,:K:]:]^_**!@@L!@@1DOO"BSBrBrs 66xARSro   default_listcustom_listc                    t        |      dk(  r|S  t        |             }|D ]  }d}|D ]~  }t        |      t        |      u st        |t              rdnd}t        j                  d| dt        |       dt        |       dt        |       d		       |j                  |       d
} n |r|j                  |        |D ]  }||vs|j                  |        |S )a4  
        Merge user-defined processors/criteria with the ones instantiated inside `generate`. In case the same
        processor/criteria is present on both lists, use the user-defined one.

        (Note: up to v4.49.0, this function threw an exception is the same logit processor was found twice.)
        r   Fzstopping criteriazlogits processorz	A custom z	 of type zt has been passed to `.generate()`, but it was also created in `.generate()`, given its parameterization. The custom z5 will take precedence. Please check the docstring of z$ to see related `.generate()` flags.T)r   r`  r   rN   loggerwarning_oncery  )r   r  r  
final_listdefaultusing_customcustomobject_types           rp   r  z.GenerationMixin._merge_criteria_processor_listE  s    {q 'T,')
#G L%<4=09CFL\9]"5cuK''#K=	$v, Heeijpeqdr sOOSTZ|n ]// %%f-#'L &  !!'* $" "FZ'!!&) " ro   r`   ra   r{   normalize_logitsc                    |it        j                  |d   j                  d         j                  dd      j	                  |j
                        }|j                  dt        |            }t        j                  |      j                  t        |      d      j                  dd      }|r|j                  d| j                  j                         j                  |j                  d         }t         j                  j                  j!                  |d      }|j                  d|j                  d         }|dk  }d|j#                         z
  j%                  d      j'                         }|j)                         ddd|f   }|ddd|f   }d||<   || j                  j                         j                  z  }|j                  d   |z
  }|dd|df   |z   }	|j+                  d|	      }
d|
|<   |
S )a  
        Computes the transition scores of sequences given the generation scores (and beam indices, if beam search was
        used). This is a convenient method to quickly obtain the scores of the selected tokens at generation time.

        Parameters:
            sequences (`torch.LongTensor`):
                The generated sequences. The second dimension (sequence_length) is either equal to `max_length` or
                shorter if all batches finished early due to the `eos_token_id`.
            scores (`tuple(torch.FloatTensor)`):
                Transition scores for each vocabulary token at each generation step. Beam transition scores consisting
                of log probabilities of tokens conditioned on log softmax of previously generated tokens in this beam.
                Tuple of `torch.FloatTensor` with up to `max_new_tokens` elements (one element for each generated token),
                with each tensor of shape `(batch_size*num_beams, config.vocab_size)`.
            beam_indices (`torch.LongTensor`, *optional*):
                Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
                generate-time.
            normalize_logits (`bool`, *optional*, defaults to `False`):
                Whether to normalize the logits (which, for legacy reasons, may be unnormalized).

        Return:
            `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length)` containing
                the transition scores (logits)

        Examples:

        ```python
        >>> from transformers import GPT2Tokenizer, AutoModelForCausalLM
        >>> import numpy as np

        >>> tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
        >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
        >>> tokenizer.pad_token_id = tokenizer.eos_token_id
        >>> inputs = tokenizer(["Today is"], return_tensors="pt")

        >>> # Example 1: Print the scores for each token generated with Greedy Search
        >>> outputs = model.generate(**inputs, max_new_tokens=5, return_dict_in_generate=True, output_scores=True)
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, normalize_logits=True
        ... )
        >>> # input_length is the length of the input prompt for decoder-only models, like the GPT family, and 1 for
        >>> # encoder-decoder models, like BART or T5.
        >>> input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
        >>> generated_tokens = outputs.sequences[:, input_length:]
        >>> for tok, score in zip(generated_tokens[0], transition_scores[0]):
        ...     # | token | token string | log probability | probability
        ...     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")
        |   262 |  the     | -1.414 | 24.33%
        |  1110 |  day     | -2.609 | 7.36%
        |   618 |  when    | -2.010 | 13.40%
        |   356 |  we      | -1.859 | 15.58%
        |   460 |  can     | -2.508 | 8.14%

        >>> # Example 2: Reconstruct the sequence scores from Beam Search
        >>> outputs = model.generate(
        ...     **inputs,
        ...     max_new_tokens=5,
        ...     num_beams=4,
        ...     num_return_sequences=4,
        ...     return_dict_in_generate=True,
        ...     output_scores=True,
        ... )
        >>> transition_scores = model.compute_transition_scores(
        ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
        ... )
        >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
        >>> # Tip 1: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
        >>> # use case, you might want to recompute it with `normalize_logits=True`.
        >>> # Tip 2: the output length does NOT include the input length
        >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
        >>> length_penalty = model.generation_config.length_penalty
        >>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)
        >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
        True
        ```Nr   r   r!   r0  )ri   rG  r   r2  rH  r   expandr   stackreshape	transposer   r\  rW  r
   
functionallog_softmaxr   summaxr   gather)r   r`   ra   r{   r  beam_indices_maskmax_beam_lengthbeam_sequence_indicescut_idxindicestransition_scoress              rp   compute_transition_scoresz)GenerationMixin.compute_transition_scoresj  s   h  <<q	(:;@@QGJJ9K[K[\L'..r3v;?L V$,,S["=GG1M ^^B(C(C(E(P(PRXR^R^_aRbcFXX((44V4CF^^BR(89F )1,05577<<R@DDF#))+A/?/?,?@-a1A/1A.AB +,&' !-t{{/J/J/L/W/W W //"%7AwxK(+@@ #MM!W5 01+,  ro   c                 .    |t         j                  k(  rd|v rt        d      |t         j                  k(  rV|j                  dkD  rt        d|j                   d       j
                  r"t        d j                  j                         |j                  d      x j                  j                  rcj                  j                  sMg d}t        j                        D cg c]	  }||v s| }}t         fd	|D              }|st        d
      d} j                  j                         j                  j                  j                         j                  k(  rd|v rt        d| d      y d|vsd|vrt        d| d      y y c c}w )NstreamerzZ`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1.r!   zFnum_return_sequences has to be 1 when doing assisted generate, but is .zCassisted generation is not supported with stateful models, such as rO  )encoder_attention_headsencoder_ffn_dimencoder_layersc              3   x   K   | ]1  }t        j                  |      t        j                  |      k(   3 y wr   )r   r   )r  attrrO  r   s     rp   r!  z<GenerationMixin._validate_generation_mode.<locals>.<genexpr>  s5       dw\`GDKK.'/:P:PRV2WWdws   7:zThe main model and the assistant don't have compatible encoder-dependent input shapes. Ensure you load the assistant with the correct encoder-decoder class, e.g. `AutoModelForSpeechSeq2Seq` for Whisper.zc(see https://huggingface.co/docs/transformers/en/generation_strategies#universal-assisted-decoding)rQ  z`assistant_tokenizer` is not required when the main and assistant models use the same tokenizer. Please omit `assistant_tokenizer` from `generate()` r  z~The main and assistant models have different tokenizers. Please provide `tokenizer` and `assistant_tokenizer` to `generate()` )r/   BEAM_SEARCHr   ASSISTED_GENERATIONnum_return_sequences_is_statefulr   re   r   r   r   dirr5  r\  rW  )	r   generation_moder  generation_mode_kwargsattributes_to_checkr  	are_equaldoc_referencerO  s	   `       @rp   _validate_generation_modez)GenerationMixin._validate_generation_mode  s   n888ZKa=al  n@@@ 559 /DDEQH     !YZ^ZhZhZqZqYrs   699:KLLOY{{--o6L6L6_6_&f#8;O<R<R8S&s8SW[_rWrt8S#&s  dw  	 !$N  v  {{**,77?;Q;Q;a;a;c;n;nn(,BB$ p  q~  p  @  A  C
 &<<@U]s@s$ Y  Zg  Yh  hi  j  At- Z 'ts   (	F2Fc                    | j                   j                  rdD ]  }|j                  |d        g }t        t	        j
                  | j                        j                        }d|v sd|v r5|t        t	        j
                  | j                        j                        z  }| j                   j                  rt        | | j                  d      }t        | dd      }||t        |dd      }|7t        t	        j
                  |j                        j                        }||z  }t        | dd      }||t        |dd      }|Ht        t	        j
                  |j                        j                        }	||	D 
ch c]  }
d|
 	 c}
z  }|j                         D ]1  \  }}|	||vs|t        j                  vs!|j                  |       3 |rt        d| d	      yc c}
w )
zXValidates model kwargs for generation. Generate argument typos will also be caught here.)r   Nr   r   r   r   r  z8The following `model_kwargs` are not used by the model: zG (note: typos in the generate arguments will also show up in this list))r   r   r   r   r   r   r   r   r   r   r   r   r   __optional_keys__ry  r   )r   r   r   unused_model_args
model_argsr   r   encoder_model_argsr   decoder_model_argsxr   s               rp   _validate_model_kwargsz&GenerationMixin._validate_model_kwargs  s    ;;)),  d+ - **4+M+MNYYZ
 z!^z%A#g//=HHIIJ ;;)) t'='=tDJ dIt4G :#9!*i>"%():):7??)K)V)V%W"00
 dIt4G:#9!*i>"%():):7??)K)V)V%W"7IJ7I!!~7IJJ
 ',,.JC S
%:sJ\JnJn?n!((- / JK\J] ^F F   Ks   <G#c           	         |rC|j                   7|j                  dk(  r(t        j                  d|j                   dt               ||j                  k\  r9| j
                  j                  rdnd}t        d| d| d	|j                   d
      d}|r|d|j                   dz  }|j                  Q|j                  |j                  kD  r8t        j                  d|j                   d|j                   d|z   t               |j                  [|j                  |z   }||j                  kD  r<t        j                  d|j                   d| d|j                   d|z   t               yyy)z=Performs validation related to the resulting generated lengthN   z0Using the model-agnostic default `max_length` (=zz) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.r   r   zInput length of z is z, but `max_length` is set to z}. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.z Generation will stop at the defined maximum length. You should decrease the minimum length and/or increase the maximum length.z" Note that `max_length` is set to z, its default value.z-Unfeasible length constraints: `min_length` (z.) is larger than the maximum possible length (z).z1Unfeasible length constraints: `min_new_tokens` (z$), when added to the prompt length (z/), is larger than the maximum possible length ()
max_new_tokensrV  r{  r|  r}  r   r   r   r  r  )r   r  input_ids_lengthhas_default_max_lengthinput_ids_stringmin_length_error_suffixr  s          rp   _validate_generated_lengthz*GenerationMixin._validate_generated_lengthD  s    "&7&F&F&NSdSoSosuSuMMBCTC_C_B` a  	 0;;;6:kk6T6T2Ze"#3"4D9I8J K%001 2UU + 	  "#45F5Q5Q4RRfg# ''38I8T8TWhWsWs8sMM?@Q@\@\?] ^11B1M1M0NbRTkl
 ++7*99<LLJ-888GHYHhHhGi j33C2D E55F5Q5Q4RRTVXop  	 9 8ro   c                    |j                   S|s<|j                  0t        j                  d|j                    d|j                   d       |j                   |z   |_        n|dk(  rK||j                  d   k7  r9| j
                  j                  s#|xj                  |j                  d   z  c_        nk|ri|j                  t               j                  k(  rH|j                  |z   |_        t        | j
                  dd      }|t        |j                  |      |_        |j                  H|s0t        j                  d|j                   d	|j                   d
       |j                  |z   |_        |S |dk(  rS||j                  d   k7  rA| j
                  j                  s+t        |j                  |j                  d   z
  d      |_        |S )z]Prepared max and min length in generation configs to avoid clashes between similar attributesNzBoth `max_new_tokens` (=z) and `max_length`(=z) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)r   r!   r  zBoth `min_new_tokens` (=z) and `min_length`(=z) seem to have been set. `min_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)r   )r  rV  r  warningr   r   r   r.   r   minr  r  r  )r   r  r  has_default_min_lengthr   r  r  r  s           rp   _prepare_generated_lengthz)GenerationMixin._prepare_generated_lengthp  s    ++7).?.J.J.V./@/O/O.PPd(334 5ff ,=+K+KN^+^(
 / M$7$7$::KK22((M,?,?,BB(# ++/?/A/L/LL/@/K/KN^/^!,*1$++?XZ^*_'*6367H7S7SUl3m%0 ++7)./@/O/O.PPd(334 5ff ,=+K+KN^+^( !  / M$7$7$::KK22+./@/K/KmNaNabcNd/dfg+h(  ro   use_model_defaultsr   c                 d   d}|| j                   j                  r| j                   j                  t        | j                         k(  rut	        | j
                  j                               dkD  rOt        j                  | j
                        }|| j                   k7  r!t        j                  dt               || _         | j                   }d}|j                  dk(  rd|_        t        j                  |      }|st        j                   t        j                   | j                   j"                        j$                        }|du s||t        j                   d      k\  ri }t               }| j                   }	|	j&                  j)                         D ]i  \  }
}|
j+                  d      s|
d	k(  r|
d
k(  r|	j                  dk(  r2t-        ||
d      }t-        ||
d      }||k(  sR||k7  sX|||
<   t/        ||
|       k |j0                  dk(  rd|_        |t	        |      dkD  rt4        j7                  d| d       n|j8                  | j                   j8                  |_        |j:                  | j                   j:                  |_        |j<                  | j                   j<                  |_        |j>                  | j                   j>                  |_         |j@                  di |}|jB                  }|jD                  }|jA                  |rd|ini        |jA                  |rd|ini        ||fS )z
        Prepares the base generation config, then applies any generation configuration options from kwargs. This
        function handles retrocompatibility with respect to configuration files.
        FNr   a?  You have modified the pretrained model configuration to control generation. This is a deprecated strategy to control generation and will be removed in v5. Please use and modify the model generation configuration (see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )Thybridz4.50.0r   transformers_versioncache_implementationrv  zX`generation_config` default values have been modified to match model-specific defaults: z=. If this is not desired, please set these values explicitly.r#  r$  rn   )#r  _from_model_config_original_object_hashhashr   r   &_get_non_default_generation_parametersr.   from_model_configr{  r|  r}  r  copydeepcopyr	   parser  base_version__dict__r   r  r   setattrr  r]  r  r  r   r  r  r,  updater#  r$  )r   r  r  r   using_model_generation_confignew_generation_configmodel_base_versionmodified_values global_default_generation_configmodel_generation_configr   model_gen_config_valueglobal_default_valuecustom_gen_config_valuer   r#  r$  s                    rp   _prepare_generation_configz*GenerationMixin._prepare_generation_config  s&    ).%$ &&99**@@DI_I_D``JJLMPQQ(8(J(J4;;(W%(D,B,BBMMB $ .CD* $ 6 6,0) !55A9=!6 !MM*;<, ")w}}T=S=S=h=h/i/v/v!w!T)"*/AW]]S[E\/\"$3C3E0*.*@*@'3J3S3S3Y3Y3[/C/~~c*c5K.K 449P9e9eiq9q +23SUXZ^+_(.56Gd.S+/3GG26JJ/E, 138NO 4\  %00C727%/%-#o2F2J''r*++hj
 %119595K5K5X5X%2$119595K5K5X5X%2$119595K5K5X5X%2$;;C?C?U?U?l?l%< 0(//9&9-??0EEHY02CD_abNb35IJhjk ,..ro   c                    d|v r|d   |S d|v rY| j                   j                  sCt        j                  |d   ddddf   t        j                        j                  d      dz
  }nd|v rY| j                   j                  rCt        j                  |d   ddddf   t        j                        j                  d      dz
  }n8t        j                  |t        j                  |      j                  d      dz
  }d}|j                  d	      N|d	   }d}t        |t              r|d   d   j                  d
   }nt        |d      r|j                         }||d }||d<   |S )zbCalculates `cache_position` for the pre-fill stage based on `input_ids` and optionally past lengthr   Nr   r   r   r!   decoder_inputs_embedsr   rV   r   get_seq_length)r   r   ri   r8  int64r   r   r   r   rl   r   r   r  )r   
seq_lengthr   r   r   past_lengthcaches          rp   _get_initial_cache_positionz+GenerationMixin._get_initial_cache_position  se    |+=M0N0Zl*4;;3Q3Q"__\/-J1aQR7-S[`[f[fgnnopqtuuN$49W9W-D EaAg NV[VaVabiijklopp  #ZZ
%++fU\\]^_bccN-.: !23EK%'#Ahqk//2 01#224+KL9N)7%&ro   r  max_cache_lenc                 J   | j                   j                  xs |j                  d      du}d|v }t        | d      r$|r| j                  j
                  n| j                  }t        | d       xs1 j                  |k7  xs  |j                  |k7  xs |j                  |k  }|rFt        | d      r:|xs6 | j                  j                  j                  |d   d   j                  d   k7  }|r| j                   j                  d      ||d	}	t        d
i |	| _        |rW| j                   j                  d      |d   d   j                  d   |d	}
t        | j                  t        d
i |
      | _        | j                  S | j                  j                          | j                  S )z
        Sets a cache for `generate`, that will persist across calls. A new cache will only be initialized a
        new `generate` call requires a larger cache or uses a different batch size.

        Returns the resulting cache object.
        r   N	offloaded_cacher   r!   Tr   )r   r  
offloadingrn   )r   r   r   r   r
  self_attention_cacher  max_batch_sizer  cross_attention_cacher   r\  r   r   reset)r   r  r   r  r   requires_cross_attention_cacheoffload_cachecache_to_checkneed_new_cacheself_attention_cache_kwargscross_attention_cache_kwargss              rp   
_get_cachezGenerationMixin._get_cache0  s    KK**]l.>.>?P.QY].] 	' $';;4"A_T[[==eiepepN h'' <((M9<,,
:< ++m;	 	 *gdH.E r;;44BBlSdFefgFhFnFnopFqq 
 ++55d5C!.++'
 &D(CDDK-"kk99$9G%12C%DQ%G%M%Ma%P"/0,
 2$++{?jMi?jk {{ KK{{ro   c                 J      j                    xr t         fddD              S )a  
        Return `True` if current model can use a `DynamicCache` instance when initializing the `past_key_values`.
        This adds exception for some models like `Mamba` models which use their own caches
        and do not need to initialize the Cache in advance in order to save memory (because no back and forth
        `to_legacy_cache` and `from_legacy_cache` will be performed for mamba-based models).
        c              3   V   K   | ]   }|j                   j                         v " y wr   )re   r3  )r  special_model_nameclss     rp   r!  zBGenerationMixin._supports_default_dynamic_cache.<locals>.<genexpr>g  s0      	,
'" cll&8&8&::'s   &))reformerminimaxxlnetlfm2zlfm2-vl)r  r5  )r  s   `rp   _supports_default_dynamic_cachez/GenerationMixin._supports_default_dynamic_cache^  s1     ### 	
 	,
'	,
 	)
 		
ro   r  max_cache_lengthc                 l    t         fddD              }|sdnd} j                  j                  xs |j                  d      du}|j                  |      }	|	|j                  t        d| d      t        |	t              rT j                         rDt        j                  d	       |st        j                  |	      nt        j                  |	      ||<   y|j                  d
u ry j                         s0|j                  #t        j                  d|j                   d       y|t        j                   k(  r6|j                  *t        j                  d|j                   d       d|_        |t        j                   t        j"                  fv s|j                  dk(  ri }
nd j                  j%                  d      i}
|j                  |j                  t&        v r|j                  t(        v r*t        j                  d|j                   dt*         d        j-                  |j                  t/        |j0                  |j2                        |z  ||      ||<   n|j                  dk(  rÉ j                  j                  s j                         st        d      |j4                  |j4                  ni }d|vr j                  j%                         |d<   |j7                  dd      }|dk(  rt9               st;        d      |dk(  rt=               st;        d      t?        d!d|i|||<   nM|j                  dk(  rt        d!i |
ddi||<   n+d |j                  v rt        d!i |
||<   nt        d!i |
||<   |r0t        ||   t              st        ||   t        d!i |
      ||<   yyy)"z
        Prepares the cache for generation (if applicable), given `generate`'s parameterization. If a cache is
        instantiated, writes it to `model_kwargs`, under the name expected by the model.
        c              3   j   K   | ]*  }|j                   j                  j                         v  , y wr   )r   re   r3  )r  
class_namer   s     rp   r!  z@GenerationMixin._prepare_cache_for_generation.<locals>.<genexpr>  s-     t^sPZjDNN,C,C,I,I,KK^ss   03)mambafalconh1rV   rW   r   NzMPassing both `cache_implementation` (used to initialize certain caches) and `zB` (a Cache object) is unsupported. Please use only one of the two.zPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `Cache` instead.FzNThis model does not support `Cache` instances. `cache_implementation` (set to z) will be ignored.zRAn assistant model is provided, using a dynamic cache instead of a cache of type='z'.dynamic_fullr   Tr  zUsing `cache_implementation='z(' is deprecated. Please only use one of z9, and the layer structure will be inferred automatically.)r  r   r  r   	quantizedzThis model does not support the quantized cache. If you want your model to support quantized cache, please open an issue and tag @zucchini-nlp.backendquantozYou need to install optimum-quanto in order to use KV cache quantization with optimum-quanto backend. Please install it via  with `pip install optimum-quanto`HQQzYou need to install `HQQ` in order to use KV cache quantization with HQQ backend. Please install it via  with `pip install hqq`r	  r  dynamicrn   ) r
  r   r   r   r  r   r   rl   r   r  r  r   from_legacy_cacher   r  r/   r  CONTRASTIVE_SEARCHr\  r+   r,   r-   r  r  r  r  cache_configr   r   ImportErrorr   r   )r   r  r   r  r   r!  is_hybrid_cacherJ  r  user_defined_cachedynamic_cache_kwargsr/  r)  s   `            rp   _prepare_cache_for_generationz-GenerationMixin._prepare_cache_for_generationr  s    t^stt.=&>
 KK**]l.>.>?P.QY].] 	' *--j9) 55A cdnco pT T  ,e49]9]9_##F : !223EF,>>?QR Z(
  &&%/ 335 55A##d(==>>PR  ~AAA!66B%::;2? 6:2  B BNDeDeff 55G#% $,dkk.I.IRV.I.W#X 11= 559YY$99=dd''78I8^8^7_ `22N1O P22
 ,0??):)O)O"#4#>#>@Q@f@fgjtt"2!-	 ,; ,Z( #77;F;;119]9]9_$M 
 BSA_A_Ak0==qs</-1[[-H-H-JL*&**9h?h&/J/L%\  %.>.@%H  ,:+Z'+Z\+ZZ("77;F+7+`:N+`[_+`Z(/DDD+7+O:N+OZ( (4'K6J'KL$ **\*=UWj2k':Z(434(L$ 3l)ro   c                     dt        t        j                  | j                        j                  j                               v S )z
        Return True if the current model supports the keyword argument `logits_to_keep` in forward()
        to save memory. Checking it in this way allows to avoid using a new model attribute.
        logits_to_keep)r   r   r   r   r   r   )r   s    rp   _supports_logits_to_keepz(GenerationMixin._supports_logits_to_keep  s2    
  3w'8'8'F'Q'Q'V'V'X#YYYro   kwargs_has_attention_maskc                     d fd	} ||j                   |      } ||j                  |      } ||j                  |      } ||j                  |      } j                  j
                  r||n|}| |j                  dk(  r|j                  d      }|9|7||st        j                  d       |d   }t        j                  d| d        j                  j
                  r|t        d      |4t        ||	      j                         r||st        j                  d
       |At        j                  |      s|dk  j                         rt        j                  d| d       ||_        ||_        ||_        ||_        y)a  
        Prepares the special tokens for generation, overwriting the generation config with their processed versions
        converted to tensor.

        Note that `generation_config` is changed in place and stops being serializable after this method is called.
        That is no problem if called within `generate` (`generation_config` is a local copy that doesn't leave the
        function). However, if called outside `generate`, consider creating a copy of `generation_config` first.
        Nc                     | | S ||nj                   }t        | t        j                        r| j	                  |      S t        j
                  | |t        j                        S )Nr   r   )r   r   ri   r   rH  tensorr   )tokenr   r   s     rp   _tensor_or_nonez@GenerationMixin._prepare_special_tokens.<locals>._tensor_or_none  sQ    }%1Vt{{F%.xx''<<fEJJGGro   rp  r   zThe attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z)Setting `pad_token_id` to `eos_token_id`:z for open-end generation.z\`decoder_start_token_id` or `bos_token_id` has to be defined for encoder-decoder generation.r  zThe attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.z;`eos_token_id` should consist of positive integers, but is zq. Your generation will not stop until the maximum length is reached. Depending on other flags, it may even crash.r   )r   r  r  r,  r   r   r   rE  r  r  r   r   r
  r  ri   is_floating_point_bos_token_tensorr  r  _decoder_start_token_tensor)	r   r  r8  r   r>  bos_token_tensoreos_token_tensorpad_token_tensordecoder_start_token_tensors	   `        rp   _prepare_special_tokensz'GenerationMixin._prepare_special_tokens  s    	H ++<+I+IRXY*+<+I+IRXY*+<+I+IRXY%45F5]5]fl%m" ;;)).H.T*Zj '
 ',<,A,AQ,F/99!< #(8(D(4=Vq  02NNFGWFXXqrs ;;)).H.Pn  (!+;K[\``b(4=V##C
 '##$45:JQ:N9S9S9UNNMN^M_ `r r /?+.>+.>+8R5ro   c                 X   |j                   ry| j                  j                  dk(  xs/ t        |j                  duxr |j                  j
                        }t        |j                  d      t              xr |d   j                  }|xr |}t        | dd      || j                  j                  z  }t        | d      rGt        | j                  j                               }d|v xr t!        |      dkD  }|| z  }d	|v }|| z  }|j                  |st"        j%                  d
       |S )zp
        Determines whether to trigger auto-compilation of the model's forward pass at generation time.
        FcudaNrV   hf_quantizerr  cpur!   diskzsYou have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation will be skipped.)disable_compiler   r`  boolcompile_config_compile_all_devicesr   r   r   r   r   rI  r   r   r  r   r   r  r  )	r   r   r  valid_hardwareusing_compilable_cachecan_compileall_model_deviceshas_cpu_offloadhas_disk_offloads	            rp   _valid_auto_compile_criteriaz,GenerationMixin._valid_auto_compile_criteriaX  sH   
 ,, ))V3 
t,,D8r=N=]=]=r=r8
 |''(9:EBu|TeGfGuGu 	 %?)? 4.:4,,;;;K4) #D$6$6$=$=$? @#'88WSAR=SVW=WO..K  &)::///K ++7#
 ro   custom_generatec                 (   |dt         |   x}vryt        j                  |j                  j	                  dd      j                          d| d| d       |s9t        |j                  j	                  dd      j                          d| d	      |S )
zP
        Returns the Hub repo for a deprecated generation mode, if any.
        N/r    z6 was moved to a `custom_generate` repo: https://hf.co/zC. To prevent loss of backward compatibility, add `custom_generate='z*'` to your `generate` call before v4.62.0.zY requires `trust_remote_code=True` in your `generate` call, since it loads https://hf.co/r  )GENERATION_MODES_MAPPINGr  r  namereplacetitler   )r   r  r   rW  repos        rp   _get_deprecated_gen_repoz(GenerationMixin._get_deprecated_gen_repo  s     &#>VWf>g6gd*h##++C5;;=>>tuytz {PPTv V66	

 !"''//S9??AB C004vQ8  ro   c                 f   |j                  dd      |j                  dd      ||d}|0t               xs t        |       xr t        j                         dkD  n||d<   |j                         D ci c]  \  }}|	|| }}}t        |t              rt        j                  t        j                        j                  j                         }	t        j                  |      j                  j                         }
|
|	z
  }|D ci c]  }||v s||j                  |       }}|S c c}}w c c}w )zn
        Extracts and returns the generation mode related keyword arguments from the provided kwargs.
        r  NrQ  )r  rQ  rO  r  r!   synced_gpus)r   r   r   distget_world_sizer   r   r   r   r   r   r[   r   r   )r   rW  r   rb  rO  r  r  r   r   usual_mode_kwargscustom_generate_kwargsnew_custom_keyss               rp   _extract_generation_mode_kwargsz/GenerationMixin._extract_generation_mode_kwargs  s6     K6#)::.CT#J. 	"
 " ()I-CD-IhtObObOdghOh 	}-
 4J3O3O3Q!c3Q41aUVUb!Q$3Q!c ox0 ' 1 1/2I2I J U U Z Z \%,%6%6%G%R%R%W%W%Y"47HHO@O%_1STX^S^aA&6"%_%% "d &`s   2
D(=D(	D.D.rb  r  rS   c                 \   |j                  dd      }|tt        |t              rdh d}t               j	                         D ci c]  \  }}||vs|| }}}|j                  |        | j                  |fd|i|} |d$d| i|S | j                  |||||      } | j                  ||fi |\  }}|j                  |      }t        |t              r|}nt        t        |       t        |         }| j                  |j                                | j!                  |||       | j#                  |||      x}r$t%        j&                  | f|||||||	|
|||d||S ||n	t)               }||n	t+               }dt-        t/        j0                  | j2                        j4                  j7                               v }d|v}|j9                  dd      du}| j;                  ||j<                  |      \  }}}dt/        j0                  |      j4                  j7                         v r||d<   |j>                  d	   }|j@                  }| jC                  |||
       | jD                  jF                  sj|jH                  ^|dkD  rYtK        |j>                        dk(  rAtM        jN                  |dddf   |jH                  k(        d	kD  rtP        jS                  d       | jD                  jF                  s|dk(  rd|_*        |s|r|r| jW                  |||      |d<   n-|r+|dk(  r&tK        |d   j>                        dkD  rtY        d      | jD                  jF                  rd|vr| j[                  ||||      }| jD                  jF                  r.| j]                  ||||j^                  |j@                        \  } }n|dk(  r|n|j                  d      }  | j`                  d$| tc        |jd                  |jf                        | jD                  jF                  d|\  } }|jh                  r!| jk                  | |j9                  d            } ||jm                  | jo                                | j>                  d   }!|j9                  d      du xr |jp                  du}"|j9                  d      du xr |jr                  du}#| ju                  ||"|#|||!      }| jw                         r	d|vrd|d<   | jy                  ||!|"       |jp                  dz
  }$|j>                  d   |!k7  r-|dk(  r(| jD                  jF                  s|$|j>                  d   z  }$| j{                  |||||$       | j@                  j                  | j@                  j                  k7  r`t}        j~                  d| j@                  j                   d| j@                  j                   d| j@                  j                   dt               | j                  ||!||||j@                  ||	|
	      }%| j                  |||j9                  d            }&|jT                  |d <    || | f|%|&|d!||}'|j                  du rAt        |'d"      r5t        |'j                  d#      |'j                  j                         |'_E        |'S c c}}w )%a  

        Generates sequences of token ids for models with a language modeling head.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.

        For an overview of generation strategies and code examples, check out the [following
        guide](../generation_strategies).

        </Tip>

        Parameters:
            inputs (`torch.Tensor` of varying shape depending on the modality, *optional*):
                The sequence used as a prompt for the generation or as model inputs to the encoder. If `None` the
                method initializes it with `bos_token_id` and a batch size of 1. For decoder-only models `inputs`
                should be in the format of `input_ids`. For encoder-decoder models *inputs* can represent any of
                `input_ids`, `input_values`, `input_features`, or `pixel_values`.
            generation_config ([`~generation.GenerationConfig`], *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which has the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            logits_processor (`LogitsProcessorList`, *optional*):
                Custom logits processors that complement the default logits processors built from arguments and
                generation config. If a logit processor is passed that is already created with the arguments or a
                generation config an error is thrown. This feature is intended for advanced users.
            stopping_criteria (`StoppingCriteriaList`, *optional*):
                Custom stopping criteria that complements the default stopping criteria built from arguments and a
                generation config. If a stopping criteria is passed that is already created with the arguments or a
                generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
                sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
                intended for advanced users.
            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
                If provided, this function constraints the beam search to allowed tokens only at each step. If not
                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
                Retrieval](https://huggingface.co/papers/2010.00904).
            synced_gpus (`bool`, *optional*):
                Whether to continue running the while loop until max_length. Unless overridden, this flag will be set
                to `True` if using `FullyShardedDataParallel` or DeepSpeed ZeRO Stage 3 with multiple GPUs to avoid
                deadlocking if one GPU finishes generating before other GPUs. Otherwise, defaults to `False`.
            assistant_model (`PreTrainedModel`, *optional*):
                An assistant model that can be used to accelerate generation. The assistant model must have the exact
                same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistant model
                is much faster than running generation with the model you're calling generate from. As such, the
                assistant model should be much smaller.
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            negative_prompt_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                The negative prompt needed for some processors such as CFG. The batch size must match the input batch
                size. This is an experimental feature, subject to breaking API changes in future versions.
            negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Attention_mask for `negative_prompt_ids`.
            use_model_defaults (`bool`, *optional*):
                When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
                generation configuration (`model.generation_config`), as opposed to the global defaults
                (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
                `True`.
            custom_generate (`str` or `Callable`, *optional*):
                One of the following:
                - `str` (Hugging Face Hub repository name): runs the custom `generate` function defined at
                  `custom_generate/generate.py` in that repository instead of the standard `generate` method. The
                  repository fully replaces the generation logic, and the return type may differ.
                - `str` (local repository path): same as above but from a local path, `trust_remote_code` not required.
                - `Callable`: `generate` will perform the usual input preparation steps, then call the provided callable to
                  run the decoding loop.
                For more information, see [the docs](../../generation_strategies#custom-generation-methods).
            kwargs (`dict[str, Any]`, *optional*):
                Ad hoc parametrization of `generation_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.

        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.LongTensor`.

                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateDecoderOnlyOutput`],
                    - [`~generation.GenerateBeamDecoderOnlyOutput`]

                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:

                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
        r   N>   r   r   rW  r   global_keys_to_excludemodel)r   r  rN  r  rf  rO  rg  rh  r  rW  r   r   r   r  r   rp  r!   r   r   zA decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.r   Tr   z1`attention_mask` passed to `generate` must be 2D.)r   r   r   r,  r   )r   r:  r   r  rV  r  )r  r  r  r   r  r  r6  z~You are calling .generate() with the `input_ids` being on a device type different than your model's device. `input_ids` is on z, whereas the model is on z. You may experience unexpected behaviors or slower generation. Please make sure that you have put `input_ids` to the correct device by calling for example input_ids = input_ids.to('z ') before running `.generate()`.)	r  rd  re  rf  rN  r   r   rg  rh  )r  r  r  r  )rN  r  r  rV   to_legacy_cachern   )Gr   r   strlocalsr   r  r   rh  r  get_generation_moder   r   r`  r[  r  r  r  r`  r   r   r:   rO   r   r   r   r   r   r   r   r   r   r   r   rF  r   r   r  r   ri   r  r  r  r  r  r   r+  r9  rA  r@  r  r  r  token_healingheal_tokensputrJ  rV  r  r  r7  r  r4  r{  r|  r}  r  r  return_legacy_cacher   rV   rl  )(r   r   r  rN  r  rf  rb  rO  r  rg  rh  r  rW  r   r   rj  r   r   generate_argumentsr   r  r   r  decoding_methoddeprecated_mode_repoaccepts_attention_maskrequires_attention_maskr8  r  r   r   r   r   r  r  r  r!  prepared_logits_processorprepared_stopping_criteriaresults(                                           rp   r   zGenerationMixin.generate  s   f #JJ':DA&:os+K&" @Fx~~?O!u?OeSV^tSt#u*?O!u%%f-'@t'@'@(3D(HN($ ,M$M:LMM "&!E!E"
 +J$*I*I1+
5;+
'< ,??Pox0-O &d4j2J?2[\O##L$5$5$78&&8IKab $(#@#@Rcet#uuu"++"3!1"3)A /$7/M#5 4"3 )  $ 0@/K+QdQf1B1N-ThTj!1S9J9J4<<9X9c9c9h9h9j5k!k"3<"G$0$4$45Et$LTX$X! 9=8R8R%22L9
5' g//@KKPPRR6C"?3"((+
%%$$%68QZ`$a {{-- "33?N++,1IImArE26G6Y6YYZ]^^l {{--2Bo2U*.'(-DI_-1-X-X0,.L)* ';.3|DT7U7[7[3\_`3` !TUU;;)).?|.SNN|-=?PL
 ;;))&*&T&T%!1)'8'T'T$++ 'U '#I| *:[)HlN^N^_jNkI #E$"D"D #
-779J9_9_`#{{==#
 	#
	< **((4J4N4N{4[\ILL) %??1-!'L!9T!A!nFWFbFbjnFn!'L!9T!A!nFWFbFbjnFn ::/#9#9-'- ; 
 ((*/?|/S-.L)*''(9;KMcd -77!;"&66 O3KK22 3 3A 66**|_jJZ	
 ;;y//444MM@@I@P@P@U@U?V W++**+ ,TTXT_T_TdTdSe f*	*  %)$>$>/!1+%=- ''% 3+I %? 
%
! &*%@%@//,00= &A &
" %6$?$?[! !
 78/
 %
 
 11T9 12..0ABN%+%;%;%K%K%MF"y "vs   Z(Z(this_peer_finishedc                     |r_t        j                  |rdnd|      }t        j                  |t        j                  j
                         |j                         dk(  ryy|ryy)z
        Returns whether there are still unfinished sequences in the device. The existence of unfinished sequences is
        fed through `this_peer_finished`. ZeRO stage 3-friendly.
        rv  rn  rp  )opFT)ri   r<  rc  
all_reduceReduceOpSUMr6  )r   r|  rb  r   this_peer_finished_flags        rp   _has_unfinished_sequencesz)GenerationMixin._has_unfinished_sequences
  s^    
  ',ll:L3RU^d&e#OO38I8IJ&++-4   ro   c                    t        d      j                  j                  }}t        j	                               }t        d|      }j                  |d      D cg c]  }|j                          }} |dd      j                  j                  |j                        }t        j                  ||k(  ||      }|j                         d	k(  r|S |ddd
f   j                         }	j                  d      0j!                  j                  d            d	   fd|	D        }
nfd|	D        }
t#        t%        |	|
            D ]  \  }\  }}||   }t        j&                  ||k(        j)                         r5	 |j+                  |      D ci c]  }j                  |      fd }}t-        |      dk(  ru||fxx   dz  cc<   |j/                  |       |dd
 }	 |j                         d	k(  rt-        |||k7           dk(  r||d
<   | j1                  |j3                  d	      |      ||<    |S c c}w c c}w )a  
        Generates sequences of token ids for models with a language modeling head.
        Parameters:
            input_ids (`torch.LongTensor`): The sequence used as a prompt for the generation.
            tokenizer (`PreTrainedTokenizerBase`, *optional*): The tokenizer used to decode the input ids.
        Return:
            `torch.LongTensor` where each sequence has its tail token replaced with its appropriate extension.
        Nzs When generating with token healing, you must pass the model's tokenizer to the `tokenizer` argument of `generate`.r!   )r  r  T)skip_special_tokenspt)return_tensorspaddingr   r   rZ  c              3   `   K   | ]%  }j                  |      j                  d        ' yw)rZ  N)decoder]  )r  t	space_tokr  s     rp   r!  z.GenerationMixin.heal_tokens.<locals>.<genexpr>T
  s*     Wh))!,44S)Dhs   +.c              3   @   K   | ]  }j                  |        y wr   )r  )r  r  r  s     rp   r!  z.GenerationMixin.heal_tokens.<locals>.<genexpr>V
  s     ?h))!,hr"  )prefixg      $@rn  rl  )r  )r   r   r  r   	get_vocabr.   batch_decodestripr   rH  r   ri   wherenumeltolistconvert_tokens_to_idsconvert_ids_to_tokens	enumeratezipr5  r6  
extensionsr   r  r   rE  )r   r   r  r   r  
vocab_trier  r  promptstail_ids	tail_toks	batch_idxtail_idtail_tok	batch_idsalt_tokseq_biastrimmed_idsr  s     `               @rp   rq  zGenerationMixin.heal_tokens)
  s    * 
 &/%;%;Y=S=Sl#I$7$7$9:
,ALY '0&<&<Y\`&<&ab&a1779&ab
 )BBy''(	 	 KK	\ 9<S	 ??!QU#**, **3/;!77	8W8WX[8\]^_`IWhWI?h?I.7Hi8P.Q*I*!),Iyyl2388:
 R\QfQfnvQfQwQwg009;TAQw   8}! gZ C' $$8$<#CR.K   "a' 9Y,678A=".B#'==1F1Fq1I]n=#oIi E /RH y cDs   #IIc                 	   |j                   }|j                  }	|j                  }
|j                  }|j                  }|j
                  }t        d |D              }|j                  }|r|rdnd}|r|rdnd}|r|	rdnd}|r|	rdnd}|r|
rdnd}|rF| j                  j                  r0|	r|d   j                  d      nd}|
r|d   j                  d      nd}|j                  dd \  }}d}t        j                  |t        j                  |j                  	      }| j!                  ||j                  |      }| j"                  }| j%                  ||      }|rd
t&        j(                  d<   | j                  j*                  dk(  rH|j,                  <|j,                  j.                  r&t0        j3                  d       d|j,                  _        | j5                  |j,                        }|j6                   | j8                  ||fi |}d}nd}| j;                  |||j                        r@ | j<                  |fi |}|r | di |ddi}d}n |di |ddi}| j?                  ||| j                  j                        }|r|r||j@                  dddddf   jC                  dt        jD                  |j                        }  |||       }!|r|r||!fz  }|r|| fz  }|	rY|| j                  j                  r|jF                  fn|jH                  fz  }| j                  j                  r||jJ                  fz  }|
r3|| j                  j                  r|jL                  fn|jN                  fz  }|rHtP        jR                  jU                  |!d      }"t        jV                  |"d      jY                  d      }#nt        jZ                  |!d      }#|r|#|z  |d|z
  z  z   }#t        j\                  ||#dddf   gd      }||j_                  |#ja                                | |||       z  }|jc                         dk(  }|dz  }~| j;                  |||j                        r@||je                          |rY| j                  j                  r#tg        |||||||j                  d      	      S ti        ||||||j                  d            S |S )a  
        Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
            A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        c              3   4   K   | ]  }t        |d         yw)r  N)r   )r  r  s     rp   r!  z*GenerationMixin._sample.<locals>.<genexpr>
  s     'lZkh.(IZks   rn   Nr   rc   rd   r   Fr   0TOKENIZERS_PARALLELISMflash_attention_2zWhen using Flash Attention 2 and a static cache, you cannot use the option `CompileConfig(fullgraph=True)` as FA2 introduces graph breaks. We overrode the option with `fullgraph=False`.Trp  r%  r   r   r  r   r   r0  r!   num_samplesr   rV   	r`   ra   rb   rs   rt   ru   rv   rw   rV   r`   ra   rb   rc   rd   rV   )5r  r#  r$  output_scoresoutput_logitsreturn_dict_in_generater
  r]  r   r   r   r   ri   r   r   r   r  __call__rV  r   environ_attn_implementationrN  	fullgraphr  r  get_compiled_callprefill_chunk_size_prefill_chunkingr  r   rM  rb   rH  float32ru   rc   rv   rw   rd   r
   r  softmaxmultinomialsqueezeargmaxr7  rr  rJ  r  endrr   r_   )$r   r   rN  r  r  rb  r  r   r  r#  r$  r  r  r  has_eos_stopping_criteriar]  ra   
raw_logitsru   rv   rw   rs   rt   r   cur_lenr|  unfinished_sequencesmodel_forwardcompile_forward
is_prefillr   rA  next_token_logitsnext_token_scoresprobsnext_tokenss$                                       rp   r[   zGenerationMixin._sample~
  s*   V )::-??0EE)77)77"3"K"K$''lZk'l$l!%//	 0M3RD
$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf "
 (oobq1
G"$zz*EJJyO_O_`77AQAQS_`;;LJ[\36BJJ/0{{//3FF$33?DUDdDdDnDn''f BG%44> 223D3S3STM//;1411)=N_R^_LJJ,,-?U^UeUe,f=4==iX<XL@@4@"
'I,IDI  CC#';;#A#A D L
 1 !(q"ax 8 ; ;U]]clcscs ; t !1<M N ' 022F #4"66J$&9=9W9W335^e^p^p]r& {{55(W-E-E,GG(');;99 !668%335) --.?R-H#//1EMMaP#ll+<"E )),@@<STWkSkCll 		9k!T'.B"CLI#[__./#7;LYX^;_:_#_ !5!9!9!;q!@qLG K ,,-?U^UeUe,fN LLN"{{--3'!%'9*?'9%5*?$0$4$45F$G
 
 1'!%1"7$0$4$45F$G  ro   r<  c                 x    t        | j                        }t        j                  | |d   |d   z  g|dd z         S )z=[batch_size, num_beams, ...] -> [batch_size * num_beams, ...]r   r!   r   Nr  r   ri   r  )r<  r   s     rp   _flatten_beam_dimz!GenerationMixin._flatten_beam_dim>  s>     V\\"}}VeAhq&9%:U12Y%FGGro   r  c                 h    t        | j                        }t        j                  | ||g|dd z         S )z=[batch_size * num_beams, ...] -> [batch_size, num_beams, ...]r!   Nr  )r<  r   r  r   s       rp   _unflatten_beam_dimz#GenerationMixin._unflatten_beam_dimD  s3     V\\"}}Vj)%<uQRy%HIIro   c                    t        |j                        t        | j                        k  r=|j                  d      }t        |j                        t        | j                        k  r=t        j                  | |d      }|S )a  
        Gathers the beam slices indexed by beam_indices into new beam array.

        Args:
            tensor (`torch.Tensor`): A tensor containing data to be gathered. The tensor is a 2D or a 3D tensor
                with the two first dimensions depicting the batch and the beam dimensions.
            beam_indices (`torch.Tensor` of shape `(batch_size, num_beams_to_select)`): The indices of the beams to
                select .

        Returns:
            A tensor with the selected beams
        r   r!   )inputr  r1  )r   r   rE  ri   take_along_dim)r<  r{   gathered_tensors      rp   _gather_beamszGenerationMixin._gather_beamsJ  si     ,$$%FLL(99'11"5L ,$$%FLL(99..V\WXYro   #is_early_stop_heuristic_unsatisfiedrunning_beam_scoresbeam_scoresis_sent_finishedr  rV  decoder_prompt_lenearly_stoppinglength_penaltyc	                     |dk(  r|dkD  r||z
  }	n||z
  }	|ddddf   |	|z  z  }
t        j                  |t        j                  |dd      d   d      }| t        j                  |
|kD  d	d      z  S )
uH  
        Determine whether early stopping is possible by checking if the best possible score of running beams
        could still improve upon the finished ones.

        Mechanism:
        - Without a length penalty, beam scores typically decrease as more tokens are generated.
        So, if the *best possible* score from any running beam is already worse than the *worst* finished beam,
        we can safely stop early.
        - With a length penalty, scores may increase with longer sequences. In this case, we use heuristics
        to estimate the best possible score — though this estimate may not always be correct — and stop
        if no further improvement seems likely.

        We apply different heuristics depending on the value of `early_stopping`:
        1. `early_stopping == False`:
        -> Use a heuristic that assumes the best score comes from the current length minus the decoder prompt length.
        -> See detailed discussion: https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565

        2. `early_stopping == "never"`:
        -> Estimate the best score using either `max_length` or `cur_len`, depending on the sign of `length_penalty`.
        -> A positive length penalty favors longer sequences, so we use `max_length` in that case.

        NOTE: the canonical beam search implementation can be replicated with `early_stopping="never"` and
        `length_penalty=0.0`, which are NOT the default flags. The default behavior was empirically found to produce
        better sequences (prior to 2022), and changing it is BC breaking.
        neverrv  Nr!   T)r1  keepdimr       er   )ri   r  r  r
  )r  r  r  r  r  rV  r  r  r  best_hypothetical_lengthbest_possible_running_scoreworst_finished_scores               rp   _check_early_stop_heuristicz+GenerationMixin._check_early_stop_heuristic^  s    J W$#)='14F'F$'.1C'C$&9!RaR%&@D\^lDl&m#${{+;UYY{XYcg=hij=kmst2UYY'*>>BPT6
 
 	
ro   !next_token_hits_stopping_criteriac                     t        j                  |       }t        j                  |      |du z   }t        j                  |       }||z  |z  S )zv
        Beam Search stopping condition -- halts the generation loop if any of these conditions becomes False
        T)ri   r
  r5  )r  r  r  r  improvement_possibleexists_open_beamvalid_continuationss          rp   %_beam_search_has_unfinished_sequencesz5GenerationMixin._beam_search_has_unfinished_sequences  sZ      %yy)LM #YY'78Nd<RST  %yy)JKK#&669LLLro   accumulated_log_probsrunning_sequencesrunning_beam_indicesr]  beams_to_keeprW  c                    |rOt        j                  t        j                  j	                  |d      |      }t        j
                  |d|      }nt        j                  ||      \  }}||	z  }| j                  ||      }| j                  ||      }||	z  }||dddd|f<   t        j                  |
|j                        j                  dd      |z  }||z   }||dddd||z
  f<   |||fS )	a'  
        Get top-K continuations given the accumulated log probs on the next token.

        A few notes to understand what's going on:
        1. Each item in batch has `num_beams` * `vocab_size` candidate continuations. For each item, get the
        top K [K = (number of EOS tokens + 1) * `num_beams`] candidates with the highest accumulated
        log-probabilities, or sample them without replacement using the accumulated scores
        2. We gather the top K (as opposed to `num_beams`, or any number lower than K) here so that we have at
        least `num_beams` sequences remaining to continue the live beam search.
        3. Note that other stopping criteria might result in impossible to continue beams, i.e. all continuations
        selected in this step hit the stopping criteria.
        r   r0  r  r!   )r  r1  indexr   Nrp  )ri   r  r
   r  r  r  topkr  rG  r   r2  )r   r  r  r  r  r  r]  r  r  rW  r   topk_indicestopk_log_probstopk_current_beam_indicestopk_running_beam_indicestopk_running_sequencestopk_idsbatch_offsetbatch_modified_indicess                      rp   _get_top_k_continuationsz(GenerationMixin._get_top_k_continuations  s   <  ,,%%&;%DR_L #\\0E1T`aN+0::6K}+](NL %1J$>!$($6$67KMf$g!!%!3!34EG`!a*, 19q!W}- ||JxGLLRQRSV__!:\!IH^!!Q2D(D"DE57PPPro   r  r  r  c                     ||j                  t        j                        dz  z   }t        j                  ||      d   }| j	                  ||      }| j	                  ||      }	| j	                  ||      }
||	|
fS )z
        Given the top-K continuations, their scores, and whether they hit a stopping criteria, select the
        best non-finished beams to continue beam search in the next iteration.
        r  r  r!   )rH  ri   r  r  r  )r   r  r  r  r  r  topk_running_log_probsnext_topk_indicesr  r  r  s              rp   %_get_running_beams_for_next_iterationz5GenerationMixin._get_running_beams_for_next_iteration  s     "02S2V2VW\WdWd2ehn2n!n!JJ'=KAN ../EGXY"001GIZ[#112KM^_ "57KKKro   top_num_beam_maskc                    |	|
dddf   z  }||dz   |z
  |z  z  }t        j                  |dd      |du z  }||j                  t         j                        dz  z  }|| j                  t         j                        dz  z  }|| dz  z  }t        j                  ||fd      }t        j                  ||fd      }t        j                  ||fd      }t        j                  ||fd      }t        j
                  ||      d   }| j                  ||      }| j                  ||      }| j                  ||      }| j                  ||      }||||fS )	z
        Updates the finished beams if (and only if) there are new completed sequences that have a higher score than
        the current finished sequences.
        Nr!   r   T)axiskeepdimsr  r0  r  )ri   r5  rH  r  r7  r  r  )r   r`   r  r  r  r{   r  r  r  r  r  r  r  r  r  r  did_top_num_beams_just_finishedbeams_in_batch_are_fullmerged_sequencesmerged_scoresmerged_beam_indicesmerged_is_sent_finishedtopk_merged_indicess                          rp   _update_finished_beamsz&GenerationMixin._update_finished_beams  s   2 +LN_`dfg`gNh*h' (GaK:L,LQ_+_`"')),<2PT"UYgkoYo"p144U]]CfLL??CCEMMRU[[[ 	;;vEE
 !99i1G%HaP		;"?QG#ii7P(QWXY"'))-=?^,_ef"g#jj)DQG&&'79LM	((8KL))*=?RS--.EGZ[+|5EEEro   c                    |j                   }|j                  }|j                  }	|j                  }
|j                  }|j
                  }|j                  }|j                  }|j                  }|j                  }|j                  }|j                  }|j                  }|j                  dd \  }}||z  }| j                  j                  dk(  r| j                   j"                  }n| j                  j                  dk(  r| j%                         j&                  }nT| j                  j                  dk(  r| j                   j(                  }n$| j                   j+                         j,                  }|}d}||j                  d   nd}t/        dd|z         |z  }t1        j2                  t1        j4                  |t0        j6                  	      t1        j8                  ||z
  t0        j6                  	      fd
      j;                  |j<                        }| j?                  ||j<                  |      }|j@                  }|rtC        d      |r|rdnd}|r|rdnd}|r|rdnd} |r|	rdnd}!|r|	rdnd}"|r|
rdnd}#|rF| j                   jD                  r0|	r|d   jG                  d      nd}$|
r|d   jG                  d      nd}%|	|xs |d   nd}&t1        jH                  |||f|&t0        jJ                  |j<                        }'| jM                  |||      |'ddddd|f<   |'jO                         jQ                         }(t1        j8                  ||ft0        jR                  |j<                        })d|)ddddf<   t1        jH                  ||fdt0        jR                  |j<                        }*t1        j8                  ||ft0        j6                  |j<                        }+t1        j4                  |dft0        j6                  |j<                        },t1        j8                  ||ft0        j6                  |j<                        }-t1        jH                  ||||z
  fdt0        jT                  |j<                        }.|.jO                         jQ                         } | jW                  |||j<                        r!| jY                  |'ddddd|f         }/ | jZ                  |/fi |}0 | di |0ddi}1| j]                  |1|| j                   jD                        }|r|r|1j^                  dddddf   j;                  dt0        j`                  |j<                        }2tb        jd                  jg                  |2d
      }3 ||/|3      }3|r|r||2jQ                         fz  }|r|r||3jQ                         fz  }|	rY|!| j                   jD                  r|1jh                  fn|1jj                  fz  }!| j                   jD                  r|"|1jl                  fz  }"|
r3|#| j                   jD                  r|1jn                  fn|1jp                  fz  }#~1| jM                  |3||      }3|3|)dddddf   z   }3t1        jr                  |3|||z  f      }3| ju                  |3|'|.|||||||
      \  }4}5}6 || jY                  |5ddddd|dz   f         |      }-| jM                  |-||      }-| jw                  |4|5|6|-|      \  }'})}.| jy                  |(|5|*|4| |6|,|+|-||||||      \  }(}*} }+|jG                  dd      R| jY                  |.d||z
  f         }7t{        | d      r| j}                  |d   |7      |d<   n|d   j                  |7       |dz   }| j                  |,|)|*|+|||||	      },| j                  |,|+|-|       }| jW                  |||j<                        r!| jY                  |(ddd|ddf         }(| jY                  |*ddd|f         }*| jY                  | ddd|ddf         } | dz   j7                         j                  d
      j/                         }8||8z   }9|(ddd|9f   }(| ddd|8f   } |ra|sd}*| j                   jD                  r%t        |(|*||| $%|!|"|#|jG                  d             S t        |(|*||| |!|#|jG                  d      !      S |(S )"a	  
        Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
        can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.

        If it's the first time you're diving into Beam Search, we recommend you read the following blog post:
        https://huggingface.co/blog/how-to-generate (especially the beam search section).

        You can recompute the sequence scores from the individual scores using the `compute_transition_scores` function
        (https://huggingface.co/docs/transformers/main_classes/text_generation#transformers.GenerationMixin.compute_transition_scores)

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size*num_beams, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`:
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            model_kwargs:
                Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
                an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        Nr   MoshiDepthDecoderImageGPTForCausalImageModelingBarkSemanticModelFr   r!   r   r0  z`low_memory=True` is not supported after the beam search refactor. Please check the discussion in #35802 *after the PR got merged*, and add a comment there if your questions are not yet answered.rn   r   rc   rd   r   )
fill_valuer   r   r   r  rp  r%  Tr  r  )
r  r  r  r  r  r]  r  r  rW  r   )r  r  r  r  r  )r`   r  r  r  r{   r  r  r  r  r  r  r  r  r  r  rV   ._reorder_cache)	r  r  r  r  r  rV  r  r  r  )r`   rz   ra   rb   r{   rs   rt   ru   rv   rw   rV   )r`   rz   ra   rb   r{   rc   rd   rV   )Er  r  r#  r$  r  r  r  r]  r  r  rV  r  r  r   r   re   r   audio_vocab_sizeget_output_embeddingsout_featuresoutput_vocab_sizer\  rW  r  ri   r7  r   rM  zerosrH  r   r  
low_memoryr   r   r   fullr  r  detachr   floatint32r  r  r   rM  rb   r  r
   r  r  ru   rc   rv   rw   rd   r  r  r  r
  r   r  reorder_cacher  r  r  r}   ry   ):r   r   rN  r  r  rb  r   r  r  r#  r$  r  r  r  r]  r  r  rV  r  r  batch_size_unflattenedr  r   rW  r  r|  n_eos_tokensr  r  
sequential
all_scoresr  r{   ru   rv   rw   rs   rt   output_fill_valuer  r`   r  r  r  r  r  r  flat_running_sequencesr   model_outputsrb   	log_probsr  r  r  beam_idxmax_generated_lengthoutput_lengths:                                                             rp   r\   zGenerationMixin._beam_search'  s	   \ )::(::-??0EE)77)77"3"K"K%//	*99*99&11
%//	0EE*3//"1*='+y8
>>""&9955J^^$$(HH335BBJ^^$$(;;66J446AAJ$" 1=0H|))!,aAq</09<!IIZZ5::6]U^E^glgqgq8rs
 "Y
 	
 77AQAQS_` '11
t  4RD
3RD
5-rd$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf " @L?WL;LO]_!JJJ/(++##	
 -1,D,DYPZ\e,f!Q.)%,,.446	
 $kk:y*A]f]m]mn%)AqrE"jj*i!8TQVQ\Q\eneueuv !;;
I'>ejjYbYiYij /4jj*aPUPZPZclcscs.t+ -2KK#5::i>N>N-
)
  %zzJ$89bPUP[P[dmdtdt 
 ,224::< ,,-?U^UeUe,f%)%;%;<MaQRT\U\T\n<]%^"=4==>TeXdeL B<BTBM  CC#';;#A#A D L
 1 #))!R(366D^g^n^n6oF 11&b1AI()?KI ' 6<<>"33J*}9??#4"66J$&;;99 '99;+668&
 {{55(]-K-K,MM(');;99 '<<>+99;) 00J	RI!$71d
$CCIi*i*>T1UVI QUPmPm&/"3%9#5#+#%% Qn QMN24M 1B&&'=aMgPQkM>Q'RS1- 150H0H1:}1-
 LPKuKu-'=*C2S# Lv LH24H FJE`E`#'='-)*C4W!12S"3##5-- Fa FBI{L2B.  148D112FsGVhLhGh2ij4!126:6I6I,WhJiks6tL!23 !23AA(KkG262R2R4W$7'!1%#5-- 3S 
3/ &*%O%O3 1	& "G ,,-?U^UeUe,fX **9Q8M9M8Mq5P+QR	,,[<Q=Q<Q9Q-RS--l1>S?S>SUV;V.WX ".!1 7 7 9>>1>EIIK*-AAa-/0	#A'<(<'<$<=" "{{--7'%0%%!-'9*?'9%5*?$0$4$45F$G  5'%0%%!-1"7$0$4$45F$G	 	 ro   c                   ./ |d   st        d      |j                  dv s2d|v r9t        |d   d      r*t        d |d   j                  D              rt        d      | j                  ||||||
|	|      }|j                  }|j                  }|j                  }|j                  }|j                  }|j                  }|r|rd	nd
}|r|rd	nd
}|r|rd	nd
}|r|rd	nd
}|r|rd	nd
}|rF| j                  j                  r0|r|d   j                  d      nd
}|r|d   j                  d      nd
}|j                  d
d \  }}|dkD  rt        d      t!        j"                  |t         j$                  |j&                        }| j)                  ||j&                  |      }d}d}| j+                  |||j&                        r|j                  d   }|j-                  |      \  }} |j/                  | j&                        }| | j/                  | j&                        } |j                  d   |j                  d   z
  }! ||d
      }"t1        j0                  |      }#t3        |#|j                  d   | j                  j                        }#t5        |#|j                  d         }#d|#v rQt!        j6                  |#d   t!        j8                  |||!z   |j&                  t         j$                        fd      |#d<    | j:                  |fi |#}$d|$v r|!dz   |$d<    | d	i |$}%|%j<                  d
d
|! dz
  d
f   j/                  t         j>                  |j&                        ..jA                         /tC        |      dkD  r<tE        |!dz         D ]+  }& ||d
d
d
||&z   f   .d
d
|&d
d
f         .d
d
|&d
d
f<   - |r| tG        || |!.|"      \  }'}(n|rJ.jI                  d      })t!        jJ                  |)dd
d
d
d
f   d      jM                  d      d
d
d
f   }*n.jO                  d      }*|d
d
|d
f   }+|+|*d
d
d
df   k(   jQ                  d      dk  jS                         }(|"r
|(|!k(  r|(dz  }(|*d
d
d
|(dz   f   }'t!        j6                  ||'fd      }||jU                  |'jW                                |j                  d   },|%jX                  j[                  |,dz
         |j]                  |.|(       | j_                  |%|| j                  j                  |(dz         }|r|r|r|(dz   }-|r |ta        .fdtE        |-      D              z  }|r |ta        /fdtE        |-      D              z  }|r|,n|-}-|rr| j                  j                  r3tc        ||%jd                  ||-      }tc        ||%jf                  ||-d      }n)|%jh                  d   tc        ||%jh                  ||-d      }|rG| j                  j                  rtc        ||%jj                  ||-      }ntc        ||%jl                  ||-      }| |||       z  }|jo                         dk(  }d}| j+                  |||j&                        r||jq                          t        |d       rH|jr                  jt                  jv                  d!k(  r%|jx                  |jr                  jt                  _<        |rY| j                  j                  r#t{        |||||||j                  d      "	      S t}        ||||||j                  d      #      S |S )$a  
        Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
        **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a
        candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
        models.

        Parameters:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                The sequence used as a prompt for the generation.
            logits_processor (`LogitsProcessorList`):
                An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
                used to modify the prediction scores of the language modeling head applied at each generation step.
            stopping_criteria (`StoppingCriteriaList`):
                An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
                used to tell if the generation loop should stop.
            generation_config ([`~generation.GenerationConfig`]):
                The generation configuration to be used as parametrization of the decoding method.
            synced_gpus (`bool`):
                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
            streamer (`BaseStreamer`, *optional*):
                Streamer object that will be used to stream the generated sequences. Generated tokens are passed
                through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
            inputs_tensor (`torch.FloatTensor`, *optional*):
                The input tensor for generation. For decoder models, usually `input_ids`. For encoder-decoder models,
                the tensor that produced `model_kwargs["encoder_outputs"]`.
            assistant_model (`PreTrainedModel`, *optional*):
                The model used to assist the generation process. If not provided, the main model will be used.
            assistant_tokenizer (`PreTrainedTokenizerBase`, *optional*):
                The tokenizer used for the assistant model. If not provided, the token space is assumed to be the same.
            tokenizer (`PreTrainedTokenizerBase`, *optional*):
                The tokenizer used for the main model. If not provided, the token space is assumed to be the same.
            model_kwargs:
                Additional model specific keyword arguments will be forwarded to the `forward` function of the model.
                If model is an encoder-decoder model the kwargs should include `encoder_outputs`.

        Return:
            [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or
            `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
            [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
            `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
            `model.config.is_encoder_decoder=True`.
        r  z+assisted generate requires `use_cache=True`)staticr  sliding_windowrV   layersc              3   6   K   | ]  }t        |d d        yw)r   FN)r   )r  ls     rp   r!  z5GenerationMixin._assisted_decoding.<locals>.<genexpr>  s     hAgAGA/7Ags   z=assisted generate is not supported with Static cache classes`)r  r   r  rO  rN  rP  rQ  r   rn   Nr   rc   rd   r   r!   z6assisted generate is only supported for batch_size = 1r   FTrp  r   r;  r   r0  r6  r   r  )r   rB  c              3   6   K   | ]  }d d |d d f     y wr   rn   )r  i
new_logitss     rp   r!  z5GenerationMixin._assisted_decoding.<locals>.<genexpr>P  s     #[AZAJq!Qw$7AZ   c              3   6   K   | ]  }d d |d d f     y wr   rn   )r  r.  r  s     rp   r!  z5GenerationMixin._assisted_decoding.<locals>.<genexpr>R  s      'fLeq(9!Q'(BLer0  )is_decoder_attentionrO  	heuristicr  r  )?r   r  r   r
  r*  rc  r]  r#  r$  r  r  r  r   r   r   r   ri   r   r   r   r  r  get_candidatesrH  r  r)   r*   r7  rG  r   rb   r  r   r   range_speculative_samplingr  r  r  r  r   r  rr  rJ  rV   cropupdate_candidate_strategyrM  rl   _split_model_outputsrv   ru   rc   rw   rd   r  r  rO  r  num_assistant_tokens_schedulenum_assistant_tokensrr   r_   )0r   r   rN  r  r  rb  r  r  rO  rQ  r  r   rb  r]  r#  r$  r  r  r  ra   r  ru   rv   rw   rs   rt   r   r  r  r|  is_first_iterationcandidate_input_idscandidate_logitscandidate_lengthis_done_candidatecandidate_kwargsr   rA  r.  valid_tokens	n_matchesr  selected_tokenscandidate_new_tokensnew_cur_lennewly_added_lengthr/  r  s0                                                 @@rp   r]   z"GenerationMixin._assisted_decodingu  s   t K(JKK115[[-%67BhN_A`AgAghh\]]";;/'+-& 3% < 	
 &//	-??0EE)77)77"3"K"K 0M3RD
$;@QRX\"9>O2VZ'>CW^b #t{{'E'EVg.?!@!D!D\!RmqH\./33ODbf "
 (oobq1
G>UVV$zz*EJJyO_O_`77AQAQS_`"!,,-?U^UeUe,fooa(G 5H4V4VW`4a1!1"5"8"8"E+#3#6#6t{{#C 288;iooa>PP 12Et L  $yy66 "5";";A">@^@^   77GI\IbIbcdIef#335:YY()9:Wg8H.HQZQaQainisist 6 !12 >4==>QfUefL</1AA1E-. *\*G !,<+<q+@+B(BCFFmmI,<,< G J !+ 0 0 2#$q(/!34A*:;NqR_T[^_T_R_O_;`blmnpqstmtbu*vJq!Qw' 5 -9*?'$$%+'i &..2.6E&+&7&7aAgTU&V&^&^_`&abfhibi&jO&0&7&7B&7&?O':1gh;'G$ 43B38O OPXX]_X`cddiik	 %6F)FNI.q/IM//AB 		9l";DI#\--/0#//!,K ##((q9  99)ZQZ[  CC#';;#A#A(1}	 D L 1 '%.]" e#[GYAZ#[[[F %'fERdLe'f"ffJ4F[L^"${{55+?,g.F.FQc,( .B.#66#.15.* !++A.:-A.#..#.15.* ({{550D173P3PRY[m1- 1E173H3H'Se1- $8;LYX^;_:_#_ !5!9!9!;q!@!&k ,,-?U^UeUe,fn LLN '):;#33EEccgrr $88  //AAV #{{--3'!%'9*?'9%5*?$0$4$45F$G
 
 1'!%1"7$0$4$45F$G  ro   c                    dt         j                  j                  _        |j                  }t        j
                  |d d d df   |d      }d|vrt        d      | j                  }| j                  ||      }|r| j                  |j                        }|j                  dd       }d}	|D ]  }
|	|
j                  d   z   }||d d d |f   |d<   t        j                  |	|t         j                  |
j                        |d	<   |d	   j!                  d      |d
<    | j"                  |
fi |} |di |ddi}|j$                  |d<   |}	 ||d<   |d	   dd  dz   |d	<   |j                  d
d       }|S )N@   r   r0  rV   z+Cannot use prefill chunking without a cacher   r   r   r   r   r%  Tr!   rn   )ri   _dynamor   cache_size_limitr  splitr   r   rV  r  rN  r   r   rG  r   r   rE  r   rV   )r   r   r  r   
chunk_sizeinput_chunksr  r  r   r  input_chunkcurrent_lengthr   rA  r   s                  rp   r  z!GenerationMixin._prefill_chunking  s    13-&99
 {{9QV#4jbIL0JKK;;LJ[\ 223D3S3STM%))*:DA'K(;+<+<R+@@N)1??N?@R1S-.-2\\^5::kFXFX.L)* ,88H+I+S+STU+VL(=4==kZ\ZL#ElEEG.5.E.EL*+(K (  *8%&)56F)G)Lq)P%&^T2ro   )NN)NNNN)NNNr   )r!   FN)Fr!   )NNNNNNNN)NF)NNNNNNNNNNNN)FNF)FNNNNN)Pre   rf   rg   rh   r   r   rm  r   PathLikerM  r   r   ri   rj   rm   rl   r   r   r   r   r   dictr   r   r.   r   r  r+  r	  r   r9  staticmethodr@  r   rM  r:   r%   rc  r  r  rO   r  r  r  r  r  r  r  r  r  r  classmethodr   r/   r4  r7  rF  rV  r`  rh  no_gradGenerateOutputr   r  rq  GenerateNonBeamOutputr[   r  r  r  r  r  r  r  r  r
  GenerateBeamOutputr\   r]   r  rn   ro   rp   r   r   g  s/   < LP,0;('/c2;;6F0G'H;( $D>;(
 
;(z (## (   1 12 ( !!1!12	 (
 
u  %"2"22	3 (D=(##=(   1 12=( !!1!12	=(
 
u  %"2"22	3=(D ,0595959M##M "%M !!1!12	M
   1 12M !!1!12Mb *./3:>	A0&A0 u||,A0 tC$567	A0
 
u||Xc]Dell1B,CC	DA0J *./3:>	`&` u||,` tC$567	`
 
		`@"||" ," 38n	"
 
		"H'||' #3-	'
 ,' 
c3h'^ *.9/9/ 9/ 3,-	9/
 !&9/ &9/ 
uc5<<&7!88	99/v #(04 ' '  ' E,,- '
 
uc3h/	0 '  'L $)// 38n/ !	/
 / 
c3h/p 8<@DCGP#+P# ##P# ||	P#
 .P# 38nP# ""34P# ##<=P# &&?@P# 
P#j /38<W[:> $156:AE[+[ 'sm[ $E$4$45	[
 #+8S%,,4Gc4R+S"T[ ##67[ [ tCH~.[ &ell3[ )1(>[ 
[B :>	$+$ $$89$ 56	$
 
$L#/1EEF# .0DDE# 
"$88	9	#R 04!&x!<<x! ell#x! u||,	x!
 x! 
x!t,\04S> 0d*X6!v .2i/#$45i/ %TNi/ 	i/
 
%	&i/V:,s , ,TW ,jo ,\ 
 
 
&K+K K (	K
 K K 
KZZ$ Z 5959	PS+PS $,D>PS u||S012	PSd'c3h '\l 'qu 'Z *.	'   "#	
 
#0& 
c3h&@ U]]_ *.8<:><@W[&*7;-16:AE-1:>[&[ $$45[ ##67	[
 $$89[ #+8S%,,4Gc4R+S"T[ d^[ ""34[ >*[ &ell3[ )1(>[ %TN[ "%X"67[ 
~u///	0[ [z
D t ]b]i]i nr & ]aS))S6>?X6YS			Sv "-1~##~ .~ 0	~
 ,~ ~ >*~ 
$e&6&66	7~@ H%,, H5<< H H
 JELL Jc Jc JV[VbVb J J
 ell %,, 5<<  & ,
-2\\,
"\\,
 \\,
  ,,	,

 ,
 ,
  ,
 dCi(,
 ,
 ,
\ M-2\\M,,M ,1<<M dCi(	M M,4Q$||4Q !<<4Q $ll	4Q
 4Q  4Q 4Q 4Q 4Q 4Q 4Q 
u||U\\5<<7	84QlLL !&L $)<<	L
 ,1<<L L 
u||U\\5<<7	8L,3F<<3F !&3F \\	3F
 3F ll3F $)<<3F .3\\3F  ,,3F ,1<<3F !<<3F 3F 3F  3F 3F  dCi(!3F" 
u||U\\5<<E	F#3Fz "L##L .L 0	L
 ,L L 
!5#3#33	4Lh
 "-1597;CG9=d##d .d 0	d
 ,d d >*d   1 12d ""34d &&?@d 56d 
$e&6&66	7dL	*5+;+; *P` *ro   r   c                 d   | dd| df   }|j                  d      }|ddt        j                  |      |f   j                  dd      }|j                  d      }|ddt        j                  |      |f   j                  dd      }	|	|z  }
t        j                  |
      }||
k  }| j                  d      dk  j                         }|r||k(  r|dz  }|ddd|dz   f   }||fS |j                  d   }|dd|ddf   }||k  rF|dd|ddf   }t        j                  ||z
  d      }|j                  |j                                n|}t        j                  |d      j                  d      dddf   }|dkD  r&t        j                  |ddd|f   |fd      }||fS |}||fS )a  
    Applies sampling as in the speculative decoding paper (https://huggingface.co/papers/2211.17192, algorithm 1). Returns
    the selected tokens, as well as the number of candidate matches.

    NOTE: Unless otherwise stated, the variable names match those in the paper.
    Nr   r0  r   r!   )r  r  )r  ri   rG  r  	rand_liker   r  r   clampdiv_r  r7  )r=  r>  r?  r/  r@  new_candidate_input_idsqq_ir  p_iprobability_ratior_iis_acceptedrC  rB  gamma
p_n_plus_1
q_n_plus_1p_primer  s                       rp   r6  r6    s    2!6F5F5G2GH 	  R (A
Au||,-/FF
G
O
OPQST
UCr"A
Au||,-/FF
G
O
OPQST
UCc	
 //+
,C**K,&&2&.2779I Y*:: 	Q	.q/IM//AB& ""! !&&q)q)Q'
u1i?+Jkk:
#:CGLL' Gg15==a@qI q= 99&=a)m&La%PVXYL "" L""ro   c                 *   t        |       dk(  r<d}|D ]%  }|r|n|j                  d   }||dd|d|f   fz  }' | |fz  } |dz  }||z  }t        |      D ]:  }d}|D ]+  }|r||z   n|j                  d   }||d||dz   d|f   fz  }- | |fz  } < | S )z
    Given the (decoder/cross attentions)/(decoder hidden states) for multiple generated tokens, splits it into a tuple
    where each member corresponds to a single generated token.
    r   rn   r   .Nr!   )r   r   r5  )	rA  new_outputsr  	added_lenr2  	new_tuplelayerlast_dim_sizer.  s	            rp   r9  r9     s     7|q	 E';GRM%XgX~~ =>@@I ! 	I<1W	9	 E+?GaKU[[QS_M%QQY >?AAI ! 	I<  Nro   rQ  )r  r   r   r{  dataclassesr   typingr   r   r   r   r   ri   torch.distributeddistributedrc  	packagingr	   r
   cache_utilsr   r   r   r   r   dynamic_module_utilsr   r   r   r   integrations.deepspeedr   integrations.fsdpr   masking_utilsr   pytorch_utilsr   tokenization_utilsr   utilsr   r   r   r   r   r   r    rb  r"   r#   r$   r%   r&   r'   r(   r)   r*   configuration_utilsr+   r,   r-   r.   r/   continuous_batchingr0   logits_processr1   r2   r3   r4   r5   r6   r7   r8   r9   r:   r;   r<   r=   r>   r?   r@   rA   rB   rC   rD   rE   rF   rG   rH   rI   r  rJ   rK   rL   rM   rN   rO   rP   modeling_utilsrQ   tokenization_utils_baserR   	streamersrS   
get_loggerre   r  accelerate.hooksrT   rU   rD  SAMPLEGREEDY_SEARCHr  BEAM_SAMPLEr  DOLA_GENERATIONr.  GROUP_BEAM_SEARCHCONSTRAINED_BEAM_SEARCHr[  r_   rr   ry   r}   GreedySearchDecoderOnlyOutput"ContrastiveSearchDecoderOnlyOutputSampleDecoderOnlyOutput%ContrastiveSearchEncoderDecoderOutput GreedySearchEncoderDecoderOutputSampleEncoderDecoderOutputBeamSearchDecoderOnlyOutputBeamSampleDecoderOnlyOutputBeamSearchEncoderDecoderOutputBeamSampleEncoderDecoderOutputGreedySearchOutputSampleOutputBeamSearchOutputBeamSampleOutputContrastiveSearchOutputrX  rY  rW  r   r6  r9  rn   ro   rp   <module>r     s      	  ! @ @        @ 6 5 - /  
 
 
  1      6   0A'			H	%E 9  )&&(<""$A%%'R$$&P**,\   ,  ,  ,F ,,; ,, ,,^ (,K (, (,V 5,{ 5, 5,t !: %> "3 (D %#?  9 ; ; !A !A ;=ZZ[ /1HHI79TTU 79TTU  EGi ij  79UUV 8:ZZ[ ,.@@A^5o ^5Bk5#pro   