
from typing import Any, Union, overload

from ..generation import GenerationConfig
from ..utils import is_torch_available
from .base import Pipeline


if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING
    from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan

DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan"


class TextToAudioPipeline(Pipeline):
    """
    Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
    pipeline generates an audio file from an input text and optional other conditional inputs.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256
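
    For example, you can raise that limit for a single call through `generate_kwargs` (a minimal sketch; the
    value 100 is arbitrary):

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hello!", generate_kwargs={"max_new_tokens": 100})
    ```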

    Example:

    ```python
    >>> from transformers import pipeline

    >>> pipe = pipeline(model="suno/bark-small")
    >>> output = pipe("Hey it's HuggingFace on the phone!")

    >>> audio = output["audio"]
    >>> sampling_rate = output["sampling_rate"]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    <Tip>

    You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
    [`TextToAudioPipeline.__call__.generate_kwargs`].

    Example:

    ```python
    >>> from transformers import pipeline

    >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")

    >>> # diversify the music generation by adding randomness with a high temperature and setting a maximum music length
    >>> generate_kwargs = {
    ...     "do_sample": True,
    ...     "temperature": 0.7,
    ...     "max_new_tokens": 35,
    ... }

    >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
    ```

    </Tip>

    This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
    `"text-to-audio"`.

    See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
    TF   )max_new_tokensN)vocodersampling_rateno_processorc                   t        	|   |i | || _        | j                  dk(  rt	        d      d | _        | j                  j                  t        j                         v rE|<t        j                  t              j                  | j                  j                        n|| _        || _        | j
                  %| j
                  j                   j                  | _        | j                  | j                  j                   }| j                  j"                  j%                  dd       }||j'                  |j)                                dD ]H  }t+        ||d       }||| _        t+        |dd       (t+        |j,                  |d       }|B|| _        J | j                  J| j                  s=t/        | j0                  d      r&| j0                  j2                  j                  | _        y y y y )Ntfz5The TextToAudioPipeline is only available in PyTorch.generation_config)sample_rater   codec_configfeature_extractor)super__init__r   	framework
ValueErrorr   model	__class__r   valuesr   from_pretrainedDEFAULT_VOCODER_IDtodevicer   config__dict__getupdateto_dictgetattrr   hasattr	processorr   )
selfr   r   r   argskwargsr%   
gen_configsampling_rate_namer   s
            ^/var/www/html/saasai/venv/lib/python3.12/site-packages/transformers/pipelines/text_to_audio.pyr   zTextToAudioPipeline.__init__a   s   $)&) )>>T!TUU::#H#O#O#QQ ?  //0BCFFtzzGXGXY L +<<#!%!4!4!B!BD% ZZ&&F,,001DdKJ%j0023&F" '0BD I ,)6D&V^T:F$+F,?,?ASUY$ZM$0-:* 'G %d.?.?GDNN\oDp!%!A!A!O!OD Eq.?%    c                 R   t        |t              r|g}| j                  j                  j                  dk(  r?| j
                  j                  j                  dd      ddddd}|j                  |       |}| j                  r| j                  n| j                  } ||fi |dd	i}|S )
Nbarkmax_input_semantic_lengthr   FT
max_length)r7   add_special_tokensreturn_attention_maskreturn_token_type_idspaddingreturn_tensorspt)
isinstancestrr   r%   
model_typer   semantic_configr'   r(   r   	tokenizerr,   )r-   textr/   
new_kwargspreprocessoroutputs         r2   
preprocesszTextToAudioPipeline.preprocess   s    dC 6D::''61 #44DDHHIdfij&+)-).'J f%F)-):):t~~dBfBTBr3   c                    | j                  || j                        }|d   }|d   }| j                  j                         r`| j                  || j                        }d|vr| j                  |d<   |j                  |        | j                  j                  di ||}n>t        |      rt        d|j                                 | j                  di ||d   }| j                  | j                  |      }|S )N)r$   forward_paramsgenerate_kwargsr   zYou're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. For reference, the `generate_kwargs` used here are: r    )_ensure_tensor_on_devicer$   r   can_generater   r(   generatelenr   keysr   )r-   model_inputsr/   rI   rJ   rF   s         r2   _forwardzTextToAudioPipeline._forward   s   ..vdkk.J 01 !23::""$";;OTXT_T_;`O #/97;7M7M 34 !!/2(TZZ((J<J>JF?# KKZK_K_KaJbd 
  TZZA,A.A!DF<<#\\&)Fr3   text_inputsrI   returnc                      y NrK   r-   rS   rI   s      r2   __call__zTextToAudioPipeline.__call__   s    SVr3   c                      y rV   rK   rW   s      r2   rX   zTextToAudioPipeline.__call__   s    _br3   c                 $    t        |   |fi |S )a  
        Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.

        Args:
            text_inputs (`str` or `list[str]`):
                The text(s) to generate.
            forward_params (`dict`, *optional*):
                Parameters passed to the model generation/forward method. `forward_params` are always passed to the
                underlying model.
            generate_kwargs (`dict`, *optional*):
                The dictionary of ad-hoc parametrization of `generation_config` to be used for the generation call. For a
                complete overview of generate, check the [following
                guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
                only passed to the underlying model if the latter is a generative model.

        Return:
            A `dict` or a list of `dict`: The dictionaries have two keys:

            - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
            - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
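
        Example (a minimal sketch, reusing the `suno/bark-small` checkpoint from the class docstring above):

        ```python
        >>> from transformers import pipeline

        >>> tts = pipeline(task="text-to-audio", model="suno/bark-small")
        >>> result = tts("Hello, world!")
        >>> audio, sampling_rate = result["audio"], result["sampling_rate"]
        ```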
        """
        return super().__call__(text_inputs, **forward_params)

    def _sanitize_parameters(
        self,
        preprocess_params=None,
        forward_params=None,
        generate_kwargs=None,
    ):
        if getattr(self, "assistant_model", None) is not None:
            generate_kwargs["assistant_model"] = self.assistant_model
        if getattr(self, "assistant_tokenizer", None) is not None:
            generate_kwargs["tokenizer"] = self.tokenizer
            generate_kwargs["assistant_tokenizer"] = self.assistant_tokenizer

        params = {
            "forward_params": forward_params if forward_params else {},
            "generate_kwargs": generate_kwargs if generate_kwargs else {},
        }

        if preprocess_params is None:
            preprocess_params = {}
        postprocess_params = {}

        return preprocess_params, params, postprocess_params

    def postprocess(self, audio):
        output_dict = {}
        if self.model.config.model_type == "csm":
            waveform_key = "audio"
        else:
            waveform_key = "waveform"

        if self.no_processor:
            if isinstance(audio, dict):
                audio = audio[waveform_key]
            elif isinstance(audio, tuple):
                audio = audio[0]
        else:
            audio = self.processor.decode(audio)

        if isinstance(audio, list):
            output_dict["audio"] = [el.to(device="cpu", dtype=torch.float).numpy() for el in audio]
        else:
            output_dict["audio"] = audio.to(device="cpu", dtype=torch.float).numpy()

        output_dict["sampling_rate"] = self.sampling_rate

        return output_dict