
    <h                        S r SSKrSSKJr  SSKJrJrJr  SSKr	SSK
r
SSKr
SSK
Jr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJrJr  SSKJ r   \RB                  " \"5      r#\\" SS9 " S S\5      5       5       r$\\" SS9 " S S\5      5       5       r%\
RL                  RN                  S 5       r(     S@S jr)S r* " S S\
R                  RV                  5      r, " S S\RV                  5      r- " S S \RV                  5      r. " S! S"\RV                  5      r/ " S# S$\RV                  5      r0 " S% S&\RV                  5      r1 " S' S(\RV                  5      r2 " S) S*\RV                  5      r3 " S+ S,\RV                  5      r4 " S- S.\RV                  5      r5 " S/ S0\RV                  5      r6 " S1 S2\RV                  5      r7 " S3 S4\RV                  5      r8 " S5 S6\5      r9 " S7 S8\RV                  5      r: " S9 S:\RV                  5      r;\ " S; S<\5      5       r<\" S=S9 " S> S?\<5      5       r=S?S</r>g)AzPyTorch VITS model.    N)	dataclass)AnyOptionalUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)GradientCheckpointingLayer)BaseModelOutputModelOutput)PreTrainedModel)auto_docstringlogging   )
VitsConfigz`
    Describes the outputs for the VITS model, with potential hidden states and attentions.
    )custom_introc                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
VitsModelOutput(   a  
waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    The final audio waveform predicted by the model.
sequence_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
    The length in samples of each element in the `waveform` batch.
spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
    The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
    GAN decoder model to obtain the final audio waveform.
Nwaveformsequence_lengthsspectrogramhidden_states
attentions )__name__
__module____qualname____firstlineno____doc__r   r   torchFloatTensor__annotations__r   r   tupler   r   __static_attributes__r       ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/vits/modeling_vits.pyr   r   (   s     -1Hhu(()048hu00186:K% 1 123:8<M8E%"3"345<59Ju00129r)   r   zm
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.
    c                       \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S'   S	rg)
VitsTextEncoderOutput@   aq  
prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    The predicted mean values of the prior distribution for the latent text variables.
prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
    The predicted log-variance values of the prior distribution for the latent text variables.
Nlast_hidden_stateprior_meansprior_log_variancesr   r   r   )r   r    r!   r"   r#   r.   r   r$   r%   r&   r/   r0   r   r'   r   r(   r   r)   r*   r,   r,   @   s~     6:x 1 129/3K%++,37;%"3"34;8<M8E%"3"345<59Ju00129r)   r,   c                     X-   n[         R                  " US S 2S U2S S 24   5      n[         R                  " US S 2US 2S S 24   5      nXE-  nU$ N)r$   tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactss          r*   fused_add_tanh_sigmoid_multiplyr<   U   sP    FJJva,123EMM&LM1!456E=DKr)   c	                    X* :  X:*  -  n	U	) n
[         R                  " U 5      n[         R                  " U 5      n[        R                  " [        R                  " SU-
  5      S-
  5      n[
        R                  R                  USS9nXS'   XS'   X
   X'   SX'   [        X	   XSS24   X)SS24   X9SS24   UUUUUS9	u  X'   X'   X4$ )	ap	  
This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
`tail_bound`, the transform behaves as an identity function.

Args:
    inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Second half of the hidden-states input to the Vits convolutional flow module.
    unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    reverse (`bool`, *optional*, defaults to `False`):
        Whether the model is being run in reverse mode.
    tail_bound (`float`, *optional* defaults to 5):
        Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
        transform behaves as an identity function.
    min_bin_width (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the width dimension for the piecewise rational quadratic function.
    min_bin_height (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the height dimension for the piecewise rational quadratic function.
    min_derivative (`float`, *optional*, defaults to 1e-3):
        Minimum bin value across the derivatives for the piecewise rational quadratic function.
Returns:
    outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
        applied.
    log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
        limits applied.
r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r$   
zeros_likenplogexpr   
functionalr>   _rational_quadratic_spline)rC   rD   rE   rF   rG   rH   rI   rJ   rK   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstants                 r*   (_unconstrained_rational_quadratic_splinerW   ^   s    \ #k1f6JK11v&G""6*KvvbffQ/0145H!}}001Iv0V'/V$(0W%%+%BG"),K&Ga+/a0GH12IJ!9PQ:Q!R#%%
HDG!;#D r)   c	                    Un	U* n
[         R                  " U 5      U
:  d  [         R                  " U 5      U	:  a  [        S5      eUR                  S   nXk-  S:  a  [        SU SU 35      eX{-  S:  a  [        SU SU 35      e[
        R                  R                  USS9nUSXk-  -
  U-  -   n[         R                  " USS9n[
        R                  R                  US	S
SS9nX-
  U-  U
-   nXS'   XS'   USSS24   USSS24   -
  nU[
        R                  R                  U5      -   n[
        R                  R                  USS9nUSX{-  -
  U-  -   n[         R                  " USS9n[
        R                  R                  US	S
SS9nX-
  U-  U
-   nU
US'   U	US'   USSS24   USSS24   -
  nU(       a  UOUnUS==   S-  ss'   [         R                  " U S   U:  SS9S-
  nUS   nUR                  SU5      S   nUR                  SU5      S   nUR                  SU5      S   nX-  nUR                  SU5      S   nUR                  SU5      S   nUSSS24   R                  SU5      S   nUR                  SU5      S   nUU-   SU-  -
  nU(       d  U U-
  U-  nUSU-
  -  nUUUR                  S5      -  UU-  -   -  nUUU-  -   nUUU-  -   n UR                  S5      UUR                  S5      -  SU-  U-  -   USU-
  R                  S5      -  -   -  n![         R                  " U!5      S[         R                  " U5      -  -
  n"U U"4$ U U-
  n#U#U-  n$UUU-
  -  U$-   n%UU-  U$-
  n&U* U#-  n'U&R                  S5      SU%-  U'-  -
  n(U(S:  R                  5       (       d  [!        SU( 35      eSU'-  U&* [         R"                  " U(5      -
  -  n)U)U-  U-   n U)SU)-
  -  nUUU-  -   nUR                  S5      UU)R                  S5      -  SU-  U-  -   USU)-
  R                  S5      -  -   -  n![         R                  " U!5      S[         R                  " U5      -  -
  n"U U"* 4$ )a  
This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

Args:
    inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Second half of the hidden-states input to the Vits convolutional flow module.
    unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
        Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
        layer in the convolutional flow module
    reverse (`bool`):
        Whether the model is being run in reverse mode.
    tail_bound (`float`):
        Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
        transform behaves as an identity function.
    min_bin_width (`float`):
        Minimum bin value across the width dimension for the piecewise rational quadratic function.
    min_bin_height (`float`):
        Minimum bin value across the height dimension for the piecewise rational quadratic function.
    min_derivative (`float`):
        Minimum bin value across the derivatives for the piecewise rational quadratic function.
Returns:
    outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Hidden-states as transformed by the piecewise rational quadratic function.
    log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
        Logarithm of the absolute value of the determinants corresponding to the `outputs`.
z-Input to a transform is not within its domainrA         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rV   rB   )r>   modevaluer?   r@   .Ngư>).N      r   zinvalid discriminant )r$   minmax
ValueErrorshaper   rP   softmaxcumsumr>   softplussumgatherpowrN   allRuntimeErrorsqrt)*rC   rD   rE   rF   rG   rH   rI   rJ   rK   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrT   derivative_numeratorrU   intermediate2intermediate3abcdiscriminantroots*                                             r*   rQ   rQ      sC   X K+Kyy;&%))F*;k*IHII"((,H#%-m_<^_g^hijj 3&.~.>>`ai`jkll]]""#6B"?Fa-"::fDDFV,I!!)jPS!TI*i7+EI#f$gsABw)C"H"55F 2==#9#9:R#SSKmm##$8b#AGN$= =HHGg2.J"":6
RU"VJ+z9KGJ$Jv%Jwab!JsCRCx$88G")JyM'd"iiy)]:CaGGi G&&r73F;O}}R1&9!((W5f=E,,r7+F3K#**2w7?!,S!"W!5!<!<R!I&!QNN2w/7M%(BBQ_TM/)-== %U 3![599Q<%?BSVkBk%kl	!M4I$II"Y%<<*q1&15+o 5561u9//!"445 

 ii 45EIIk<R8RR## !11%5[+<<=M--=L=(uuQx!a%!)+!&&((!6|nEFFA1"uzz,778))O; $D 1!M4I$II*q1&!4+o 5561t8.."334 

 ii 45EIIk<R8RR$$r)   c                   D   ^  \ rS rSrS\S\4U 4S jjrSS jrS rSr	U =r
$ )	VitsWaveNeti0  config
num_layersc           	        > [         TU ]  5         UR                  U l        X l        [        R
                  R                  5       U l        [        R
                  R                  5       U l        [
        R                  " UR                  5      U l        [        [
        R                  R                  S5      (       a%  [
        R                  R                  R                  nO[
        R                  R                  nUR                   S:w  aG  [        R
                  R#                  UR                   SUR                  -  U-  S5      nU" USS9U l        ['        U5       H  nUR(                  U-  nUR*                  U-  U-
  S-  n[        R
                  R#                  UR                  SUR                  -  UR*                  UUS9nU" USS9nU R                  R-                  U5        XRS-
  :  a  SUR                  -  n	OUR                  n	[        R
                  R#                  UR                  U	S5      n
U" U
SS9n
U R                  R-                  U
5        M     g )Nweight_normr   r^   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r$   r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__s              r*   r   VitsWaveNet.__init__1  s   !--$,,.$xx224zz&"8"89288,,m<<((33??K((..K((A-)F)FFL^L^H^akHkmnoJ)*8DDOz"A33Q6H11H<xGAMGxx"..!3!33"66! ' H #8(;HNN!!(+ >!$%(:(:$:!$*$6$6!"XX__V-?-?ARTUVN(hGN  ''7+ #r)   c                    [         R                  " U5      n[         R                  " U R                  /5      nUb  U R	                  U5      n[        U R                  5       H  nU R                  U   " U5      nUb0  US-  U R                  -  nUS S 2XSU R                  -  -   2S S 24   n	O[         R                  " U5      n	[        XyUS   5      n
U R                  U
5      n
U R                  U   " U
5      nX`R                  S-
  :  a;  US S 2S U R                  2S S 24   nX-   U-  nXKS S 2U R                  S 2S S 24   -   nM  XK-   nM     XB-  $ )Nr^   r   r   )r$   rL   	IntTensorr   r   r   r   r   r<   r   r   )r   rC   padding_maskglobal_conditioningrT   num_channels_tensorr   r   cond_offsetglobal_statesr;   res_skip_actsres_actss                r*   forwardVitsWaveNet.forwardZ  sV   ""6*#oot/?/?.@A*"&//2E"Ft'A NN1-f5M".!ed&6&66 3A{STW[WgWgSgEg7gij4j k % 0 0 ?2=QdefQghD<<%D 003D9M??Q&&(,>d.>.>,>)AB +|;!!T5E5E5G2J$KK!1% (( %%r)   c                 z   U R                   S:w  a3  [        R                  R                  R	                  U R
                  5        U R                   H,  n[        R                  R                  R	                  U5        M.     U R                   H,  n[        R                  R                  R	                  U5        M.     g )Nr   )r   r$   r   r   remove_weight_normr   r   r   r   layers     r*   r   VitsWaveNet.remove_weight_normw  st    &&!+HHNN--doo>^^EHHNN--e4 $))EHHNN--e4 *r)   )r   r   r   r   r   r   r2   )r   r    r!   r"   r   intr   r   r   r(   __classcell__r   s   @r*   r   r   0  s&    '8z '8s '8R&:5 5r)   r   c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsPosteriorEncoderi  r   c                 >  > [         TU ]  5         UR                  U l        [        R
                  " UR                  UR                  S5      U l        [        XR                  S9U l        [        R
                  " UR                  U R                  S-  S5      U l        g )Nr   r   r^   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   s     r*   r   VitsPosteriorEncoder.__init__  ss    ",,		&"9"96;M;MqQ"66a6ab6#5#5t7H7H17LaPr)   c                 &   U R                  U5      U-  nU R                  XU5      nU R                  U5      U-  n[        R                  " X@R
                  SS9u  pVU[        R                  " U5      [        R                  " U5      -  -   U-  nXuU4$ )Nr   rZ   )r   r   r   r$   splitr   
randn_likerO   )r   rC   r   r   statsmean
log_stddevsampleds           r*   r   VitsPosteriorEncoder.forward  s    v&5f4GHv&5 ;;u.?.?QG%**40599Z3HHHLXj((r)   )r   r   r   r   r2   	r   r    r!   r"   r   r   r   r(   r   r   s   @r*   r   r     s    Qz Q) )r)   r   c                   H   ^  \ rS rSrSU 4S jjrS	S jrS rS rS rSr	U =r
$ )
HifiGanResidualBlocki  c                   > [         TU ]  5         X@l        [        R                  " [        [        U5      5       Vs/ sH0  n[        R                  " UUUSX5   U R                  X#U   5      S9PM2     sn5      U l	        [        R                  " [        [        U5      5       Vs/ sH,  n[        R                  " UUUSSU R                  US5      S9PM.     sn5      U l
        g s  snf s  snf )Nr   )strider   r   )r   r   leaky_relu_sloper   r   r   lenr   get_paddingconvs1convs2)r   channelsr   r   r   r   _r   s          r*   r   HifiGanResidualBlock.__init__  s     0mm s8}-
 .A 		%[ ,,[1+F .

 mm s8}-
 .A 		 ,,[!< .



s   6C#$2C(c                     X-  U-
  S-  $ )Nr^   r   )r   r   r   s      r*   r    HifiGanResidualBlock.get_padding  s    &1a77r)   c                 >   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nU" U5        M     g Nr   )r   r   r   r   r   r   r   r   r   r   s      r*   apply_weight_norm&HifiGanResidualBlock.apply_weight_norm  si    hh**288,,m<<((33??K[[E ![[E !r)   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H"  n[        R                  R                  U5        M$     g r2   )r   r   r   r   r   r   s     r*   r   'HifiGanResidualBlock.remove_weight_norm  sB    [[EHH''. ![[EHH''. !r)   c                 (   [        U R                  U R                  5       Hm  u  p#Un[        R                  R                  XR                  5      nU" U5      n[        R                  R                  XR                  5      nU" U5      nX-   nMo     U$ r2   )zipr   r   r   rP   
leaky_relur   )r   r   conv1conv2residuals        r*   r   HifiGanResidualBlock.forward  sz    T[[9LE$HMM44]DYDYZM!-0MMM44]DYDYZM!-0M)4M : r)   )r   r   r   )r   )r   r      g?r   )r   r    r!   r"   r   r   r   r   r   r(   r   r   s   @r*   r   r     s!    
>8/ r)   r   c                      ^  \ rS rSrS\4U 4S jjrS rS r SS\R                  S\
\R                     S\R                  4S	 jjrS
rU =r$ )VitsHifiGani  r   c                 `  > [         TU ]  5         Xl        [        UR                  5      U l        [        UR                  5      U l        [        R                  " UR                  UR                  SSSS9U l        [        R                  " 5       U l        [        [!        UR                  UR"                  5      5       Ha  u  nu  p4U R                  R%                  [        R&                  " UR                  SU-  -  UR                  SUS-   -  -  UUXC-
  S-  S95        Mc     [        R                  " 5       U l        [+        [        U R                  5      5       Hp  nUR                  SUS-   -  -  n[!        UR                  UR,                  5       H4  u  pFU R(                  R%                  [/        XTXaR0                  5      5        M6     Mr     [        R                  " WSSSSSS9U l        UR4                  S:w  a2  [        R                  " UR4                  UR                  S5      U l        g g )	N   r   r   )r   r   r   r^   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   s          r*   r   VitsHifiGan.__init__  s   v;;< !6!67		++
 /8V=R=RTZTpTp9q/r+A+NN!!""331=33a!eE +((8Q> 0s s4>>*+A661Q<HH),V-I-I6KiKi)j%%%&:8RZ\s\s&tu *k ,
 8QAaQRY^_((A-		&"?"?A`A`bcdDI .r)   c                 N   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nUR                  5         M     g r   )r   r   r   r   r   r  r  r   r   s      r*   r   VitsHifiGan.apply_weight_norm  sm    hh**288,,m<<((33??K^^E $^^E##% $r)   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H  nUR                  5         M     g r2   )r  r   r   r   r  r   s     r*   r   VitsHifiGan.remove_weight_norm   s<    ^^EHH''. $^^E$$& $r)   r   r   returnc                    U R                  U5      nUb  X0R                  U5      -   n[        U R                  5       H  n[        R
                  R                  X0R                  R                  5      nU R                  U   " U5      nU R                  X@R                  -     " U5      n[        SU R                  5       H)  nXPR                  X@R                  -  U-      " U5      -  nM+     XPR                  -  nM     [        R
                  R                  U5      nU R                  U5      n[        R                  " U5      nU$ )a  
Converts a spectrogram into a speech waveform.

Args:
    spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
        Tensor containing the spectrograms.
    global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
        Tensor containing speaker embeddings, for multispeaker models.

Returns:
    `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
r   )r   r
  r   r  r   rP   r   r   r   r  r  r   r	  r$   r3   )r   r   r   r   r   	res_statejr   s           r*   r   VitsHifiGan.forward  s    k2*)II6I,JJMt))*AMM44]KKD`D`aM NN1-m<Mq+;+;';<]KI1d../^^A0@0@,@1,DEmTT	 0%(8(88M + 00?}5::m,r)   )r
  r   r	  r   r   r  r  r  r2   )r   r    r!   r"   r   r   r   r   r$   r%   r   r   r(   r   r   s   @r*   r   r     s\    "ez "eH&' bf  ,, CKEL]L]C^ 			   r)   r   c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsResidualCouplingLayeri)  r   c                 >  > [         TU ]  5         UR                  S-  U l        [        R
                  " U R                  UR                  S5      U l        [        XR                  S9U l
        [        R
                  " UR                  U R                  S5      U l        g )Nr^   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r	  r   s     r*   r   "VitsResidualCouplingLayer.__init__*  ss    #--2		$"4"4f6H6H!L"66]6]^6#5#5t7I7I1Mr)   c                    [         R                  " XR                  /S-  SS9u  pVU R                  U5      U-  nU R	                  XrU5      nU R                  U5      U-  n[         R                  " U5      n	U(       dP  X[         R                  " U	5      -  U-  -   n[         R                  " XV/SS9n
[         R                  " U	SS/5      nX4$ Xh-
  [         R                  " U	* 5      -  U-  n[         R                  " XV/SS9n
U
S 4$ )Nr^   r   rZ   )
r$   r   r  r   r   r	  rL   rO   catrg   )r   rC   r   r   rG   
first_halfsecond_halfr   r   r   rT   log_determinants               r*   r   !VitsResidualCouplingLayer.forward2  s    "'++f7I7I6JQ6NTU"V
j1L@]BUV~~m,|;%%d+
uyy/D!D|!SSKii 9qAG#ii
QF;O++&-J;1GG,VKii 9qAGD= r)   )r	  r   r  r   NFr   r   s   @r*   r  r  )  s    Nz N! !r)   r  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsResidualCouplingBlockiD  r   c                    > [         TU ]  5         [        R                  " 5       U l        [        UR                  5       H'  nU R                  R                  [        U5      5        M)     g r2   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r  r   r   r   r   s      r*   r   "VitsResidualCouplingBlock.__init__E  sH    ]]_
v556AJJ7?@ 7r)   c                     U(       d8  U R                    H&  nU" XU5      u  p[        R                  " US/5      nM(     U$ [        U R                   5       H%  n[        R                  " US/5      nU" XUSS9u  pM'     U$ )Nr   TrG   )r&  r$   flipreversed)r   rC   r   r   rG   flowr   s          r*   r   !VitsResidualCouplingBlock.forwardK  s}    

 7JK	FQC0 #  !,FQC0 7JTXY	 - r)   )r&  r"  r   r   s   @r*   r$  r$  D  s    Az A	 	r)   r$  c                   >   ^  \ rS rSrSS\4U 4S jjjrSS jrSrU =r$ )VitsDilatedDepthSeparableConviW  r   c                 N  > [         TU ]  5         UR                  nUR                  nUR                  U l        [        R                  " U5      U l        [        R                  " 5       U l
        [        R                  " 5       U l        [        R                  " 5       U l        [        R                  " 5       U l        [        U R
                  5       H  nX5-  nX6-  U-
  S-  nU R                  R                  [        R                   " UUUUUUS95        U R                  R                  [        R                   " XDS5      5        U R                  R                  [        R"                  " U5      5        U R                  R                  [        R"                  " U5      5        M     g )Nr^   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)	r   r   dropout_rater   r   r   r   r   r   s	           r*   r   &VitsDilatedDepthSeparableConv.__init__X  s,   ;;%% ;;zz,/]]_!}}}}}}t'A"~H"-8Q>G%%		 (!) +#%#	   ''		(a(HILLX 67LLX 67 (r)   c                 "   Ub  X-   n[        U R                  5       H  nU R                  U   " X-  5      nU R                  U   " UR	                  SS5      5      R	                  SS5      n[
        R                  R                  U5      nU R                  U   " U5      nU R                  U   " UR	                  SS5      5      R	                  SS5      n[
        R                  R                  U5      nU R                  U5      nX-   nM     X-  $ Nr   rA   )r   r   r6  r8  	transposer   rP   gelur7  r9  r   )r   rC   r   r   r   r   s         r*   r   %VitsDilatedDepthSeparableConv.forwardt  s    *1Ft'A ..q1&2GHM LLOM,C,CAr,JKUUVWY[\MMM..}=M 003MBM LLOM,C,CAr,JKUUVWY[\MMM..}=M LL7M+F ( $$r)   )r6  r7  r   r8  r9  r   )rB   r2   r   r   s   @r*   r1  r1  W  s    8z 8 88% %r)   r1  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsConvFlowi  r   c                   > [         TU ]  5         UR                  U l        UR                  S-  U l        UR                  U l        UR                  U l	        [        R                  " U R
                  U R                  S5      U l        [        U5      U l        [        R                  " U R                  U R
                  U R                  S-  S-
  -  S5      U l        g )Nr^   r   r   )r   r   r   filter_channelsdepth_separable_channelsr  duration_predictor_flow_binsro   duration_predictor_tail_boundrH   r   r   r   r1  conv_ddsr   r   s     r*   r   VitsConvFlow.__init__  s    %11#<<A;; >>		$"4"4d6J6JAN5f=4#7#79K9Kt}}_`O`cdOd9eghir)   c           	         [         R                  " XR                  /S-  SS9u  pVU R                  U5      nU R	                  XrU5      nU R                  U5      U-  nUR                  u  pn
UR                  XSU
5      R                  SSSS5      nUSS U R                  24   [        R                  " U R                  5      -  nUSU R                  SU R                  -  24   [        R                  " U R                  5      -  nUSSU R                  -  S 24   n[        UUUUUU R                  S9u  pn[         R                  " XV/SS9U-  nU(       d  [         R                   " X-  SS/5      nUU4$ US 4$ )	Nr^   r   rZ   rA   r   r   .)rG   rH   )r$   r   r  r   rI  r   rc   reshapepermutero   mathrl   rE  rW   rH   r  rg   )r   rC   r   r   rG   r  r  r   
batch_sizer   lengthrD   rE   rF   rU   rT   r   s                    r*   r   VitsConvFlow.forward  s}   "'++f7I7I6JQ6NTU"V
j1mCVW}5D'1'7'7$
f%--jBOWWXY[\^_abc+C4==,@ADIIdNbNbDcc,S$--!dmmBS2S-STW[W`W`aeauauWvv#0a$--6G6I1I#J #K $$
  ))Z51=L#ii(BQFKOO++D= r)   )rI  r   r   rE  r  ro   rH   r"  r   r   s   @r*   rC  rC    s    	jz 	j! !r)   rC  c                   :   ^  \ rS rSrS\4U 4S jjrSS jrSrU =r$ )VitsElementwiseAffinei  r   c                 ,  > [         TU ]  5         UR                  U l        [        R
                  " [        R                  " U R                  S5      5      U l        [        R
                  " [        R                  " U R                  S5      5      U l	        g Nr   )
r   r   rF  r   r   	Parameterr$   zeros	translate	log_scaler   s     r*   r   VitsElementwiseAffine.__init__  sY    77ekk$--&CDekk$--&CDr)   c                 8   U(       d]  U R                   [        R                  " U R                  5      U-  -   nXR-  n[        R                  " U R                  U-  SS/5      nXV4$ XR                   -
  [        R                  " U R                  * 5      -  U-  nUS 4$ Nr   r^   )rX  r$   rO   rY  rg   )r   rC   r   r   rG   rT   r   s          r*   r   VitsElementwiseAffine.forward  s    nnuyy'@6'IIG,G#ii(E1vNO++.%))T^^O2LL|[GD= r)   )r   rY  rX  r"  r   r   s   @r*   rS  rS    s    Ez E! !r)   rS  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )VitsStochasticDurationPredictori  c                   > [         TU ]  5         UR                  nUR                  n[        R
                  " X3S5      U l        [        R
                  " X3S5      U l        [        UUR                  S9U l
        US:w  a  [        R
                  " X#S5      U l        [        R                  " 5       U l        U R                  R                  [        U5      5        [!        UR"                  5       H'  nU R                  R                  [%        U5      5        M)     [        R
                  " SUS5      U l        [        R
                  " X3S5      U l        [        UUR                  S9U l        [        R                  " 5       U l        U R,                  R                  [        U5      5        [!        UR"                  5       H'  nU R,                  R                  [%        U5      5        M)     g )Nr   )r;  r   )r   r   r   r   r   r   r   r   r1  duration_predictor_dropoutrI  r
  r   r&  r   rS  r   duration_predictor_num_flowsrC  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimrE  r   r   s        r*   r   (VitsStochasticDurationPredictor.__init__  sb   11	 ,,		/AF?QG5::

 >		)a@DI]]_


/78v::;AJJl623 <  YYq/1= ii!L:::

 --/4V<=v::;AOO""<#78 <r)   c                    [         R                  " U5      nU R                  U5      nUb)  [         R                  " U5      nXR                  U5      -   nU R	                  X5      nU R                  U5      U-  nU(       Gd  U R                  U5      nU R                  Xr5      nU R                  U5      U-  n[         R                  " UR                  S5      SUR                  S5      5      R                  UR                  UR                  S9U-  nSn	Un
U R                   H*  nU" XX-   S9u  p[         R                  " U
S/5      n
X-  n	M,     [         R                   " U
SS/SS9u  pU	[         R"                  " [$        R&                  R)                  U5      [$        R&                  R)                  U* 5      -   U-  SS/5      -  n	[         R"                  " S[*        R,                  " S[*        R.                  -  5      US-  -   -  U-  SS/5      U	-
  nU[         R0                  " U5      -
  U-  n[         R,                  " [         R2                  " US5      5      U-  n[         R"                  " U* SS/5      n[         R4                  " X/SS9nU R6                   H*  nU" UX!S9u  nn[         R                  " US/5      nUU-  nM,     [         R"                  " S	[*        R,                  " S[*        R.                  -  5      US-  -   -  U-  SS/5      U-
  nUU-   $ [9        [;        U R6                  5      5      nUS S
 US   /-   n[         R                  " UR                  S5      SUR                  S5      5      R                  UR                  UR                  S9U-  nU H&  n[         R                  " US/5      nU" UX!SS9u  nnM(     [         R                   " USS/SS9u  nnU$ )Nr   r^   )devicedtype)r   r   rZ         gh㈵>g      ?rA   T)r   rG   )r$   detachr   r
  rI  r   rc  re  rd  randnsizetorj  rk  rf  r,  r   rg   r   rP   
logsigmoidrN  rN   pir4   	clamp_minr  r&  listr-  )r   rC   r   r   	durationsrG   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr.  r   r  r  logqlog_determinant_sumlatentsnllr&  r   log_durations                         r*   r   'VitsStochasticDurationPredictor.forward  s   f%v&*"',,/B"Cii(;<<Fv4',6 ..y9M ..}KM //>MM INN1-q)..2CDGGv}}djdpdpGq  -.) 059%I_62! %*JJ/@1#$F!-@- ( ',kk2CaVQR&S#J)UYY))*58P8PR\Q\8]]ammpqstou. ) 		$$((1tww;"7;KQ;N"OPS__bcefagh/0 
 $emmJ&??<OJ5??:t#DETJ"'))ZK!Q"@ii 9qAG

+/+b(**Wqc2#6# #
 ))C488AK#8GQJ#GH<WZ[]^Y_`cvvC:$**-.E#2J%),E FKKNAv{{1~>AA^d^j^jAk  **Wqc2!'<]ab
  $kk'Aq6qAOL!r)   )	r
  rI  r   r   r&  re  rc  rd  rf  )NNFrY   r   r    r!   r"   r   r   r(   r   r   s   @r*   r_  r_    s    9@@  @ r)   r_  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )VitsDurationPredictori&  c                 p  > [         TU ]  5         UR                  nUR                  n[        R
                  " UR                  5      U l        [        R                  " UR                  X2US-  S9U l
        [        R                  " X1R                  S9U l        [        R                  " X3X"S-  S9U l        [        R                  " X1R                  S9U l        [        R                  " USS5      U l        UR"                  S:w  a2  [        R                  " UR"                  UR                  S5      U l        g g )Nr^   )r   epsr   r   )r   r   r4  "duration_predictor_filter_channelsr   r   ra  r   r   r   conv_1r:  layer_norm_epsnorm_1conv_2norm_2projr   r
  )r   r   r   rE  r   s       r*   r   VitsDurationPredictor.__init__'  s    ;; CCzz&"C"CDii 2 2OZeijZjkll?8M8MNii+fgWghll?8M8MNIIoq!4	((A-		&"?"?ASASUVWDI .r)   c                 `   [         R                  " U5      nUb)  [         R                  " U5      nXR                  U5      -   nU R                  X-  5      n[         R                  " U5      nU R                  UR                  SS5      5      R                  SS5      nU R                  U5      nU R                  X-  5      n[         R                  " U5      nU R                  UR                  SS5      5      R                  SS5      nU R                  U5      nU R                  X-  5      nX-  $ r>  )r$   rn  r
  r  relur  r?  r   r  r  r  )r   rC   r   r   s       r*   r   VitsDurationPredictor.forward6  s    f%*"',,/B"Cii(;<<FV23F#V--a45??2Ff%V23F#V--a45??2Ff%601$$r)   )r
  r  r  r   r  r  r  r2   r  r   s   @r*   r  r  &  s    X% %r)   r  c                   8  ^  \ rS rSrSrS\4U 4S jjrS\R                  S\	S\	4S jr
    SS	\R                  S
\\R                     S\\R                     S\\R                     S\S\\R                  \\R                     4   4S jjrS rS rS rSrU =r$ )VitsAttentioniK  z?Multi-headed attention with relative positional representation.r   c                   > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        UR                  U l        U R                  U R
                  -  U l	        U R                  S-  U l
        U R                  U R
                  -  U R                  :w  a&  [        SU R                   SU R
                   S35      e[        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        [        R                  " U R                  U R                  UR                  S9U l        U R                  (       a  [        R&                  " [(        R*                  " SU R                  S-  S-   U R                  5      U R                  -  5      U l        [        R&                  " [(        R*                  " SU R                  S-  S-   U R                  5      U R                  -  5      U l        g g )Nrl  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   r^   )r   r   r   rg  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingrb   r   Linearuse_biask_projv_projq_projout_projrV  r$   ro  	emb_rel_k	emb_rel_vr   s     r*   r   VitsAttention.__init__N  s   ++33//!--$..8}}d*MMDNN*t~~=[\`\j\j[k.t~~.>bB 
 iiV__UiiV__UiiV__U		$..$..vW\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN r)   tensorseq_lenbszc                     UR                  X2U R                  U R                  5      R                  SS5      R	                  5       $ r\  )viewr  r  r?  
contiguous)r   r  r  r  s       r*   _shapeVitsAttention._shapeg  s5    {{3GQQRSUVWbbddr)   r   key_value_statesattention_masklayer_head_maskoutput_attentionsr  c                 0	   UR                  5       u  pgnU R                  U5      U R                  -  n	U R                  U R	                  U5      SU5      n
U R                  U R                  U5      SU5      nX`R                  -  SU R                  4nU R                  XU5      R                  " U6 n	U
R                  " U6 n
UR                  " U6 nU
R                  S5      n[        R                  " XR                  SS5      5      nUR                  5       X`R                  -  X}4:w  a-  [        SX`R                  -  X}4 SUR                  5        35      eU R                  bX  U R                  U R                  U5      n[        R                   " XR                  SS5      5      nU R#                  U5      nUU-  nUbv  UR                  5       USX}4:w  a"  [        SUSX}4 SUR                  5        35      eUR                  X`R                  X}5      U-   nUR                  X`R                  -  X}5      n[$        R&                  R)                  USS	9nUb  UR                  5       U R                  4:w  a*  [        S
U R                  4 SUR                  5        35      eUR                  SSSS5      UR                  X`R                  X}5      -  nUR                  X`R                  -  X}5      nU(       a;  UR                  X`R                  X}5      nUR                  X`R                  -  X}5      nOSn[$        R&                  R+                  XR*                  U R,                  S9n[        R                  " UU5      nUR                  5       X`R                  -  XpR                  4:w  a5  [        SX`R                  XpR                  4 SUR                  5        35      eU R                  bI  U R                  U R.                  U5      nU R1                  U5      n[        R                   " UU5      nUU-  nUR                  X`R                  XpR                  5      nUR                  SS5      nUR3                  XgU R4                  5      nU R7                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelrA   r   r^   z$Attention weights should be of size z	, but is Nrm  z!Attention mask should be of size rZ   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )rp  r  r  r  r  r  r  r  r  r$   bmmr?  rb   r  _get_relative_embeddingsr  matmul'_relative_position_to_absolute_positionr   rP   rd   r   r  r  '_absolute_position_to_relative_positionrL  rg  r  )r   r   r  r  r  r  r  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightss                          r*   r   VitsAttention.forwardj  sR    (,,.a {{=1DLL@ [[]!;RE
{{4;;}#=r3GNN*B>
{{<#>CCZP__j1
#((*5//!$yy/C/CAq/IJ3#7"JJ6nn8Lg7_6` a %%'(* 
 '&*&C&CDNNT[&\##ll<9Z9Z[]_a9bcOGGXLL(L%""$a(BB 7a8R7SS\]k]p]p]r\st  (,,S..'SVddL',,S>>-A7TL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfm?wwL',,S>>-A7TL
 %1$5$5c>>7$\!055cNN6JG]L$(!]]**<<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 '(,(E(EdnnV](^%#KKJW <<(8:STL<'K!&&sNNG]]S!++Aq1 "))#GmmK0111r)   c           	          [        X R                  S-   -
  S5      nUS:  a%  [        R                  R	                  USSX3SS/5      n[        U R                  S-   U-
  S5      nUSU-  -   S-
  nUS S 2XE24   $ )Nr   r   r^   )ra   r  r   rP   r>   )r   relative_embeddingsrP  
pad_lengthslice_start_positionslice_end_positions         r*   r  &VitsAttention._get_relative_embeddings  s    #3#3a#78!<
>"$--"3"34G!QPZhiklIm"n"D$4$4q$8F#BAF1AJ>B"1&:&M#MNNr)   c                 H   UR                  5       u  p#n[        R                  R                  U/ SQ5      nUR	                  X#S-  U-  /5      n[        R                  R                  USUS-
  SS/5      nUR	                  X#S-   SU-  S-
  /5      nUS S 2S U2US-
  S 24   nU$ )N)r   r   r   r   r   r   r^   r   r   rp  r   rP   r>   r  r   xbatch_headsrP  r   x_flatx_finals          r*   r  5VitsAttention._relative_position_to_absolute_position  s    !"Q MMa!34 qj6&9:;""6Avz1a+@A ++{QJF
QGH!WfWfqjl23r)   c           	      >   UR                  5       u  p#n[        R                  R                  USUS-
  SSSS/5      nUR	                  X#SU-  S-
  -  /5      n[        R                  R                  XSSSS/5      nUR	                  X#SU-  /5      S S 2S S 2SS 24   nU$ )Nr   r   r^   r  r  s          r*   r  5VitsAttention._absolute_position_to_relative_position  s    !"Q MMa!VaZAq!!<=F
Q&?@A ""6Aq!+<=++{AJ?@AqrJr)   )r   r  r  rg  r  r  r  r  r  r  r  r  )NNNF)r   r    r!   r"   r#   r   r   r$   Tensorr   r  r   boolr'   r   r  r  r  r(   r   r   s   @r*   r  r  K  s    Irz r2eU\\ eC ec e 481526"'`2||`2 #5<<0`2 !.	`2
 "%,,/`2  `2 
u||Xell33	4`2DO
 
r)   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )VitsFeedForwardi  c                 t  > [         TU ]  5         [        R                  " UR                  UR
                  UR                  5      U l        [        R                  " UR
                  UR                  UR                  5      U l        [        R                  " UR                  5      U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        UR                  S:  a.  UR                  S-
  S-  nUR                  S-  nX#SSSS/U l        g S U l        g )Nr   r^   r   )r   r   r   r   r   ffn_dimffn_kernel_sizer  r  r   activation_dropoutr   
isinstance
hidden_actstrr	   act_fnr   )r   r   pad_left	pad_rightr   s       r*   r   VitsFeedForward.__init__  s    ii 2 2FNNFDZDZ[ii0B0BFDZDZ[zz&";";<f''-- !2!23DK ++DK!!A%..2q8H..!3I$Aq!<DLDLr)   c                    UR                  SSS5      nUR                  SSS5      nX-  nU R                  b)  [        R                  R	                  XR                  5      nU R                  U5      nU R                  U5      nU R                  U5      nX-  nU R                  b)  [        R                  R	                  XR                  5      nU R                  U5      nX-  nUR                  SSS5      nU$ )Nr   r^   r   )	rM  r   r   rP   r>   r  r  r   r  )r   r   r   s      r*   r   VitsFeedForward.forward  s    %--aA6#++Aq!4%4<<#MM--m\\JMM2M2]3%4<<#MM--m\\JMM2%4%--aA6r)   )r  r  r  r   r   r  r   s   @r*   r  r    s     $ r)   r  c            	          ^  \ rS rSrS\4U 4S jjr  S
S\R                  S\R                  S\	\R                     S\
4S jjrS	rU =r$ )VitsEncoderLayeri  r   c                 d  > [         TU ]  5         [        U5      U l        [        R
                  " UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        g )Nr  )r   r   r  	attentionr   r   hidden_dropoutr   r:  r   r  
layer_normr  feed_forwardfinal_layer_normr   s     r*   r   VitsEncoderLayer.__init__  sz    &v.zz&"7"78,,v'9'9v?T?TU+F3 "V-?-?VEZEZ [r)   r   r   r  r  c                    UnU R                  UUUS9u  pU R                  U5      nU R                  XQ-   5      nUnU R                  X5      nU R                  U5      nU R	                  XQ-   5      nU4nU(       a  Xv4-  nU$ )N)r   r  r  )r  r   r  r  r  )r   r   r   r  r  r   r  rT   s           r*   r   VitsEncoderLayer.forward$  s     !&*nn')/ '5 '
# ]3(@A ))-F]3--h.FG "&Gr)   )r  r   r  r  r  r"  )r   r    r!   r"   r   r   r$   r  r%   r   r  r   r(   r   r   s   @r*   r  r    s\    \z \ 26"'|| '' !.	
   r)   r  c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\R                  S\\R                     S\\
   S\\
   S	\\
   S
\\\4   4S jjrSrU =r$ )VitsEncoderiB  r   c                    > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        SU l	        UR                  U l
        g s  snf r"  )r   r   r   r   r   r   num_hidden_layersr  layersgradient_checkpointing	layerdropr(  s      r*   r   VitsEncoder.__init__C  s`    mmuVMeMeGf$gGf!%5f%=Gf$gh&+#)) %hs   A6r   r   r  r  output_hidden_statesreturn_dictr  c                 8   U(       a  SOS nU(       a  SOS nUb  [        X1R                  5      nX-  n[        5       =(       d    [        U 5      n	U R                   H  n
U(       a  Xq4-   n[
        R                  R                  SS5      nU R                  =(       a    XR                  :  nU(       a  U	(       a  U
" UUUUS9nUS   nU(       a  SnU(       d  M~  UWS   4-   nM     X-  nU(       a  Xq4-   nU(       d  [        S XU4 5       5      $ [        UUUS9$ )Nr   r   r   )r  r   r  )NNc              3   ,   #    U H  oc  M  Uv   M     g 7fr2   r   ).0vs     r*   	<genexpr>&VitsEncoder.forward.<locals>.<genexpr>}  s     m$[q$[s   	)r.   r   r   )r   rk  r
   r   r  rM   randomuniformr  r  r'   r   )r   r   r   r  r  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputss                 r*   r   VitsEncoder.forwardJ  s(    #7BD$5b4 %7H[H[\N%402R6LT6R![[M#$58H$H! #%))"3"3Aq"9!]]U0Cnn0TN![ -!#1!-&7	! !.a 0 ,  &9]1=M<O&O#- )0 &4 14D Dm]GZ$[mmm++*
 	
r)   )r   r  r  r  )NNNN)r   r    r!   r"   r   r   r$   r%   r   r  r  r   r'   r   r   r(   r   r   s   @r*   r  r  B  s    *z * 26,0/3&*9
((9
 ''9
 !.	9

 $D>9
 'tn9
 d^9
 
uo%	&9
 9
r)   r  c                      ^  \ rS rSrSrS\4U 4S jjr    SS\R                  S\R                  S\
\R                     S\
\   S	\
\   S
\
\   S\\\R                     \4   4S jjrSrU =r$ )VitsTextEncoderi  zk
Transformer encoder that uses relative positional representation instead of absolute positional encoding.
r   c                 (  > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  5      U l        [        U5      U l
        [        R                  " UR                  UR                  S-  SS9U l        g )Nr^   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   s     r*   r   VitsTextEncoder.__init__  sm    LL):):F<N<NPVPcPcd"6*yy!3!3V5E5E5IWXYr)   	input_idsr   r  r  r  r  r  c           	         U R                  U5      [        R                  " U R                  R                  5      -  nU R                  UUUUUUS9nU(       d  US   OUR                  n	U R                  U	R                  SS5      5      R                  SS5      U-  n
[        R                  " XR                  R                  SS9u  pU(       d  XU4USS  -   nU$ [        U	UUUR                  UR                  S9$ )N)r   r   r  r  r  r  r   r   r^   rZ   )r.   r/   r0   r   r   )r  rN  rl   r   r   r  r.   r  r?  r$   r   r   r,   r   r   )r   r  r   r  r  r  r  r   encoder_outputsr.   r   r/   r0   rT   s                 r*   r   VitsTextEncoder.forward  s     )))4tyyAXAX7YY,,'%)/!5# ' 
 7BOA.GhGh.88A>?II!QOR^^+0;;ukk>S>SYZ+[((7JKo^_^`NaaGN$/# 3)77&11
 	
r)   )r   r  r  r  )NNNT)r   r    r!   r"   r#   r   r   r$   r  r%   r   r  r   r'   r,   r   r(   r   r   s   @r*   r  r    s    Zz Z 26,0/3&*#
<<#
 ''#
 !.	#

 $D>#
 'tn#
 d^#
 
uU\\"$99	:#
 #
r)   r  c                   N    \ rS rSr% \\S'   SrSrSrS\	R                  4S jrSrg	)
VitsPreTrainedModeli  r   vitsr  Tmodulec                    U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        g[        U[        R                  [        R                  45      (       a  [        R                  R                  UR
                  5        UR                  bh  [         R"                  " UR$                  UR&                  UR(                  S   -  -  5      n[        R                  R+                  UR                  U* US9  gg[        U[        R,                  5      (       ad  UR
                  R                  R                  SUS9  UR.                  b2  UR
                  R                  UR.                     R                  5         gg[        U[0        5      (       a  U R                   R2                  (       a  U R                   R4                  U R                   R6                  -  n[        R                  R                  UR8                  US-  S9  [        R                  R                  UR:                  US-  S9  gg[        U[<        5      (       aI  UR>                  R                  R                  5         UR@                  R                  R                  5         gg)	zInitialize the weightsrB   )r   stdNrY   r   )r   r   rl  )r!  )!r   initializer_ranger  r   r  r   datanormal_r   zero_r:  fill_r   r  initkaiming_normal_rN  rl   r3  r   r   uniform_r  padding_idxr  r  r   r  r  r  rS  rX  rY  )r   r  r!  kr  s        r*   _init_weights!VitsPreTrainedModel._init_weights  sQ   kk++fbii((MM&&CS&9{{&  &&( '--KK""$MM$$S)B,>,> ?@@GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' --MM&&CS&9!!-""6#5#56<<> ...{{&&;;22dkk6U6UU 0 0hnE 0 0hnE '  566!!'')!!'') 7r)   r   N)r   r    r!   r"   r   r&   base_model_prefixmain_input_namesupports_gradient_checkpointingr   Moduler,  r(   r   r)   r*   r  r    s)    !O&*#*BII *r)   r  z@
    The complete VITS model, for text-to-speech synthesis.
    c                      ^  \ rS rSrS\4U 4S jjrS r\       SS\\	R                     S\\	R                     S\\   S\\   S	\\   S
\\   S\\	R                     S\\\   \4   4S jj5       rSrU =r$ )	VitsModeli  r   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        U5      U l        UR                  (       a  [        U5      U l        O[        U5      U l        UR                  S:  a0  [        R                  " UR                  UR                   5      U l        [%        U5      U l        UR(                  U l        UR*                  U l        UR,                  U l        U R/                  5         g rU  )r   r   r   r  text_encoderr$  r.  r   decoder"use_stochastic_duration_predictionr_  duration_predictorr  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_raterw  noise_scale_duration	post_initr   s     r*   r   VitsModel.__init__  s     +F3-f5	"6*44&Ef&MD#&;F&CD#"!#f.A.A6C`C`!aD "6f!= $11!--$*$?$?! 	r)   c                     U R                   $ r2   )r5  )r   s    r*   get_encoderVitsModel.get_encoder  s       r)   r  r  
speaker_idr  r  r  labelsr  c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  [	        S5      eU R
                  R                  R                  R                  nUb!  UR                  S5      R                  U5      n	O4[        R                  " U5      R                  S5      R                  U5      n	U R                   R                  S:  a  Ub  SUs=::  a  U R                   R                  :  d(  O  [        SU R                   R                  S-
   S35      e[        U[         5      (       a  [        R"                  " SX0R$                  S	9nU R'                  U5      R                  S5      n
OSn
U R                  UU	UUUUS
9nU(       d  US   OUR(                  nUR+                  SS5      nU	R+                  SS5      n	U(       d  US   OUR,                  nU(       d  US   OUR.                  nU R                   R0                  (       a  U R3                  UU	U
SU R4                  S9nOU R3                  XU
5      nSU R6                  -  n[        R8                  " [        R:                  " U5      U	-  U-  5      n[        R<                  " [        R>                  " USS/5      S5      RA                  5       n[        RB                  " URE                  5       UR                  UR$                  S9nUR                  S5      UR                  S5      :  nUR                  S5      R                  U	R                  5      n[        R                  " U	S5      [        R                  " US5      -  nURF                  u  nnnn[        RH                  " US5      RK                  UU-  S5      n[        RB                  " UUR                  UR$                  S9nUR                  S5      U:  nUR                  UR                  5      RK                  UUU5      nU[L        RN                  RQ                  U/ SQ5      SS2SS24   -
  nUR                  S5      R+                  SS5      U-  n[        RR                  " URU                  S5      U5      R+                  SS5      n[        RR                  " URU                  S5      U5      R+                  SS5      nU[        RV                  " U5      [        R:                  " U5      -  U RX                  -  -   nU R[                  UUU
SS9nUU-  n U R]                  U U
5      n!U!RU                  S5      n!U[^        R`                  " U R                   Rb                  5      -  n"U(       d  U!U"U 4USS -   n#U#$ [e        U!U"U URf                  URh                  S9$ )a'  
speaker_id (`int`, *optional*):
    Which speaker embedding to use. Only used for multispeaker models.
labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
    Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
    computation.

Example:

```python
>>> from transformers import VitsTokenizer, VitsModel, set_seed
>>> import torch

>>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
>>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

>>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

>>> set_seed(555)  # make deterministic

>>> with torch.no_grad():
...     outputs = model(inputs["input_ids"])
>>> outputs.waveform.shape
torch.Size([1, 45824])
```
Nz&Training of VITS is not supported yet.rA   r   r   z Set `speaker_id` in the range 0-.r   )rp  
fill_valuerj  )r  r   r  r  r  r  r^   T)rG   rw  rY   )rk  rj  )r   r   r   r   r   r   r   r+  )r   r   r   r   r   )5r   r  r  use_return_dictNotImplementedErrorr5  r  r   rk  	unsqueezerq  r$   	ones_liker9  rb   r  r   fullrj  r:  r.   r?  r/   r0   r7  r8  r=  r<  ceilrO   rt  rg   longarangera   rc   re   r  r   rP   r>   r  squeezer   rw  r.  r6  rM   prodr   r   r   r   )$r   r  r  rC  r  r  r  rD  
mask_dtypeinput_padding_maskspeaker_embeddingstext_encoder_outputr   r/   r0   r  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskrO  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsr}  r   r   r   rT   s$                                       r*   r   VitsModel.forward  s;   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%&NOO&&33::@@
%!/!9!9"!=!@!@!L!&!;!E!Eb!I!L!LZ!X;;##a'J,B
=T[[%=%== #CDKKD\D\_`D`Caab!cdd*c**"ZZTjQ\Q\]
!%!3!3J!?!I!I"!M!%"//+)/!5# 0 
 7B+A.GZGlGl%//15/99!Q?4?)!,EXEdEd<G1!4M`MtMt;;9922"" 55 3 L  22=VhiLT///::eii58JJ\YZ!OOEIIhA,GKPPR ,,0446>O>U>U^o^v^vw%//25F5P5PQR5SS1;;A>AABTBZBZ[ OO$6:U__M`bd=ee	5>__2
A}l||Hb166zL7PRST,,}HNN8??[))!,|;%((9>>z<Yfg&):):=J\)]^_adbdad^d)ee''*44Q:YF ll4<<?K@JJ1aP#ll4<<?<OPZZ[\^_`#e&6&6{&CeiiPcFd&dgkgwgw&ww))M+>@R\`)a 33<<-?@##A&,rwwt{{7Q7Q/RR!1;?BUVWVXBYYGN-#-;;*55
 	
r)   )
r   r6  r8  r:  r.  rw  r=  r;  r<  r5  )NNNNNNN)r   r    r!   r"   r   r   rA  r   r   r$   r  r   r  r%   r   r'   r   r   r   r(   r   r   s   @r*   r3  r3    s    z 4!  -115$(,0/3&*.2~
ELL)~
 !.~
 SM	~

 $D>~
 'tn~
 d^~
 **+~
 
uSz?*	+~
 ~
r)   r3  )Fg      @MbP?rd  rd  )?r#   rN  dataclassesr   typingr   r   r   numpyrM   r$   torch.utils.checkpointr   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   r   r   configuration_vitsr   
get_loggerr   loggerr   r,   jitscriptr<   rW   rQ   r1  r   r   r   r   r  r$  r1  rC  rS  r_  r  r  r  r  r  r  r  r3  __all__r   r)   r*   <module>rv     sP     ! ' '     ! @ 7 B 9 < - , * 
		H	% 
:k : :$ 
:K : :   G TE%PM5%((// M5`)299 )&;299 ;|U")) Up!		 !6		 &+%BII +%\(!299 (!V!BII !$a bii a H"%BII "%JcBII cL'bii 'T$1 $NA
")) A
H/
bii /
d  */  *  *F 
]
# ]

]
@ -
.r)   