
    <hA                    B   S r SSKrSSKJrJr  SSKrSSKrSSKrSSKJ	r	  SSK
JrJrJr  SSKJr  SSKJrJr  SS	KJr  SS
KJr  SSKJr  SSKJrJr  SSKJr  SSKJrJ r J!r!J"r"J#r#  SSK$J%r%J&r&  SSK'J(r(J)r)  SSK*J+r+J,r,  \)RZ                  " \.5      r/Sr0S\Rb                  S\2S\24S jr3 S}S\Rb                  S\2S\\Rb                     4S jjr4  S~S\5\2\24   S\6S\2S\\Rn                     S\2S\Rp                  4S  jjr9 " S! S"\5      r: " S# S$\5      r; " S% S&\5      r< " S' S(\	Rz                  5      r> " S) S*\	Rz                  5      r? " S+ S,\	Rz                  5      r@ " S- S.\R                  Rz                  5      rA " S/ S0\	Rz                  5      rB " S1 S2\	Rz                  5      rC " S3 S4\	Rz                  5      rD " S5 S6\	Rz                  5      rE " S7 S8\	Rz                  5      rF " S9 S:\	Rz                  5      rG " S; S<\	Rz                  5      rH " S= S>\	Rz                  \%5      rI " S? S@\	Rz                  \%5      rJ " SA SB\	Rz                  \%5      rK " SC SD\	Rz                  5      rL " SE SF\	Rz                  5      rM " SG SH\5      rN " SI SJ\5      rO\( " SK SL\&5      5       rP " SM SN\P5      rQ " SO SP\P5      rR " SQ SR\P5      rS " SS ST\P5      rT " SU SV\P5      rU " SW SX\P5      rV " SY SZ\P5      rW " S[ S\\P5      rX " S] S^\	Rz                  5      rY " S_ S`\	Rz                  5      rZ\(" SaSb9 " Sc Sd\P5      5       r[\(" SeSb9 " Sf Sg\P\5      5       r\        SSh\PS\R                  Si\\R                     S\\Rn                     Sj\6Sk\6Sl\6Sm\\	Rz                     Sn\^So\^S\\R                  \5\R                  \R                  4   4   4Sp jjr_\(" SqSb9 " Sr Ss\P5      5       r`\(" StSb9 " Su Sv\P5      5       ra " Sw Sx\	Rz                  5      rb\(" SySb9 " Sz S{\&5      5       rc/ S|Qrdg)zPyTorch SpeechT5 model.    N)OptionalUnion)nn)BCEWithLogitsLossCrossEntropyLossL1Loss   )ACT2FN)CacheEncoderDecoderCache)GenerationMixin)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutputSeq2SeqSpectrogramOutput)EmbeddingAccessMixinPreTrainedModel)auto_docstringlogging   )SpeechT5ConfigSpeechT5HifiGanConfig	input_idspad_token_iddecoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
Nr   r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosshapeclone
ValueErrormasked_fill_)r   r    r!   shifted_input_idss       f/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/speecht5/modeling_speecht5.pyshift_tokens_rightr+   6   sz     "++IOO<(CRC0668ae4adLMM""#4#<lK    input_valuesreduction_factorattention_maskc                     US:  a!  U SS2US-
  SU24   n Ub  USS2US-
  SU24   nU R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   UR                  US:H  S5        X24$ )zo
Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
r   Nr#         Y        )r$   r%   r&   r(   )r-   r.   r/   shifted_input_valuess       r*   shift_spectrograms_rightr4   F   s     !#A'7!';'O?O'O$OP%+A/?!/C/WGW/W,WXN'11,2D2DE".q#2#v"6"<"<">AB %%&:f&DcJ//r,   r%   	mask_probmask_length	min_masksreturnc           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ sH  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr6   r5   r7   sequence_lengths     r*   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr,   Nr#   dtyper   F)replace)r'   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper=   put_along_axis)r%   r5   r6   r/   r7   
batch_sizerB   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr>   r?   spec_aug_mask_idxdummy_mask_idxoffsetsr@   rA   s    `` `            @@r*   _compute_mask_indicesre   \   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I/c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5NoLayerNormConvLayer   c                 b  > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        g )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr
   feat_extract_activation
activationselfconfiglayer_id	__class__s      r*   ro   %SpeechT5NoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r,   c                 J    U R                  U5      nU R                  U5      nU$ N)rw   ry   r{   hidden_statess     r*   forward$SpeechT5NoLayerNormConvLayer.forward   s$    		-06r,   )ry   rw   rq   rr   r   __name__
__module____qualname____firstlineno__ro   r   __static_attributes____classcell__r~   s   @r*   rg   rg      s    A r,   rg   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5LayerNormConvLayer   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   rj   T)elementwise_affine)rn   ro   rp   rq   rr   r   rs   rt   ru   rv   rw   	LayerNorm
layer_normr
   rx   ry   rz   s      r*   ro   #SpeechT5LayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r,   c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )Nr#   )rw   	transposer   ry   r   s     r*   r   "SpeechT5LayerNormConvLayer.forward   sV    		-0%//B76%//B76r,   ry   rw   rq   r   rr   r   r   r   s   @r*   r   r      s    A r,   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5GroupNormConvLayeri  c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   r   rj   T)
num_groupsnum_channelsaffine)rn   ro   rp   rq   rr   r   rs   rt   ru   rv   rw   r
   rx   ry   	GroupNormr   rz   s      r*   ro   #SpeechT5GroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr,   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )rw   r   ry   r   s     r*   r   "SpeechT5GroupNormConvLayer.forward  s2    		-066r,   r   r   r   r   s   @r*   r   r     s    r  r,   r   c            	         ^  \ rS rSrSrSS\S\S\\   4U 4S jjjrSS\S\S\\   4S jjr\	SS\S\S\\   4S	 jj5       r
\R                  " 5       SS
\R                  S\4S jj5       r SS
\R                  S\S\\   4S jjrSrU =r$ )%SpeechT5SinusoidalPositionalEmbeddingi  zDThis module produces sinusoidal positional embeddings of any length.num_positionsembedding_dimpadding_idxc                    > [         TU ]  5         SU l        X l        X0l        U R                  XR                  -   X#5        g N   )rn   ro   offsetr   r   make_weights)r{   r   r   r   r~   s       r*   ro   .SpeechT5SinusoidalPositionalEmbedding.__init__"  s8    *&-++5}Rr,   num_embeddingsc                     U R                  XU5      n[        U S5      (       a8  UR                  U R                  R                  U R                  R
                  S9nU R                  SUSS9  g )NweightsrE   deviceF
persistent)get_embeddinghasattrtor   rE   r   register_buffer)r{   r   r   r   emb_weightss        r*   r   2SpeechT5SinusoidalPositionalEmbedding.make_weights)  s\    ((T4##%..t||/A/A$,,J]J].^KYFr,   c                    US-  n[         R                  " S5      US-
  -  n[        R                  " [        R                  " U[        R
                  S9R                  5       U* -  5      n[        R                  " U [        R
                  S9R                  5       R                  S5      UR                  S5      -  n[        R                  " [        R                  " U5      [        R                  " U5      /SS9R                  U S5      nUS-  S:X  a,  [        R                  " U[        R                  " U S5      /SS9nUb  SXBSS24'   UR                  [        R                  " 5       5      $ )	z
Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
description in Section 3.5 of "Attention Is All You Need".
r   i'  r   rD   r   dimr#   N)mathlogtorchexprR   int64float	unsqueezecatsincosviewrO   r   get_default_dtype)r   r   r   half_dimembs        r*   r   3SpeechT5SinusoidalPositionalEmbedding.get_embedding1  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r,   r   past_key_values_lengthc                    UR                  5       u  p4U R                  XR                  U5      R                  UR                  5      nU R                  S-   U-   nX`R
                  R                  S5      :  a3  U R                  X`R                  -   U R                  U R                  5        U R
                  R                  SUR                  S5      5      R                  X4S5      R                  5       $ )Nr   r   r#   )size"create_position_ids_from_input_idsr   r   r   r   r   r   r   index_selectr   rK   )r{   r   r   bszseq_lenposition_idsmax_poss          r*   r   -SpeechT5SinusoidalPositionalEmbedding.forwardC  s     ~~'>>yJZJZ\rsvv

 ""Q&0\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGVXY``bbr,   c                     UR                  U5      R                  5       n[        R                  " USS9R	                  U5      U-   U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
symbols are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:
Returns: torch.Tensor
r   r   )ner<   r   cumsumtype_aslong)r{   r   r   r   maskincremental_indicess         r*   r   HSpeechT5SinusoidalPositionalEmbedding.create_position_ids_from_input_idsR  sW     ||K(,,.$||Da8@@FI__cgg"'')K77r,   )r   r   r   r   r   )r   r   r   r   __doc__r<   r   ro   r   staticmethodr   r   no_gradTensorr   r   r   r   r   s   @r*   r   r     s    NSc S# SHUXM S SG3 Gs GQYZ]Q^ G 1c 1# 1HUXM 1 1" ]]_c cs c c bc88478QYZ]Q^8 8r,   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5PositionalConvEmbeddingid  c                   > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        R                  R                  n[        [        R                  R                  S5      (       a$  [        R                  R                  R                  n[        5       (       Ga%  SS KnUR                  R                  U R                  R                   SS9   U" U R                  SSS9U l        S S S 5        [        U R                  S5      (       aU  U R                  R                  R                   R"                  nU R                  R                  R                   R$                  nO,U R                  R&                  nU R                  R(                  nUR                  R+                  X5        UR                  R+                  X5        OU" U R                  SSS9U l        [-        UR
                  5      U l        [0        UR2                     U l        g ! , (       d  f       GN,= f)	Nr   )rk   paddinggroupsweight_normr   )modifier_rankweight)namer   parametrizations)rn   ro   r   rs   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsrw   utilsr   r   r   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterSpeechT5SamePadLayerr   r
   rx   ry   )r{   r|   r   r   r   r   r~   s         r*   ro   (SpeechT5PositionalConvEmbedding.__init__e  s   II6622a777
	 hh**288,,m<<((33??K%''224993C3CST2U'		aH	 Vtyy"4559955<<FF9955<<FF99--99--NN66tFNN66tF#DIIH!DDI+F,J,JK !?!?@ VUs   I
Ic                     UR                  SS5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      nU$ Nr   r   )r   rw   r   ry   r   s     r*   r   'SpeechT5PositionalConvEmbedding.forward  sV    %//15		-0]36%//15r,   )ry   rw   r   r   r   s   @r*   r   r   d  s    AB r,   r   c                   6   ^  \ rS rSrSrSU 4S jjrS rSrU =r$ ) SpeechT5ScaledPositionalEncodingi  uS   
Scaled positional encoding, see §3.2 in https://huggingface.co/papers/1809.08895
c           	        > [         R                  " X25      n[         R                  " SU5      R                  S5      n[         R                  " [         R                  " SUS[         R
                  S9R                  5       [        R                  " S5      U-  * -  5      n[         R                  " UR                  5       U-  5      US S 2SS S24'   [         R                  " UR                  5       U-  5      US S 2SS S24'   UR                  S5      n[        TU ]1  5         U R                  SUSS9  [        R                  " US	9U l        X l        [        R$                  " [         R&                  " S
5      5      U l        g )Nr   r   r   rD   g     @peFr   p      ?)r   rO   rR   r   r   r   r   r   r   r   r   rn   ro   r   r   Dropoutdropoutr   	Parametertensoralpha)r{   r	  r   max_lenr  positiondiv_termr~   s          r*   ro   )SpeechT5ScaledPositionalEncoding.__init__  s   [[&<<7+55a899U\\!S!5;;GMMOTXT\T\]dTehkTkRllmii 08 ;<1add7ii 08 ;<1add7\\!_T2%8zzG,\\%,,s"34
r,   c                     XR                   U R                  S S 2S UR                  S5      24   -  -   nU R                  U5      nU$ )Nr   )r  r  r   r	  )r{   r   s     r*   r   (SpeechT5ScaledPositionalEncoding.forward  s@    JJMchhqkM)9!:::ll3
r,   )r  r   r	  )i  )	r   r   r   r   r   ro   r   r   r   r   s   @r*   r  r    s    5 r,   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )"SpeechT5RelativePositionalEncodingi  c                    > [         TU ]  5         Xl        X l        [        R
                  R                  SU-  U5      U l        g r   )rn   ro   r   
max_lengthr   r   	Embeddingpe_k)r{   r   r  r~   s      r*   ro   +SpeechT5RelativePositionalEncoding.__init__  s4    $HH&&q:~s;	r,   c                 t   UR                   S   n[        R                  " SU5      R                  UR                  [        R
                  S9nUS S 2S 4   US S S 24   -
  nU R                  * X3U R                  * :  '   U R                  S-
  X3U R                  :  '   X0R                  -   nU R                  U5      $ )Nr   r   r   rE   )r%   r   rR   r   r   r   r  r  )r{   r   r   pos_seqs       r*   r   *SpeechT5RelativePositionalEncoding.forward  s    %%a(,,q'*--]5I5IQVQ[Q[-\!T'"WT1W%55/3.>4??**+.2oo.A4??*+OO+yy!!r,   )r   r  r  )i  r   r   s   @r*   r  r    s    <	" 	"r,   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r   i  c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g )Nr   r   r   )rn   ro   num_pad_remove)r{   r   r~   s     r*   ro   SpeechT5SamePadLayer.__init__  s)    #:Q#>!#Car,   c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   r   r   s     r*   r   SpeechT5SamePadLayer.forward  s6    ")!Q0F43F3F2F0F*FGMr,   r$  r   r   s   @r*   r   r     s    K r,   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )SpeechT5FeatureEncoderi  z.Construct the features from raw audio waveformc           	        > [         TU ]  5         UR                  S:X  a?  [        USS9/[	        UR
                  S-
  5       Vs/ sH  n[        XS-   S9PM     sn-   nOUUR                  S:X  a,  [	        UR
                  5       Vs/ sH  n[        XS9PM     nnO[        SUR                   S35      e[        R                  " U5      U l        SU l        S	U l        g s  snf s  snf )
Ngroupr   )r}   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)rn   ro   feat_extract_normr   rN   num_feat_extract_layersrg   r   r'   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)r{   r|   ir.  r~   s       r*   ro   SpeechT5FeatureEncoder.__init__  s    ##w.5fqIJNSTZTrTruvTvNwNNw,V!eDNwN K %%0HMfNlNlHmHm1*6>Hm  K 01I1I0JJst  ==5&+#"Ns   CC#c                 N    U R                  5        H
  nSUl        M     SU l        g )NF)
parametersrequires_gradr0  )r{   params     r*   _freeze_parameters)SpeechT5FeatureEncoder._freeze_parameters  s#    __&E"'E '#r,   c                     US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H  nU" U5      nM     U$ NT)r0  trainingr5  r.  )r{   r-   r   
conv_layers       r*   r   SpeechT5FeatureEncoder.forward  sK    $QW- 4==*.M'**J&}5M + r,   )r0  r.  r/  )
r   r   r   r   r   ro   r7  r   r   r   r   s   @r*   r'  r'    s    8#&$

 
r,   r'  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5FeatureProjectioni  c                 4  > [         TU ]  5         [        R                  " UR                  S   UR
                  S9U l        [        R                  " UR                  S   UR                  5      U l	        [        R                  " UR                  5      U l        g )Nr#   eps)rn   ro   r   r   rp   layer_norm_epsr   Linearr   
projectionr  feat_proj_dropoutr	  r{   r|   r~   s     r*   ro   "SpeechT5FeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r,   c                 n    U R                  U5      nU R                  U5      nU R                  U5      nX4$ r   )r   rE  r	  )r{   r   norm_hidden_statess      r*   r   !SpeechT5FeatureProjection.forward  s7    !__];(:;]300r,   )r	  r   rE  r   r   s   @r*   r?  r?    s    <1 1r,   r?  c                   L  ^  \ rS rSrU 4S jrS r  SS\R                  S\\R                     S\\R                     4S jjrS\S\R                  4S	 jrS
\\R                  \4   4S jr  SS\R                  S\\R                     S\\R                     4S jjrSrU =r$ )SpeechT5SpeechEncoderPreneti  c                   > [         TU ]  5         Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        [!        U5      U l        [%        UR&                  UR(                  -   S-   UR                  UR(                  5      U l        g )Nr2   r   )rn   ro   r|   r'  feature_encoderr?  feature_projectionmask_time_probmask_feature_probr   r
  r   r   r   uniform_masked_spec_embedr   pos_conv_embedr   max_speech_positionsr    pos_sinusoidal_embedrG  s     r*   ro   $SpeechT5SpeechEncoderPrenet.__init__  s    5f=";F"C   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"=fE$I''&*=*==A%
!r,   c                 8    U R                   R                  5         g r   )rO  r7  r{   s    r*   freeze_feature_encoder2SpeechT5SpeechEncoderPrenet.freeze_feature_encoder  s    //1r,   r-   r/   mask_time_indicesc                    U R                  U5      nUR                  SS5      nUb  U R                  UR                  S   U5      nU R	                  U5      u  pTU R                  XSUS9nU R                  U5      nXV-   nUb   UR                  S5      R                  5       nO;[        R                  " UR                  S S [        R                  UR                  S9nU R                  U5      nXX-   nXR4$ )Nr   r   )r]  r/   r   )rO  r   "_get_feature_vector_attention_maskr%   rP  _mask_hidden_statesrU  r   r   r   rO   r   rW  )	r{   r-   r/   r]  extract_featuresr   positional_conv_embeddingpadding_mask positional_sinusoidal_embeddingss	            r*   r   #SpeechT5SpeechEncoderPrenet.forward  s     //=+55a;%!DD &&q)N
 +/*A*ABR*S'00~ 1 
 %)$7$7$F!%A%),,Q/446L ;;}':':2A'>ejjYfYmYmnL+/+D+D\+R(%H,,r,   feature_vector_lengthc                    UR                  SS9S S 2S4   nU R                  U5      R                  [        R                  5      nUR
                  S   n[        R                  " XQ4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )Nr#   r   r   r   r   r   )r    _get_feat_extract_output_lengthsr   r   r   r%   rO   rE   r   rR   fliprP   )r{   rf  r/   non_padded_lengthsoutput_lengthsr\   s         r*   r_  >SpeechT5SpeechEncoderPrenet._get_feature_vector_attention_mask9  s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr,   r^   c                     S n[        U R                  R                  U R                  R                  5       H  u  p4U" XU5      nM     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r   div)r>   rk   rl   s      r*   _conv_out_lengthVSpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengths.<locals>._conv_out_lengthN  s      99\7wWZ[[[r,   )zipr|   rt   ru   )r{   r^   rs  rk   rl   s        r*   ri  <SpeechT5SpeechEncoderPrenet._get_feat_extract_output_lengthsI  sG    
	\
 $'t{{'>'>@W@W#XK,]PM $Y r,   r   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTNr   )r5   r6   r/   r7   r  )r5   r6   r7   r#   )getattrr|   r   rT  r   rE   rQ  r;  re   mask_time_lengthmask_time_min_masksr   r  r   rP   rR  mask_feature_lengthmask_feature_min_masksexpand)r{   r   r]  r/   r\   rA   r   mask_feature_indicess           r*   r`  /SpeechT5SpeechEncoderPrenet._mask_hidden_statesY  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r,   )r|   rO  rP  rT  rU  rW  NN)r   r   r   r   ro   r[  r   r   r   
LongTensorFloatTensorr   r<   r_  r   ri  r`  r   r   r   s   @r*   rM  rM    s    
"2 6:9=	 -ll - !!1!12 - $E$5$56	 -F ]b]m]m  eEDTDTVYDY>Z & :>59	,((, $E$5$56, !!1!12	, ,r,   rM  c                   t   ^  \ rS rSrU 4S jrS r SS\R                  S\\R                     4S jjr	Sr
U =r$ )	SpeechT5SpeechDecoderPreneti  c           	      d  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH@  n[        R                  " US:X  a  UR                  OUR                  UR                  5      PMB     sn5      U l
        [        R                  " UR                  UR                  5      U l        [        UR                  UR                  UR                  5      U l        [        R                  " UR"                  UR                  -   UR                  5      U l        g s  snf r#  )rn   ro   r|   r   r-  rN   speech_decoder_prenet_layersrD  num_mel_binsspeech_decoder_prenet_unitslayersr   final_layerr  positional_dropoutrV  encode_positionsspeaker_embedding_dimspeaker_embeds_layerr{   r|   r1  r~   s      r*   ro   $SpeechT5SpeechDecoderPrenet.__init__  s    mm vBBC
 DA	 		+,6F''v7Y7Y66 D
 99V%G%GI[I[\ @%%''!

 %'IIf.J.JVM_M_._agasas$t!s   AD-c                     [         R                  " US   US9nUR                  S5      R                  UR	                  S5      SS5      n[         R
                  " US:H  US5      S-  SU-
  -  $ )Nr   r  r   )r   	bernoullir   repeatr   where)r{   inputs_embedsr  r   	all_maskss        r*   _consistent_dropout/SpeechT5SpeechDecoderPrenet._consistent_dropout  sd    }Q/15NN1%,,]-?-?-BAqI	{{9>=!<q@AEJJr,   r-   speaker_embeddingsc                 6   UnU R                    HM  n[        R                  R                  U" U5      5      nU R	                  X0R
                  R                  5      nMO     U R                  U5      nU R                  U5      nUb  [        R                  R                  U5      nUR                  S5      R                  SUR                  S5      S5      n[        R                  " X2/SS9n[        R                  R                  U R                  U5      5      nU$ )Nr   r#   r   )r  r   
functionalrelur  r|   speech_decoder_prenet_dropoutr  r  	normalizer   r~  r   r   r   r  )r{   r-   r  r  r*  s        r*   r   #SpeechT5SpeechDecoderPrenet.forward  s     %[[EMM..u]/CDM 44]KKDmDmnM ! ((7--m<)!#!8!89K!L!3!=!=a!@!G!GML^L^_`Lace!f!II}&IrRMMM..t/H/H/WXMr,   )r|   r  r  r  r  r   )r   r   r   r   ro   r  r   r   r   r   r   r   r   s   @r*   r  r    s=    u,K 6:ll %U\\2 r,   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SpeechT5BatchNormConvLayeri  c           	        > [         TU ]  5         US:X  a  UR                  nOUR                  nX!R                  S-
  :X  a  UR                  nOUR                  n[
        R                  " UUUR                  SUR                  S-
  S-  SS9U l        [
        R                  " U5      U l
        X!R                  S-
  :  a  [
        R                  " 5       U l        OS U l        [
        R                  " UR                  5      U l        g )Nr   r   r   F)rk   rl   r   rm   )rn   ro   r  speech_decoder_postnet_unitsspeech_decoder_postnet_layersr   rs   speech_decoder_postnet_kernelrw   BatchNorm1d
batch_normTanhry   r  speech_decoder_postnet_dropoutr	  )r{   r|   r}   rq   rr   r~   s        r*   ro   #SpeechT5BatchNormConvLayer.__init__  s    q= --K ==K;;a??!..L!>>LII<<99A=!C
	 ..6::Q>> ggiDO"DOzz&"G"GHr,   c                     U R                  U5      nU R                  U5      nU R                  b  U R                  U5      nU R                  U5      nU$ r   )rw   r  ry   r	  r   s     r*   r   "SpeechT5BatchNormConvLayer.forward  sJ    		-06??& OOM:M]3r,   )ry   r  rw   r	  r   r   r   s   @r*   r  r    s    I< r,   r  c                   l   ^  \ rS rSrU 4S jrS\R                  4S jrS\R                  4S jrSr	U =r
$ )SpeechT5SpeechDecoderPostneti  c           	        > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  -  5      U l        [        R                  " UR
                  UR                  5      U l	        [        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        g s  snf r   )rn   ro   r|   r   rD  r   r  r.   feat_outprob_outr-  rN   r  r  r  r  s      r*   ro   %SpeechT5SpeechDecoderPostnet.__init__  s    		&"4"4f6I6IFLcLc6cd		&"4"4f6M6MNmm<A&BfBf<gh<gq'2<gh
hs   *Cr   c                    U R                  U5      R                  UR                  S5      SU R                  R                  5      nU R                  U5      nU R                  U5      R                  UR                  S5      S5      nX#U4$ )Nr   r#   )r  r   r   r|   r  postnetr  )r{   r   outputs_before_postnetoutputs_after_postnetlogitss        r*   r   $SpeechT5SpeechDecoderPostnet.forward  s{    !%}!=!B!B=CUCUVWCXZ\^b^i^i^v^v!w $-C D}-22=3E3Ea3H"M%fDDr,   c                     UR                  SS5      nU R                   H  nU" U5      nM     XR                  SS5      -   $ r   )r   r  )r{   r   layer_outputr*  s       r*   r  $SpeechT5SpeechDecoderPostnet.postnet  sB    $..q!4[[E .L !55a;;;r,   )r|   r  r  r  )r   r   r   r   ro   r   r   r   r  r   r   r   s   @r*   r  r    s/    	
EU\\ E<U\\ < <r,   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )SpeechT5TextEncoderPreneti  c                   > [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  5      U l        [        UR                  UR                  UR                  5      U l        g r   )rn   ro   r|   r   r  
vocab_sizer   r    embed_tokensr  r  max_text_positionsr  rG  s     r*   ro   "SpeechT5TextEncoderPrenet.__init__  sc    LL):):F<N<NPVPcPcd @%%%%!
r,   r   c                 J    U R                  U5      nU R                  U5      nU$ r   )r  r  )r{   r   r  s      r*   r   !SpeechT5TextEncoderPrenet.forward  s(    )))4--m<r,   )r|   r  r  )
r   r   r   r   ro   r   r   r   r   r   r   s   @r*   r  r    s    
  r,   r  c                   z   ^  \ rS rSrU 4S jr  SS\R                  S\\R                     S\\	   4S jjr
SrU =r$ )	SpeechT5TextDecoderPreneti  c                   > [         TU ]  5         Xl        [        R                  " UR
                  5      U l        UR                  (       a   [        R                  " UR                  5      OSU l        [        R                  " UR                  UR                  UR                  5      U l        [!        UR"                  UR                  -   S-   UR                  UR                  5      U l        g )Nr  r   )rn   ro   r|   r   r  r  r	  scale_embeddingr   sqrtr   embed_scaler  r  r    r  r   r  embed_positionsrG  s     r*   ro   "SpeechT5TextDecoderPrenet.__init__  s    zz&";";<<B<R<R499V%7%78X[LL):):F<N<NPVPcPcdD%%(;(;;a? 
r,   r   r/   past_key_valuesc                 v   Ub&  UR                  5       nUR                  SUS   5      nO[        S5      eSnUb:  [        U[        5      (       d  US   S   R
                  S   OUR                  5       nU R                  X5      nU R                  U5      U R                  -  nXv-  nU R                  U5      nXr4$ )Nr#   z'You have to specify `decoder_input_ids`r   r   )r   r   r'   
isinstancer   r%   get_seq_lengthr  r  r  r	  )r{   r   r/   r  input_shaper   	positionsr  s           r*   r   !SpeechT5TextDecoderPrenet.forward  s      #..*K!r;r?;IFGG!"& "/599  "1%++B/$335 # ((K	)))4t7G7GG"]3,,r,   )r|   r	  r  r  r  r  )r   r   r   r   ro   r   r   r   r  r   r   r   r   r   s   @r*   r  r    sI    
" 6:+/	-<<- !!1!12- "%	- -r,   r  c                   V   ^  \ rS rSrU 4S jrS\R                  4S jrS rS r	Sr
U =r$ )SpeechT5TextDecoderPostneti:  c                    > [         TU ]  5         Xl        [        R                  " UR
                  UR                  SS9U l        g )NFrm   )rn   ro   r|   r   rD  r   r  lm_headrG  s     r*   ro   #SpeechT5TextDecoderPostnet.__init__;  s3    yy!3!3V5F5FUSr,   r   c                 $    U R                  U5      $ r   r  r   s     r*   r   "SpeechT5TextDecoderPostnet.forward@  s    ||M**r,   c                     U R                   $ r   r  rZ  s    r*   get_output_embeddings0SpeechT5TextDecoderPostnet.get_output_embeddingsC  s     ||r,   c                     Xl         g r   r  r{   new_embeddingss     r*   set_output_embeddings0SpeechT5TextDecoderPostnet.set_output_embeddingsH  s    %r,   )r|   r  )r   r   r   r   ro   r   r   r   r  r  r   r   r   s   @r*   r  r  :  s(    T
+U\\ +
& &r,   r  c                     ^  \ rS rSrSr    SS\S\S\\   S\\   S\\   S\\   4U 4S	 jjjr	       SS
\
R                  S\\
R                     S\\   S\\
R                     S\\
R                     S\\
R                     S\S\\
R                     S\\
R                  \\
R                     \\   4   4S jjrSrU =r$ )SpeechT5AttentioniL  z
Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
https://aclanthology.org/N18-2074.pdf)
	embed_dim	num_headsr	  
is_decoderrm   	layer_idxc                   > [         TU ]  5         Xl        X l        X0l        X-  U l        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l        X`l	        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      r  )rn   ro   r  r  r	  head_dimr'   scalingr  r  r   rD  k_projv_projq_projout_proj)r{   r  r  r	  r  rm   r  r~   s          r*   ro   SpeechT5Attention.__init__R  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr,   r   key_value_statespast_key_valuer/   layer_head_maskposition_biasoutput_attentionscache_positionr8   c	                 r   USLn	UR                  5       u  pnU R                  U5      U R                  -  nUb]  [        U[        5      (       aF  UR
                  R                  U R                  5      nU	(       a  UR                  nOUR                  nOUnU	(       a  UOUnU	(       aQ  UbN  W(       aG  WR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      nU R                  U5      nUR                  U
SU R                   U R"                  5      R%                  SS5      nUR                  U
SU R                   U R"                  5      R%                  SS5      nUbN  U	(       d  UOSnWR'                  UUU R                  SU05      u  nnU	(       a  SUR
                  U R                  '   XR                   -  SU R"                  4nUR                  XU R                   U R"                  5      R%                  SS5      nUR(                  " U6 nUR(                  " U6 nUR(                  " U6 nUR                  S5      n[*        R,                  " UUR%                  SS5      5      nUR                  5       XR                   -  UU4:w  a.  [/        SXR                   -  UU4 SUR                  5        35      eUb  UR1                  5       R                  XR                   -  SU R"                  5      R%                  S	S5      n[*        R2                  " UUR%                  S
S5      5      nUR%                  S	S5      R                  XR                   -  UR                  S	5      UR                  S5      5      nUU-  nUbz  UR                  5       U
SUU4:w  a#  [/        SU
SUU4 SUR                  5        35      eUR                  XR                   UU5      U-   nUR                  XR                   -  UU5      n[4        R6                  R9                  USS9nUb  UR                  5       U R                   4:w  a*  [/        SU R                   4 SUR                  5        35      eUR                  SSSS5      UR                  XR                   UU5      -  nUR                  XR                   -  UU5      nU(       a=  UR                  XR                   UU5      nUR                  XR                   -  UU5      nOSn[4        R6                  R;                  UU R:                  U R<                  S9n[*        R,                  " UU5      nUR                  5       XR                   -  XR"                  4:w  a5  [/        SXR                   XR"                  4 SUR                  5        35      eUR                  XR                   XR"                  5      nUR%                  SS5      nUR)                  XU R>                  5      nU RA                  U5      nUU4$ )z#Input shape: Batch x Time x ChannelNr#   r   r   r  Tz$Attention weights should be of size z	, but is r   r   z!Attention mask should be of size r   z/Head mask for a single layer should be of size )r  r;  z `attn_output` should be of size )!r   r  r  r  r   
is_updatedgetr  cross_attention_cacheself_attention_cacher  keysvaluesr  r  r   r  r  r   updaterZ   r   bmmr'   
contiguousmatmulr   r  softmaxr	  r;  r  r  )r{   r   r  r  r/   r  r  r  r  is_cross_attentionr   tgt_lenr]   query_statesr  curr_past_key_valuecurrent_states
key_statesvalue_states
proj_shapesrc_lenattn_weights	reshape_qrel_pos_biasattn_weights_reshaped
attn_probsattn_outputs                              r*   r   SpeechT5Attention.forwardo  s    .T9',,.a {{=1DLL@%.*=>>+66::4>>J
%*8*N*N'*8*M*M'&4#-?)]."<,33DNNCHHJ.55dnnELLL^4J;;~6L#b$..$--PZZ[\^_`J',,S"dnndmmT^^_`bcdL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn=NN*B>
#((t~~t}}U__`acde#++Z8''4
#++Z8//!$yyz/C/CAq/IJ3#7'"JJ6nn8LgW^7_6` a %%'(*  $$//166s^^7KRQUQ^Q^_iijkmnoI <<	=3J3J2r3RSL'11!Q7<<nn$m&8&8&;]=O=OPQ=RL L(L%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S..'7SVddL',,S>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVdVdfmov?wwL',,S>>-A7GTL
 %1$5$5c>>7T[$\!055cNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CR_R_3`2a b$$&') 
 "&&sNNG]]S!++Aq1 "))#GmmK0111r,   )r	  r  r  r  r  r  r  r  r  r  r  )r2   FTN)NNNNNFN)r   r   r   r   r   r<   r   r   rP   ro   r   r   r   tupler   r   r   r   s   @r*   r  r  L  sI    $'%*#$(CC C %	C
 TNC tnC D>C C@ 48*.152604"'15~2||~2 #5<<0~2 !	~2
 !.~2 "%,,/~2  -~2  ~2 !.~2 
u||Xell3Xe_D	E~2 ~2r,   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SpeechT5FeedForwardi  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  U5      U l        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " X!R                  5      U l        [        R                  " UR                  5      U l        g r   )rn   ro   r   r  activation_dropoutintermediate_dropoutrD  r   intermediate_denser  
hidden_actstrr
   intermediate_act_fnoutput_densehidden_dropoutoutput_dropout)r{   r|   intermediate_sizer~   s      r*   ro   SpeechT5FeedForward.__init__  s    $&JJv/H/H$I!"$))F,>,>@Q"Rf''--'-f.?.?'@D$'-'8'8D$II&79K9KL jj)>)>?r,   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )r  r  r  r   r"  r   s     r*   r   SpeechT5FeedForward.forward  sX    //>00?11-@))-8++M:r,   )r  r  r  r   r"  r   r   s   @r*   r  r    s    @ r,   r  c                      ^  \ rS rSrS\4U 4S jjr    SS\R                  S\\R                     S\\R                     S\\R                     S\	4
S	 jjr
S
rU =r$ )SpeechT5EncoderLayeri  r|   c                   > [         TU ]  5         [        UR                  UR                  UR
                  SS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        XR                  5      U l        [        R                  " UR                  UR                  S9U l        g )NF)r  r  r	  r  rA  )rn   ro   r  r   encoder_attention_headsattention_dropout	attentionr   r  r!  r	  r   rC  r   r  encoder_ffn_dimfeed_forwardfinal_layer_normrG  s     r*   ro   SpeechT5EncoderLayer.__init__	  s    *((44,,	
 zz&"7"78,,v'9'9v?T?TU/8N8NO "V-?-?VEZEZ [r,   r   r/   r  r  r  c                     UnU R                  UUUUUS9u  pU R                  U5      nXa-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nU4nU(       a  X4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`):
        input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`):
        attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
        large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(config.encoder_attention_heads,)`.
    position_bias (`torch.FloatTensor`):
        relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r/   r  r  r  )r,  r	  r   r.  r/  )	r{   r   r/   r  r  r  residualr  outputss	            r*   r   SpeechT5EncoderLayer.forward  s    . !&*nn')+'/ '5 '
# ]3 06%(9(9-(HH--m< "&Gr,   )r,  r	  r.  r/  r   )NNNF)r   r   r   r   r   ro   r   r   r   rP   r   r   r   r   s   @r*   r(  r(    sx    \~ \  262604"',||, !., "%,,/	,
  -,  , ,r,   r(  c                   >  ^  \ rS rSrSS\4U 4S jjjr         SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\	   S\\
   S\\
   S\\R                     4S jjrSrU =r$ )SpeechT5DecoderLayeriE  r|   c                 t  > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  UR                  S9U l        [!        XR"                  5      U l        [        R                  " UR                  UR                  S9U l        g )NT)r  r  r	  r  r  rA  )r	  r  r  )rn   ro   r  r   decoder_attention_headsr+  	self_attnr   r  r!  r	  r   rC  self_attn_layer_normencoder_attnencoder_attn_layer_normr  decoder_ffn_dimr.  r/  )r{   r|   r  r~   s      r*   ro   SpeechT5DecoderLayer.__init__F  s    *((44,,
 zz&"7"78$&LL1C1CI^I^$_!-**,,
 (*||F4F4FFLaLa'b$/8N8NO "V-?-?VEZEZ [r,   r   r/   encoder_hidden_statesencoder_attention_maskr  cross_attn_layer_head_maskr  r  	use_cacher  c           
      r   UnU R                  UUUUUU
S9u  pU R                  U5      nX-   nU R                  U5      nSnUb?  UnU R                  UUUUUUU
S9u  pU R                  U5      nX-   nU R	                  U5      nXR                  U5      -   nU R                  U5      nU4nU(       a  XU4-  nU$ )a  
Args:
    hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
    attention_mask (`torch.FloatTensor`): attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    encoder_hidden_states (`torch.FloatTensor`):
        cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
    encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
        `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
    layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
        `(encoder_attention_heads,)`.
    cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
        size `(decoder_attention_heads,)`.
    past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
)r   r  r/   r  r  r  N)r   r  r/   r  r  r  r  )r9  r	  r:  r;  r<  r.  r/  )r{   r   r/   r?  r@  r  rA  r  r  rB  r  r2  self_attn_weightscross_attn_weightsr3  s                  r*   r   SpeechT5DecoderLayer.forward^  s   > ! ,0>>'))+/) ,: ,
( ]3 011-@ " ,$H040A0A+!65 :-"3- 1B 1-M !LL7M$4M 88GM &(9(9-(HH--m< "+=>>Gr,   )r	  r;  r<  r.  r/  r9  r:  r   )	NNNNNNFTN)r   r   r   r   r   ro   r   r   r   r   rP   r   r   r   r   s   @r*   r6  r6  E  s    \~ \ \6 268<9=26=A*.,1$(15I||I !.I  (5	I
 !) 6I "%,,/I %-U\\$:I !I $D>I D>I !.I Ir,   r6  c                   N    \ rS rSr% \\S'   SrSrSrS\	R                  4S jrSrg	)
SpeechT5PreTrainedModeli  r|   speecht5r-   Tmodulec           
      |   U R                   R                  n[        U[        5      (       a  [        R
                  R                  UR                  R                  SS[        R                  " SUR                  R                  S   UR                  R                  -  -  5      -  S9  [        R
                  R                  UR                  R                  S5        GO%[        U[        5      (       a'  UR                   R"                  R%                  S5        GO[        U[&        5      (       a  [        R                  " SUR(                  R*                  -  5      n[        R
                  R-                  UR(                  R                  U* US9  [        R
                  R-                  UR(                  R                  U* US9  GO=[        U[        R.                  5      (       aW  UR                  R"                  R                  SUS9  UR                  b$  UR                  R"                  R1                  5         GO[        U[        R2                  [        R4                  [        R6                  45      (       aK  UR                  R"                  R1                  5         UR                  R"                  R%                  S5        GO>[        U[        R8                  5      (       a  [        R
                  R;                  UR                  5        UR                  bg  [        R                  " UR<                  UR                  UR                  S   -  -  5      n[        R
                  R-                  UR                  U* US9  O[        U[        R>                  5      (       ab  UR                  R"                  R                  SUS9  UR@                  b1  UR                  R"                  UR@                     R1                  5         [C        US	5      (       a*  [        R
                  R-                  URD                  5        gg)
zInitialize the weightsr   r   r   meanstdr  )abr2   NrT  )#r|   initializer_ranger  r   r   initnormal_rw   r   r   r  rk   in_channels	constant_rm   r  r  datafill_r?  rE  in_featuresrS  rD  zero_r   r   r  rs   kaiming_normal_r   r  r   r   rT  )r{   rJ  rN  ks       r*   _init_weights%SpeechT5PreTrainedModel._init_weights  s   kk++f=>>GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 @AALL##C( 9::		!f//;;;<AGGV..55!qAGGV..33rQ?		**MM&&CS&9{{&  &&(r||R^^ LMMKK""$MM$$S)		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8--MM&&CS&9!!-""6#5#56<<>6.//GGV556 0r,    N)r   r   r   r   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointingr   Moduler\  r   r^  r,   r*   rH  rH    s)    "$O&*#"7BII "7r,   rH  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5Encoderi  zm
Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
r|   c                   > [         TU ]  U5        [        R                  " UR                  UR
                  S9U l        [        R                  " UR                  5      U l	        UR                  U l        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        [#        UR                  UR$                  -  UR&                  5      U l        SU l        U R-                  5         g s  snf )NrA  F)rn   ro   r   r   r   rC  r   r  r!  r	  encoder_layerdrop	layerdropr-  rN   encoder_layersr(  r  r  r*  encoder_max_relative_positionr  r/  	post_init)r{   r|   r]   r~   s      r*   ro   SpeechT5Encoder.__init__  s     ,,v'9'9v?T?TUzz&"7"7811mm5QWQfQfKg$hKga%9&%AKg$hiA&"@"@@&BfBf 
 ',# 	 %is   Dr   r/   	head_maskr  output_hidden_statesreturn_dictr8   c           	         Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUb  [	        X!R
                  5      nU R                  U5      nU R                  U5      nU R                  U5      n[        5       =(       d    [        U 5      nU(       a  SOSn	U(       a  SOSn
Ub`  UR                  5       S   [        U R                  5      :w  a6  [        S[        U R                  5       SUR                  5       S    S35      e[        U R                  5       H  u  pU(       a  X4-   n	SnU R                   (       a$  ["        R$                  " / 5      nXR&                  :  nU(       a  U(       a  U" UUUUb  X;   OSUS9nUS   nU(       a  S	nU(       d  M~  U
WS
   4-   n
M     U(       a  X4-   n	U(       d  [)        S XU
4 5       5      $ [+        UU	U
S9$ )aA  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
        Features extracted from the speech or text input by the encoder prenet.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr^  r   z&The head_mask should be specified for  layers, but it is for .F)r/   r  r  r  r  r   c              3   ,   #    U H  oc  M  Uv   M     g 7fr   r^  .0vs     r*   	<genexpr>*SpeechT5Encoder.forward.<locals>.<genexpr>L  s     m$[q$[s   	last_hidden_stater   
attentions)r|   r  rn  use_return_dictr   rE   r   r	  r  r   r   r   rS   r  r'   	enumerater;  r   rI   rh  r  r   )r{   r   r/   rm  r  rn  ro  r  synced_gpusall_hidden_statesall_self_attentionsidxencoder_layerskip_the_layerdropout_probabilitylayer_outputss                   r*   r   SpeechT5Encoder.forward  s   H 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] %7H[H[\N6]3,,];02R6LT6R"6BD$5b4  ~~"c$++&66 <S=M<N O!(+,A/ 
 #,DKK"8C#$58H$H! #N}}&+jjn#!4~~!E![ -!#1"/7@7LY^RV&7! !.a 0 ,  &9]1=M<O&O#3 #96   14D Dm]GZ$[mmm++*
 	
r,   )r	  r  r/  r   rh  r  NNNNNr   r   r   r   r   r   ro   r   r  r   r   rP   r   r  r   r   r   r   r   s   @r*   re  re    s    ~ ( 26,0,0/3&*f
((f
 !.f
 ELL)	f

 $D>f
 'tnf
 d^f
 
uo%	&f
 f
r,   re  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithSpeechPrenetiU  z
Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
hidden features.
r|   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rn   ro   rM  prenetre  wrapped_encoderrk  rG  s     r*   ro   (SpeechT5EncoderWithSpeechPrenet.__init__[  5     1&9.v6 	r,   r-   r/   rm  r  rn  ro  r8   c           	      T    U R                  X5      u  prU R                  UUUUUUS9nU$ N)r   r/   rm  r  rn  ro  r  r  	r{   r-   r/   rm  r  rn  ro  r   r3  s	            r*   r   'SpeechT5EncoderWithSpeechPrenet.forwardc  sC     )-L(Q%&&')/!5# ' 
 r,   r  r  r  r   s   @r*   r  r  U  s    
~  26,0,0/3&*'' !. ELL)	
 $D> 'tn d^ 
uo%	& r,   r  c                      ^  \ rS rSrSrS\4U 4S jjrS rS r     SS\	R                  S\\	R                     S	\\	R                     S
\\   S\\   S\\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithTextPrenetiz  zt
Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
r|   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rn   ro   r  r  re  r  rk  rG  s     r*   ro   &SpeechT5EncoderWithTextPrenet.__init__  5     /7.v6 	r,   c                 6    U R                   R                  5       $ r   r  get_input_embeddingsrZ  s    r*   r  2SpeechT5EncoderWithTextPrenet.get_input_embeddings      {{//11r,   c                 :    U R                   R                  U5        g r   r  set_input_embeddingsr{   values     r*   r  2SpeechT5EncoderWithTextPrenet.set_input_embeddings      ((/r,   r-   r/   rm  r  rn  ro  r8   c           	      P    U R                  U5      nU R                  UUUUUUS9nU$ r  r  r  s	            r*   r   %SpeechT5EncoderWithTextPrenet.forward  s@     L1&&')/!5# ' 
 r,   r  r  )r   r   r   r   r   r   ro   r  r  r   r  r   r   rP   r   r  r   r   r   r   r   s   @r*   r  r  z  s    ~ 20 26,0,0/3&*'' !. ELL)	
 $D> 'tn d^ 
uo%	& r,   r  c                      ^  \ rS rSrSrS\4U 4S jjr     SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\   S\\\4   4S jjrSrU =r$ )SpeechT5EncoderWithoutPreneti  
This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
[`SpeechT5Model`].
r|   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rn   ro   re  r  rk  rG  s     r*   ro   %SpeechT5EncoderWithoutPrenet.__init__  )     .v6 	r,   r-   r/   rm  r  rn  ro  r8   c           	      *    U R                  UUUUUUS9$ r  r  )r{   r-   r/   rm  r  rn  ro  s          r*   r   $SpeechT5EncoderWithoutPrenet.forward  s.     ##&)/!5# $ 
 	
r,   r  r  r  r   s   @r*   r  r    s    
~  26,0,0/3&*
''
 !.
 ELL)	

 $D>
 'tn
 d^
 
uo%	&
 
r,   r  c                     ^  \ rS rSrSrS\4U 4S jjr            SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\\R                        S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jjrSrU =r$ )SpeechT5Decoderi  zl
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
r|   c           
        > [         TU ]  U5        UR                  U l        [        R
                  " [        UR                  5       Vs/ sH  n[        XS9PM     sn5      U l	        SU l
        U R                  5         g s  snf )N)r  F)rn   ro   decoder_layerdroprh  r   r-  rN   decoder_layersr6  r  r/  rk  r  s      r*   ro   SpeechT5Decoder.__init__  sl     11mmX]^d^s^sXt$uXtST%9&%NXt$uv&+# 	 %vs   	A?r   r/   r?  r@  rm  cross_attn_head_maskr  rB  r  rn  ro  r  r8   c                     U	b  U	OU R                   R                  n	U
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUb  UOU R                   R                  nUR                  5       SS nU R                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnSnU(       aB  [        U[        5      (       d-  [        R                  S5        Sn[        R                  " U5      nUb  UR                  5       OSn[        X-X5      nUb  Ub  [!        XAR"                  US   S9n[%        5       =(       d    ['        U 5      nU
(       a  S	OSnU	(       a  S	OSnU	(       a  Ub  S	OSn[)        XV/S
S/5       Hn  u  nnUc  M  UR                  5       S   [+        U R,                  5      :w  d  M7  [/        SU S[+        U R,                  5       SUR                  5       S    S35      e   [1        U R,                  5       H  u  nnU
(       a  UU4-   nSnU R                  (       a%  [2        R4                  " / 5      nUU R6                  :  nU(       a	  U(       d  M[  U" UUUUUb  UU   OSUb  UU   OSUU	UUS9
nUS   nU	(       d  M  UUS   4-   nUc  M  UUS   4-   nM     U
(       a  UU4-   nU(       a  UR9                  5       nU(       d  [;        S XUUU4 5       5      $ [=        UUUUUS9$ )a  
Args:
    hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
        Features extracted from the speech or text input by the decoder prenet.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
        of the decoder.
    encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
        Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
        selected in `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
        Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
        cross-attention on hidden heads. Mask values selected in `[0, 1]`:

        - 1 indicates the head is **not masked**,
        - 0 indicates the head is **masked**.

    past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
        shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
        shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

        Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
        cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

        If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
        that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
        all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
    inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
        This is useful if you want more control over how to convert `input_ids` indices into associated vectors
        than the model's internal embedding lookup matrix.
    output_attentions (`bool`, *optional*):
        Whether or not to return the attentions tensors of all attention layers. See `attentions` under
        returned tensors for more detail.
    output_hidden_states (`bool`, *optional*):
        Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
        for more detail.
    return_dict (`bool`, *optional*):
        Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
Nr#   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.Tr   )r  r^  rm  r  zThe `z` should be specified for rq  rr  )r@  r  rA  r  r  rB  r  r   r   c              3   .   #    U H  nUc  M  Uv   M     g 7fr   r^  rt  s     r*   rw  *SpeechT5Decoder.forward.<locals>.<genexpr>{  s      wA ws   	)rz  r  r   r{  cross_attentions)r|   r  rn  rB  r|  r   r/  r;  loggerwarning_oncer  r   r   from_legacy_cacher  r   r   rE   r   r   ru  rS   r  r'   r}  r   rI   rh  to_legacy_cacher  r   )r{   r   r/   r?  r@  rm  r  r  rB  r  rn  ro  r  r  return_legacy_cacher   r~  r  r  all_cross_attentions	attn_mask	mask_namer  decoder_layerr  r  r  s                              r*   r   SpeechT5Decoder.forward  sE   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]#((*3B/&&4==##p "	#Z??\
 #'1CCOTOETE`!?!?!Afg:

 !,1G1S%?&(;(;[QS_&" 12R6LT6R #7BD$5b4&7<Q<]rdh %((IKYoKp$q Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03  %r #,DKK"8C#$58H$H! #N}}&+jjn#!4t~~!Ek)%'=3<3H3dI]Ii,@,Eos."3#-M *!,M  &9]1=M<O&O#(4+?=QRCSBU+U(; #9>   1]4D D-==?O ':KM`bvw   9+++*1
 	
r,   )r/  rh  r  NNNNNNNNNNNNr   r   r   r   r   r   ro   r   r   r  r  r   listrP   r   r  r   r   r   r   r   s   @r*   r  r    sE   	~ 	 6:59=A=A,07;=A$(,0/3&*15s
 1 12s
 !!1!12s
  ((9(9:	s

 !))9)9 :s
 ELL)s
 'u||4s
 "$u'8'8"9:s
 D>s
 $D>s
 'tns
 d^s
 !.s
 
u??	@s
 s
r,   r  c                     ^  \ rS rSrSrS\4U 4S jjr             SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\\R                        S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithSpeechPreneti  z|
Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
features.
r|   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rn   ro   r  r  r  wrapped_decoderrk  rG  s     r*   ro   (SpeechT5DecoderWithSpeechPrenet.__init__  r  r,   r-   r/   r?  r@  r  rm  r  r  rB  r  rn  ro  r  r8   c                 \    U R                  X5      nU R                  UUUUUUUU	U
UUUS9nU$ N)r   r/   r?  r@  rm  r  r  rB  r  rn  ro  r  r  r  )r{   r-   r/   r?  r@  r  rm  r  r  rB  r  rn  ro  r  decoder_hidden_statesr3  s                   r*   r   'SpeechT5DecoderWithSpeechPrenet.forward  sS      !%L M&&/)"7#9!5+/!5#) ' 
 r,   r  )NNNNNNNNNNNNNr  r   s   @r*   r  r    sL   
~  5959=A=A59,07;=A$(,0/3&*15!u001! !!1!12!  ((9(9:	!
 !))9)9 :! %U\\2! ELL)! 'u||4! "$u'8'8"9:! D>! $D>! 'tn! d^! !.! 
u??	@! !r,   r  c                     ^  \ rS rSrSrS\4U 4S jjrS rS r            SS\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\\
R                        S\	\   S\	\   S\	\   S\	\   S\	\
R                     S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithTextPreneti  zs
Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
r|   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r   )rn   ro   r  r  r  r  rk  rG  s     r*   ro   &SpeechT5DecoderWithTextPrenet.__init__  r  r,   c                 6    U R                   R                  5       $ r   r  rZ  s    r*   r  2SpeechT5DecoderWithTextPrenet.get_input_embeddings  r  r,   c                 :    U R                   R                  U5        g r   r  r  s     r*   r  2SpeechT5DecoderWithTextPrenet.set_input_embeddings  r  r,   r-   r/   r?  r@  rm  r  r  rB  r  rn  ro  r  r8   c                 b    U R                  XU5      u  pU R                  UUUUUUUUU	U
UUS9nU$ r  r  )r{   r-   r/   r?  r@  rm  r  r  rB  r  rn  ro  r  r  r3  s                  r*   r   %SpeechT5DecoderWithTextPrenet.forward  sY     15LZi0j-&&/)"7#9!5+/!5#) ' 
 r,   r  r  )r   r   r   r   r   r   ro   r  r  r   r   r  r  r   r  rP   r   r  r   r   r   r   r   s   @r*   r  r    s@   ~ 20
 5959=A=A,07;=A$(,0/3&*15 u001  !!1!12   ((9(9:	 
 !))9)9 :  ELL)  'u||4  "$u'8'8"9:  D>  $D>  'tn  d^  !.  
u??	@   r,   r  c                     ^  \ rS rSrSrS\4U 4S jjr            SS\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\\R                        S\\   S\\   S\\   S\\   S\\R                     S\\\4   4S jjrSrU =r$ )SpeechT5DecoderWithoutPreneti  r  r|   c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )rn   ro   r  r  rk  rG  s     r*   ro   %SpeechT5DecoderWithoutPrenet.__init__  r  r,   r-   r/   r?  r@  rm  r  r  rB  r  rn  ro  r  r8   c                 :    U R                  UUUUUUUUU	U
UUS9nU$ r  r  )r{   r-   r/   r?  r@  rm  r  r  rB  r  rn  ro  r  r3  s                 r*   r   $SpeechT5DecoderWithoutPrenet.forward  sD     &&&)"7#9!5+/!5#) ' 
 r,   r  r  r  r   s   @r*   r  r    s6   
~  5959=A=A,07;=A$(,0/3&*15u001 !!1!12  ((9(9:	
 !))9)9 : ELL) 'u||4 "$u'8'8"9: D> $D> 'tn d^ !. 
u??	@ r,   r  c                      ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  S\R                  S\R                  4S	 jrS
 r\S 5       rSrU =r$ )$SpeechT5GuidedMultiheadAttentionLossi  z
Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
Networks with Guided Attention](https://huggingface.co/papers/1710.08969), adapted for multi-head attention.
r|   c                 f   > [         TU ]  5         UR                  U l        UR                  U l        g r   )rn   ro   guided_attention_loss_sigmasigmaguided_attention_loss_scalescalerG  s     r*   ro   -SpeechT5GuidedMultiheadAttentionLoss.__init__%  s(    77
77
r,   r{  input_masksoutput_masksr8   c                 D   U R                  X#UR                  5      nUR                  S5      UR                  S5      -  nUR                  UR                  5      R                  S5      nXA-  n[        R
                  " UR                  U5      5      nU R                  U-  $ )a  
Compute the attention loss.

Args:
    attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
        Batch of multi-head attention weights
    input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
        Input attention mask as booleans.
    output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
        Target attention mask as booleans.

Returns:
    `torch.Tensor` with the loss value
r#   r   r   )_make_guided_attention_masksr   r   r   r   rM  masked_selectr  )r{   r{  r  r  guided_attn_masksmaskslosseslosss           r*   r   ,SpeechT5GuidedMultiheadAttentionLoss.forward*  s    " !==kYcYjYjk&&r*[-B-B2-FF**+55a8"/zz&..u56zzD  r,   c                 j   UR                  S5      nUR                  S5      n[        R                  " [        U5      UR                  S   UR                  S   4US9n[        [        XE5      5       H.  u  nu  pU R                  XU R                  U5      XgS U	2S U24'   M0     UR                  S5      $ )Nr#   r   rh  )
rL   r   rO   rS   r%   r}  ru  _make_guided_attention_maskr  r   )
r{   r  r  r   r^   rl  r  r  ilenolens
             r*   r  ASpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_masksC  s    #+%))"-!KK[)9<;M;Ma;PR]RcRcdeRf(gpvw!*3}+M!NC$373S3STX`d`j`jlr3s5D5%4%/0 "O !**1--r,   c                    [         R                  " [         R                  " XS9[         R                  " XS9SS9u  pEUR                  5       U-  nUR                  5       U -  nS[         R                  " XE-
  S-  * SUS-  -  -  5      -
  $ )Nrh  xy)indexingr  r   )r   meshgridrR   r   r   )r>   output_lengthr  r   grid_ygrid_xs         r*   r  @SpeechT5GuidedMultiheadAttentionLoss._make_guided_attention_maskN  sz    LL5LL6

 -/,.UYY&/a!78ANKLLLr,   )r  r  )r   r   r   r   r   r   ro   r   r  
BoolTensorr   r   r  r   r  r   r   r   s   @r*   r  r    sj    
8~ 8
!++!:?:J:J!Z_ZjZj!	!2	. M Mr,   r  c                      ^  \ rS rSrSrS\4U 4S jjr SS\R                  S\R                  S\R                  S\R                  S	\R                  S
\
\R                     S\R                  4S jjrSrU =r$ )SpeechT5SpectrogramLossiZ  z3
Loss computation used by SpeechT5ForTextToSpeech.
r|   c                 .  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U l        [        5       U l        [        [        R                  " S5      S9U l
        U R                  (       a  [        U5      U l        g g )Ng      @)
pos_weight)rn   ro   use_guided_attention_lossguided_attention_loss_num_headsr.   r   l1_criterionr   r   r  bce_criterionr  attn_criterionrG  s     r*   ro    SpeechT5SpectrogramLoss.__init___  ss    )/)I)I&/5/U/U, & 7 7"H.%,,s:KL))"Fv"ND *r,   r/   r  r  r  labelsr  r8   c           	      T   US:g  nUR                  U5      nUR                  U5      nUR                  U5      nU R                  X55      U R                  X%5      -   nUS S 2S S 2S4   n	[        R                  " U	) S-  [        R                  " U	R                  S5      S5      R                  U	R                  5      /SS9n
U
S S 2SS 24   R                  U	5      n
UR                  U	5      nU R                  XJ5      nX-   nU R                  (       a  [        R                  " U Vs/ sH  oS S 2S U R                  24   PM     snSS9nUS:H  nUS S 2S S 2S4   nU R                  S:  a#  US S 2U R                  S-
  S U R                  24   nU R                  XU5      nUU-  nU$ s  snf )Nr1   r   r  r   r   )r  r
  r   r   rU   r   r   r   r  r  r	  r.   r  )r{   r/   r  r  r  r  r  rc  l1_lossr  stop_labelsbce_lossr  xattnr  r  	attn_losss                     r*   r   SpeechT5SpectrogramLoss.forwardk  s    ' %%l3!7!E!El!S 5 C CL Q ##$9BTEVEVWmEvv Q1W%ii%#uzz%**Q-/K/N/Nu||/\ ]cde!!QR%(66u=%%e, %%f: ! ))99TdeTdq#IT%I%I#I IJTdeklmD(A-K'1a0L$$q(+At/D/Dq/H/aDLaLa/a,ab++D|LIID fs   %F%)r  r  r	  r
  r.   r  r   )r   r   r   r   r   r   ro   r   r  r  r   r   r   r   r   r   s   @r*   r  r  Z  s    
O~ 
O& 9=)(() !& 1 1)  %00	)
 !!) !!) #5#4#45) 
) )r,   r  zv
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    custom_introc            $       z  ^  \ rS rSr  SS\S\\R                     S\\R                     4U 4S jjjrS r	S r
S rS	 rS
 r\               SS\\R                      S\\R"                     S\\R                      S\\R"                     S\\R$                     S\\R$                     S\\R                      S\\\\R$                           S\\\\R$                           S\\   S\\R$                     S\\   S\\   S\\   S\\R                      S\\\R$                     \4   4 S jj5       rSrU =r$ )SpeechT5Modeli  r|   encoderdecoderc                    > [         TU ]  U5        Xl        Uc  [        U5      OUU l        Uc  [        U5      OUU l        U R                  5         g)z
encoder (`PreTrainedModel`, *optional*):
    The encoder model to use.
decoder (`PreTrainedModel`, *optional*):
    The decoder model to use.
N)rn   ro   r|   r  r  r  r  rk  )r{   r|   r  r  r~   s       r*   ro   SpeechT5Model.__init__  sK     	 ?F3F;T[?F3F;T[ 	r,   c                     [        U R                  [        5      (       a  U R                  R                  5       $ [        U R                  [
        5      (       a  U R                  R                  5       $ [        er   )r  r  r  r  r  r  NotImplementedErrorrZ  s    r*   r  "SpeechT5Model.get_input_embeddings  sR    dll$ABB<<4466dll$ABB<<4466!!r,   c                     [        U R                  [        5      (       a  U R                  R                  U5        [        U R                  [
        5      (       a  U R                  R                  U5        g g r   )r  r  r  r  r  r  r  s     r*   r  "SpeechT5Model.set_input_embeddings  sP    dll$ABBLL--e4dll$ABBLL--e4 Cr,   c                     U R                   $ r   )r  rZ  s    r*   get_encoderSpeechT5Model.get_encoder      ||r,   c                     U R                   $ r   )r  rZ  s    r*   get_decoderSpeechT5Model.get_decoder  r'  r,   c                     [        U R                  [        5      (       a%  U R                  R                  R	                  5         ggz
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)r  r  r  r  r[  rZ  s    r*   r[  $SpeechT5Model.freeze_feature_encoder  s2    
 dll$CDDLL668 Er,   r-   r/   decoder_input_valuesdecoder_attention_maskrm  decoder_head_maskr  encoder_outputsr  rB  r  r  rn  ro  r  r8   c                    Ub  UOU R                   R                  nUb  UOU R                   R                  nU
b  U
OU R                   R                  n
Ub  UOU R                   R                  nUc  U R                  UUUUUUS9nORU(       aK  [        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS9nUbV  [        U R
                  [        5      (       a7  U R
                  R                  R                  US   R                  S   U5      nOUn[        U R                  [        5      (       a  SU0nO0 nU R                  " S
UUUS   UUUU	U
UUUUS.UD6nU(       d  UU-   $ [        UR                   UR"                  UR$                  UR&                  UR(                  UR                   UR$                  UR&                  S	9$ )a1  
input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
    Depending on which encoder is being used, the `input_values` are either: float values of the input raw
    speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
    filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
    the vocabulary, or hidden states.
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
N)r-   r/   rm  r  rn  ro  r   r   r   ry  r  )r-   r/   r?  r@  rm  r  r  rB  r  rn  ro  r  )rz  r  r  decoder_attentionsr  encoder_last_hidden_stater?  encoder_attentionsr^  )r|   r  rn  rB  r|  r  r  r   rS   r  r  r_  r%   r  r  r   rz  r  r   r{  r  )r{   r-   r/   r.  r/  rm  r0  r  r1  r  rB  r  r  rn  ro  r  r@  decoder_argsdecoder_outputss                      r*   r   SpeechT5Model.forward  s   T 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ""ll)-#"3%9' + O O_!M!M-"1!"4474H14Loa0RV14_1E1I?1-tO %*T\\Cb*c*c%)\\%8%8%[%["((+^&" &4"dll$CDD02DELL,, 
-1"1!"4#9'!5+/!5#)
 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r,   )r|   r  r  r  NNNNNNNNNNNNNNN)r   r   r   r   r   r   r   rc  ro   r  r  r%  r)  r[  r   r   r   r  r  r  rP   r   r   r   r   r   r   s   @r*   r  r    s    (,'+	 "))$ "))$	 ("59  04597;=A159=7;EIEI$(:>,0/3&*15!k
u||,k
 !!1!12k
 'u||4	k

 !))9)9 :k
 E--.k
 $E$5$56k
 'u||4k
 "%e.?.?(@"ABk
 "%e.?.?(@"ABk
 D>k
 %U%6%67k
 $D>k
 'tnk
 d^k
  !.!k
" 
uU&&');;	<#k
 k
r,   r  zB
    SpeechT5 Model with a speech encoder and a text decoder.
    c            $       "  ^  \ rS rSrS/rS\4U 4S jjrS rS rS r	S r
S	 r\               SS
\\R                     S\\R                      S\\R                      S\\R                      S\\R                     S\\R                     S\\R"                     S\\\\R                           S\\\\R                           S\\   S\\   S\\   S\\   S\\R                      S\\R"                     S\\\4   4 S jj5       rSrU =r$ )SpeechT5ForSpeechToTexti;  z#text_decoder_postnet.lm_head.weightr|   c                    > [         TU ]  U5        UR                  c  [        SU R                   S35      e[        U5      n[        U5      n[        XU5      U l        [        U5      U l
        U R                  5         g )NYou are trying to instantiate a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rn   ro   r  r'   r~   r  r  r  rI  r  text_decoder_postnetrk  )r{   r|   speech_encodertext_decoderr~   s       r*   ro    SpeechT5ForSpeechToText.__init__C  s}     $00@ A/ /  9@4V<%flK$>v$F! 	r,   c                 6    U R                   R                  5       $ r   rI  r%  rZ  s    r*   r%  #SpeechT5ForSpeechToText.get_encoderW      }}((**r,   c                 6    U R                   R                  5       $ r   rI  r)  rZ  s    r*   r)  #SpeechT5ForSpeechToText.get_decoderZ  rE  r,   c                 T    U R                  5       R                  R                  5         gr,  r%  r  r[  rZ  s    r*   r[  .SpeechT5ForSpeechToText.freeze_feature_encoder]      
 	!!88:r,   c                 6    U R                   R                  5       $ r   )r>  r  rZ  s    r*   r  -SpeechT5ForSpeechToText.get_output_embeddingsd  s    ((>>@@r,   c                 :    U R                   R                  U5        g r   )r>  r  r  s     r*   r  -SpeechT5ForSpeechToText.set_output_embeddingsg  s    !!77Gr,   r-   r/   decoder_input_idsr/  rm  r0  r  r1  r  rB  r  rn  ro  r  r  r8   c                    Ub  UOU R                   R                  nUb7  Uc4  [        XR                   R                  U R                   R                  5      nU R                  UUUUUUUUU	U
UUSUS9nU R                  US   5      nSnUbF  [        5       nU" UR                  SU R                   R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                   UR"                  S9	$ )a`  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Indices of decoder input sequence tokens in the vocabulary.

    Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)

    SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
    or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
    only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

    Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
>>> from datasets import load_dataset

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
>>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

>>> # audio file is decoded on the fly
>>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
>>> predicted_ids = model.generate(**inputs, max_length=100)

>>> # transcribe speech
>>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
>>> transcription[0]
'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
```

```python
>>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

>>> # compute loss
>>> loss = model(**inputs).loss
>>> round(loss.item(), 2)
19.68
```
NT)r-   r/   r.  r/  rm  r0  r  r1  r  rB  r  rn  ro  r  r   r#   r   )	r  r  r  r  r3  r  r4  r?  r5  )r|   r|  r+   r    r!   rI  r>  r   r   r  r   r  r  r3  r  r4  r?  r5  )r{   r-   r/   rQ  r/  rm  r0  r  r1  r  rB  r  rn  ro  r  r  r3  r  r  loss_fctoutputs                        r*   r   SpeechT5ForSpeechToText.forwardj  s[   v &1%<k$++B]B] ($6KK44dkk6X6X%! --%)!2#9/!5++/!5)   
" **71:6')HFKKDKK,B,BCV[[QS_UDY,F)-)9TGf$EvE#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   )rI  r>  r9  )r   r   r   r   _tied_weights_keysr   ro   r%  r)  r[  r  r  r   r   r   r  r  r   r  rP   r   r   r   r   r   r   s   @r*   r;  r;  ;  s    @@~ (++;AH  59598<=A159=7;EIEI$(,0/3&*-115!H
u001H
 !!1!12H
 $E$4$45	H

 !))9)9 :H
 E--.H
 $E$5$56H
 'u||4H
 "%e.?.?(@"ABH
 "%e.?.?(@"ABH
 D>H
 $D>H
 'tnH
 d^H
 ))*H
  !.!H
" 
uo%	&#H
 H
r,   r;  modelr  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
                    Uc  [        S5      eUc*  SXR                  R                  :H  R                  5       -
  n
OUn
UR	                  S5      nU R
                  R                  UU
SS9nUR                  n[        U R
                  R                  [        5      (       a@  U R
                  R                  R                  R                  US   R                  S   U
5      n
[        UR	                  S5      U-  U R                  R                  -  5      n[        UR	                  S5      U-  U R                  R                  -  5      nUR                  USU R                  R                  5      n/ n/ nS nSn0 n US-  nU R
                  R                   R                  UU5      nU R
                  R                   R#                  US S 2SS 24   S UU
USUSS9nU(       a.  UR%                  [&        R(                  " UR*                  SS95        UR                  R-                  S5      nUR.                  nU R0                  R3                  U5      nUR5                  XR                  R                  U R                  R                  5      nUR%                  U5        US S 2SS S 24   R5                  USU R                  R                  5      n[&        R(                  " UU4SS9n[&        R6                  " U R0                  R9                  U5      5      nUU:  a  GM  UU:  a@  [&        R:                  " USS9U:  n[&        R<                  " U5      S   R?                  5       nO[A        [C        U5      5      nU Vs/ sH  nUU;  d  M  UPM     nn[C        U5      S:  ad  [&        RD                  " U5      nURG                  SS5      RI                  SS	5      nU R0                  RK                  U5      nU H  n UU    UU '   M     [C        U5      U:  a  OGM  [A        [C        U5      5       Vs/ sH  nUU   PM
     nnU	(       d  US:X  a  US   O1[&        RL                  RN                  RP                  RS                  USS
9nUb	  U" U5      n!OUn!U(       a_  [&        R(                  " US	S9nUS:  a@  UR4                  " U[        UR	                  S5      U-  5      /UR	                  5       SS  Q76 nU!U4n!U!$ / n"[A        U5       H&  nU"R%                  UU   R	                  S5      5        M(     Uc7  [&        RL                  RN                  RP                  RS                  USS
9nUU"4n!Ox/ n#[&        RL                  RN                  RP                  RS                  USS
9nU" U5      n#U" Vs/ sH,  n[        U#R	                  S5      [U        U"5      -  5      U-  PM.     n$nU#U$4n!U(       a\  [&        R(                  " US	S9nUR4                  " U[        UR	                  S5      U-  5      /UR	                  5       SS  Q76 n/ U!QUP7n!U!$ s  snf s  snf s  snf )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r-   r/   ro  r#   )r   r/   r?  r@  r  rB  r  ro  r   r   )batch_first)+r'   r|   r    r<   r   rI  r  rz  r  r  r  r_  r%   r.   r$   r  r  r  rW   r   r   r  squeezer  speech_decoder_postnetr  r   sigmoidr  rL   r  rM   rN   rS   stackr   flattenr  r   r   rnnpad_sequencer=   )%rW  r-   r  r/   rX  rY  rZ  r[  r\  r]  r@  r   encoder_outr4  maxlenminlenoutput_sequencespectrogramr  r  r  result_spectrogramr  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesr1  spectrograms
meet_indexr3  spectrogram_lengths	waveformswaveform_lengthss%                                        r*   _generate_speechrz    s    !
 	
 !"lll6O6O&O%T%T%V!V!/


A
C..((!- ) K !, = = %..((*IJJ!&!7!7!>!>!a!aN  #%;"
 *//2[@5<<C`C``aF*//2[@5<<C`C``aF 099#q%,,B[B[\OKO
C
q !& 6 6 = =oOa bnn,,<</237";#9+5 = 	
 ###EIIk.J.JPQ$RS);;CCAF%55 //889LM==ll&C&CU\\E^E^_8$ #1b!8,11#q%,,:S:ST))_o$FAN}}U99BBCVWX< V|"'))Db"9Y"F${{?;A>EEG$SY/'3S|!q@R7RA|LS< 1$${{;7+55a;CCAqI$;;CCLQ".J5A*5M&z2 #/%&#-i j 49=O9P3QR3Qa&q)3QLR ),l1ouxx~~7I7I7V7VWcqu7V7vk*G!G"$yy)9qAQw#3#8#8-2215;<$?O?T?T?VWYWZ?[$   01G* N% !sA&&|A';';A'>? ? 88>>--::<UY:ZL#%89GI 88>>--::<UY:ZL-I_rs_rZ[INN1$5<O8P$P QTU U_rs "23G"$yy)9qA/44S)..q1C78 ;K;P;P;RSUSV;W  32!12GNW T S4  ts   7	Y Y (Y#2Y
zB
    SpeechT5 Model with a text encoder and a speech decoder.
    c            (         ^  \ rS rSrSrS\4U 4S jjr\S\4S j5       r	S r
S r\                 S#S\\R                     S	\\R                     S
\\R                      S\\R                     S\\R                      S\\R                      S\\R"                     S\\\\R                            S\\\\R                            S\\   S\\   S\\   S\\   S\\R                      S\\R                      S\\R"                     S\\R"                     S\\\4   4$S jj5       r\R,                  " 5               S$S\R                  S	\\R                     S\\R                      S\S\S\S\\R2                     S\S\S\\R                   \\R                   \R                   4   4   4S  jj5       r\R,                  " 5               S$S\R                  S\\R                      S	\\R                     S\S\S\S\\R2                     S\S\S\\R                   \\R                   \R                   4   4   4S! jj5       rS"rU =r$ )%SpeechT5ForTextToSpeechi	  r   r|   c                    > [         TU ]  U5        UR                  c  [        SU R                   S35      e[        U5      n[        U5      n[        XU5      U l        [        U5      U l
        U R                  5         g )Nr=  a    with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.)rn   ro   r  r'   r~   r  r  r  rI  r  rb  rk  )r{   r|   text_encoderspeech_decoderr~   s       r*   ro    SpeechT5ForTextToSpeech.__init__	  s}     $00@ A/ /  5V<8@%fNK&B6&J# 	r,   r8   c                     gr:  r^  )clss    r*   can_generate$SpeechT5ForTextToSpeech.can_generate	  s    
 r,   c                 6    U R                   R                  5       $ r   rC  rZ  s    r*   r%  #SpeechT5ForTextToSpeech.get_encoder	  rE  r,   c                 6    U R                   R                  5       $ r   rG  rZ  s    r*   r)  #SpeechT5ForTextToSpeech.get_decoder	  rE  r,   r/   r.  r/  rm  r0  r  r1  r  rB  r  rn  ro  r  r  r  r  c                 r   Ub  UOU R                   R                  nUbB  Uc"  [        XR                   R                  U5      u  p4U R                   R                  (       a  SnU R                  UUUUUUUUU	U
UUUSUS9nU R                  US   5      u  nnnSnUb,  [        U R                   5      nU" UUUUUUR                  5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )aH  
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
    Indices of input sequence tokens in the vocabulary.

    Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
    [`~PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
    Float values of input mel spectrogram.

    SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
    Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
    computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
    for details.
stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Binary tensor indicating the position of the stop token in the sequence.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
>>> import torch

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
>>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

>>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
>>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

>>> set_seed(555)  # make deterministic

>>> # generate speech
>>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
>>> speech.shape
torch.Size([15872])
```
NTr-   r/   r.  r/  rm  r0  r  r1  r  rB  r  r  rn  ro  r  r   r   	r  rl  r  r  r3  r  r4  r?  r5  )r|   r|  r4   r.   r  rI  rb  r  r  r   r  r  r3  r4  r?  r5  )r{   r   r/   r.  r/  rm  r0  r  r1  r  rB  r  rn  ro  r  r  r  r  r3  r  r  r  r  	criterionrT  s                            r*   r   SpeechT5ForTextToSpeech.forward	  s{   Z &1%<k$++B]B]#+?WKK88:P@<$ {{44$(!--")!5#9/!5++1/!5)   
$ AE@[@[\cde\f@g= 5v/<I&%((D +-;F)-)9TGf$EvE'-#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   rX  rY  rZ  r[  r\  r]  c
                     UbY  UR                  S5      nUR                  S5      U:w  a3  UR                  S5      S:X  a  UR                  US5      nO[        S5      e[        U UUUUUUUUU	5
      $ )a  
Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
speech waveform using a vocoder.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Attention mask from the tokenizer, required for batched inference to signal to the model where to
        ignore padded tokens from the input_ids.
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch_size.r   r  r'   rz  )r{   r   r/   r  rX  rY  rZ  r[  r\  r]  kwargsr\   s               r*   generate SpeechT5ForTextToSpeech.generate5
  s    J )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r,   c
                     UbY  UR                  S5      n
UR                  S5      U
:w  a3  UR                  S5      S:X  a  UR                  U
S5      nO[        S5      e[        U UUUUUUUUU	5
      $ )aW  
Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
speech waveform using a vocoder.

Args:
    input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
        Indices of input sequence tokens in the vocabulary.

        Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
        [`~PreTrainedTokenizer.__call__`] for details.

        [What are input IDs?](../glossary#input-ids)
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*, defaults to `None`):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
r   r   zUThe first dimension of speaker_embeddings must be either 1 or the same as batch size.r  )r{   r   r  r/   rX  rY  rZ  r[  r\  r]  r\   s              r*   generate_speech'SpeechT5ForTextToSpeech.generate_speech
  s    R )"*J!&&q)Z7%**1-2);)B)B:q)Q&$o   #!
 	
r,   rb  rI  NNNNNNNNNNNNNNNNNNNg      ?r2   g      4@NFF)r   r   r   r   ra  r   ro   classmethodrP   r  r%  r)  r   r   r   r  r  r   r  r   r   r   r   r   r   rc  r  r  r   r   r   s   @r*   r|  r|  	  s    "O~ ( T  ++  1559<@=A159=7;EIEI$(,0/3&*:>.2.215%D
E,,-D
 !!1!12D
 'u'8'89	D

 !))9)9 :D
 E--.D
 $E$5$56D
 'u||4D
 "%e.?.?(@"ABD
 "%e.?.?(@"ABD
 D>D
 $D>D
 'tnD
 d^D
 %U%6%67D
  **+!D
" ell+#D
$ !.%D
& 
u..	/'D
 D
L ]]_ 6::> !'+(-&+Y
##Y
 !!1!12Y
 %U%6%67	Y

 Y
 Y
 Y
 "))$Y
 "&Y
  $Y
 
u  %(9(95;L;L(L"MM	NY
 Y
v ]]_ ;?59 !'+(-&+]
##]
 %U%6%67]
 !!1!12	]

 ]
 ]
 ]
 "))$]
 "&]
  $]
 
u  %(9(95;L;L(L"MM	N]
 ]
r,   r|  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c            (       4  ^  \ rS rSrS\4U 4S jjrS rS rS r\	                 S"S\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\\\R                           S\
\\\R                           S\
\   S\
\   S\
\   S\
\   S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\\\4   4$S jj5       r\R(                  " 5               S#S\R                  S\
\R                     S\
\R                     S\S\S\S\
\R.                     S\S\S\R                  4S  jj5       rS!rU =r$ )$SpeechT5ForSpeechToSpeechi
  r|   c                    > [         TU ]  U5        [        U5      n[        U5      n[	        XU5      U l        [        U5      U l        U R                  5         g r   )	rn   ro   r  r  r  rI  r  rb  rk  )r{   r|   r?  r  r~   s       r*   ro   "SpeechT5ForSpeechToSpeech.__init__
  sK     8@8@%fnM&B6&J# 	r,   c                 6    U R                   R                  5       $ r   rC  rZ  s    r*   r%  %SpeechT5ForSpeechToSpeech.get_encoder  rE  r,   c                 6    U R                   R                  5       $ r   rG  rZ  s    r*   r)  %SpeechT5ForSpeechToSpeech.get_decoder  rE  r,   c                 T    U R                  5       R                  R                  5         gr,  rJ  rZ  s    r*   r[  0SpeechT5ForSpeechToSpeech.freeze_feature_encoder
  rL  r,   r-   r/   r.  r/  rm  r0  r  r1  r  rB  r  rn  ro  r  r  r  r  r8   c                    Ub  UOU R                   R                  nUb%  Uc"  [        XR                   R                  U5      u  p4U R	                  UUUUUUUUU	U
UUUSUS9nU R                  US   5      u  nnnSnU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                  S9	$ )a[  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into
    a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
    Float values of input mel spectrogram.

    SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
    `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
    `past_key_values`).
decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
    Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
    also be used by default.

    If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
    and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
    information on the default strategy.
cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
    Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

    - 1 indicates the head is **not masked**,
    - 0 indicates the head is **masked**.
speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
    Tensor containing the speaker embeddings.
labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
    Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
    [`SpeechT5Processor.__call__`] for details.
stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Binary tensor indicating the position of the stop token in the sequence.

Example:

```python
>>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
>>> from datasets import load_dataset
>>> import torch

>>> dataset = load_dataset(
...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation"
... )  # doctest: +IGNORE_RESULT
>>> dataset = dataset.sort("id")
>>> sampling_rate = dataset.features["audio"].sampling_rate

>>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
>>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
>>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

>>> # audio file is decoded on the fly
>>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

>>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

>>> set_seed(555)  # make deterministic

>>> # generate speech
>>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
>>> speech.shape
torch.Size([77824])
```
NTr  r   r   r  )r|   r|  r4   r.   rI  rb  r   r  r  r3  r  r4  r?  r5  )r{   r-   r/   r.  r/  rm  r0  r  r1  r  rB  r  rn  ro  r  r  r  r  r3  r]   rl  r  r  rT  s                           r*   r   !SpeechT5ForSpeechToSpeech.forward  s(   h &1%<k$++B]B]#+?WKK88:P@<$ --%)!5#9/!5++1/!5)   
$ "&!<!<WQZ!H;!^gabk1F)-)9TGf$EvE'##33")"?"?&99$55&-&G&G")"?"?&99

 
	
r,   rX  rY  rZ  r[  r\  r]  c
                 n    Uc  [         R                  " SUR                  S9n[        U UUUUUUUUU	5
      $ )ao  
Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
speech waveform using a vocoder.

Args:
    input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
        Float values of input raw speech waveform.

        Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `list[float]`,
        a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library (`pip install torchcodec`)
        or the soundfile library (`pip install soundfile`).
        To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding and
        conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
    speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
        Tensor containing the speaker embeddings.
    attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
        `[0, 1]`:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.

        [What are attention masks?](../glossary#attention-mask)
    threshold (`float`, *optional*, defaults to 0.5):
        The generated sequence ends when the predicted stop token probability exceeds this value.
    minlenratio (`float`, *optional*, defaults to 0.0):
        Used to calculate the minimum required length for the output sequence.
    maxlenratio (`float`, *optional*, defaults to 20.0):
        Used to calculate the maximum allowed length for the output sequence.
    vocoder (`nn.Module`, *optional*, defaults to `None`):
        The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
        spectrogram.
    output_cross_attentions (`bool`, *optional*, defaults to `False`):
        Whether or not to return the attentions tensors of the decoder's cross-attention layers.
    return_output_lengths (`bool`, *optional*, defaults to `False`):
        Whether or not to return the concrete spectrogram/waveform lengths.

Returns:
    `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
    - when `return_output_lengths` is False
        - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
        - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(num_frames,)` -- The predicted speech waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
    - when `return_output_lengths` is True
        - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
        are padded to the maximum length.
        - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `list[Int]` -- A list of
        all the concrete lengths for each spectrogram.
        - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
        `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
        - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `list[Int]` -- A list of all
        the concrete lengths for each waveform.
        - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
        `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
        output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
)r   i   rh  )r   rO   r   rz  )
r{   r-   r  r/   rX  rY  rZ  r[  r\  r]  s
             r*   r  )SpeechT5ForSpeechToSpeech.generate_speech  sM    T %!&Xl>Q>Q!R#!
 	
r,   r  r  r  )r   r   r   r   r   ro   r%  r)  r[  r   r   r   r  r  r   r  rP   r   r   r   r   r   r   rc  r  r   r   r   s   @r*   r  r  
  s   
~ 
++;  5959<@=A159=7;EIEI$(,0/3&*:>.2.215%
u001
 !!1!12
 'u'8'89	

 !))9)9 :
 E--.
 $E$5$56
 'u||4
 "%e.?.?(@"AB
 "%e.?.?(@"AB
 D>
 $D>
 'tn
 d^
 %U%6%67
  **+!
" ell+#
$ !.%
& 
u..	/'
 
B ]]_ ;?59 !'+(-&+W
''W
 %U%6%67W
 !!1!12	W

 W
 W
 W
 "))$W
 "&W
  $W
 
		W
 W
r,   r  c                   H   ^  \ rS rSrSU 4S jjrS	S jrS rS rS rSr	U =r
$ )
HifiGanResidualBlocki  c                   > [         TU ]  5         X@l        [        R                  " [        [        U5      5       Vs/ sH0  n[        R                  " UUUSX5   U R                  X#U   5      S9PM2     sn5      U l	        [        R                  " [        [        U5      5       Vs/ sH,  n[        R                  " UUUSSU R                  US5      S9PM.     sn5      U l
        g s  snf s  snf )Nr   )rl   dilationr   )rn   ro   leaky_relu_sloper   r-  rN   rS   rs   get_paddingconvs1convs2)r{   channelsrk   r  r  r1  r]   r~   s          r*   ro   HifiGanResidualBlock.__init__  s     0mm s8}-
 .A 		%[ ,,[1+F .

 mm s8}-
 .A 		 ,,[!< .



s   6C#$2C(c                     X-  U-
  S-  $ r   r^  )r{   rk   r  s      r*   r   HifiGanResidualBlock.get_padding  s    &1a77r,   c                 >   [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU R
                   H  nU" U5        M     U R                   H  nU" U5        M     g Nr   )r   r   r   r   r   r  r  r{   r   r*  s      r*   apply_weight_norm&HifiGanResidualBlock.apply_weight_norm  si    hh**288,,m<<((33??K[[E ![[E !r,   c                     U R                    H"  n[        R                  R                  U5        M$     U R                   H"  n[        R                  R                  U5        M$     g r   )r  r   r   remove_weight_normr  r{   r*  s     r*   r  'HifiGanResidualBlock.remove_weight_norm  sB    [[EHH''. ![[EHH''. !r,   c                 (   [        U R                  U R                  5       Hm  u  p#Un[        R                  R                  XR                  5      nU" U5      n[        R                  R                  XR                  5      nU" U5      nX-   nMo     U$ r   )ru  r  r  r   r  
leaky_relur  )r{   r   conv1conv2r2  s        r*   r   HifiGanResidualBlock.forward!  sz    T[[9LE$HMM44]DYDYZM!-0MMM44]DYDYZM!-0M)4M : r,   )r  r  r  )r	   )r   r	      g?)r   )r   r   r   r   ro   r  r  r  r   r   r   r   s   @r*   r  r    s!    
>8/ r,   r  z
    HiFi-GAN vocoder.
    c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
S rS r\" S	S
9S\R                  S\R                  4S j5       rSrU =r$ )SpeechT5HifiGani,  r|   rl  c                   > [         TU ]  U5        [        UR                  5      U l        [        UR
                  5      U l        [        R                  " UR                  UR                  SSSS9U l        [        R                  " 5       U l        [        [        UR
                  UR                   5      5       Ha  u  nu  p4U R                  R#                  [        R$                  " UR                  SU-  -  UR                  SUS-   -  -  UUXC-
  S-  S95        Mc     [        R                  " 5       U l        [)        [        U R                  5      5       Hp  nUR                  SUS-   -  -  n[        UR                  UR*                  5       H4  u  pFU R&                  R#                  [-        XTXaR.                  5      5        M6     Mr     [        R                  " WSSSSS9U l        U R3                  S[4        R6                  " UR                  5      5        U R3                  S[4        R8                  " UR                  5      5        U R;                  5         g )N   r   r	   )rk   rl   r   r   rM  r  )rn   ro   rS   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   rs   model_in_dimupsample_initial_channelconv_prer-  	upsamplerr}  ru  upsample_kernel_sizesrW   ConvTranspose1d	resblocksrN   resblock_dilation_sizesr  r  	conv_postr   r   rO   rU   rk  )r{   r|   r1  upsample_raterk   r  r  r~   s          r*   ro   SpeechT5HifiGan.__init__5  s    v;;< !6!67		++
 /8V=R=RTZTpTp9q/r+A+NN!!""331=33a!eE +((8Q> 0s s4>>*+A661Q<HH),V-I-I6KiKi)j%%%&:8RZ\s\s&tu *k ,
 8QAaQRSVU[[1D1D%EFWejj1D1D&EF 	r,   rJ  c                 8   [        U[        R                  [        R                  45      (       ak  UR                  R
                  R                  SU R                  R                  S9  UR                  b%  UR                  R
                  R                  5         ggg)zInitialize the weights.r2   rL  N)r  r   rs   r  r   rV  rS  r|   rQ  rm   rY  )r{   rJ  s     r*   r\  SpeechT5HifiGan._init_weights[  so    fryy"*<*<=>>MM&&CT[[5R5R&S{{&  &&( ' ?r,   c                    [         R                  R                  n[        [         R                  R                  S5      (       a$  [         R                  R                  R                  nU" U R
                  5        U R                   H  nU" U5        M     U R                   H  nUR                  5         M     U" U R                  5        g r  )
r   r   r   r   r   r  r  r  r  r  r  s      r*   r  !SpeechT5HifiGan.apply_weight_normb  s    hh**288,,m<<((33??KDMM"^^E $^^E##% $DNN#r,   c                 R   [         R                  R                  U R                  5        U R                   H"  n[         R                  R                  U5        M$     U R
                   H  nUR                  5         M     [         R                  R                  U R                  5        g r   )r   r   r  r  r  r  r  r  s     r*   r  "SpeechT5HifiGan.remove_weight_normn  sh    
##DMM2^^EHH''. $^^E$$& $
##DNN3r,   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        r  r8   c                    U R                   R                  (       a  XR                  -
  U R                  -  nUR	                  5       S:H  nU(       d  UR                  S5      nUR                  SS5      nU R                  U5      n[        U R                  5       H  n[        R                  R                  X0R                   R                  5      nU R                  U   " U5      nU R                  X@R                   -     " U5      n[        SU R                   5       H)  nXPR                  X@R                   -  U-      " U5      -  nM+     XPR                   -  nM     [        R                  R                  U5      nU R#                  U5      n[$        R&                  " U5      nU(       d2  UR)                  S5      R                  SS5      R+                  S5      nU$ UR)                  S5      nU$ )a  
spectrogram (`torch.FloatTensor`):
    Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
    config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

Returns:
    `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
    shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
r	   r   r   r   r#   )r|   normalize_beforerM  r  r   r   r   r  rN   r  r   r  r  r  r  r  r  r  r   tanhra  r   )r{   rl  
is_batchedr   r1  	res_statejwaveforms           r*   r   SpeechT5HifiGan.forwardv  s   " ;;''&2djj@K __&!+
%//2K#--a3m4t))*AMM44]KKD`D`aM NN1-m<Mq+;+;';<]KI1d../^^A0@0@,@1,DEmTT	 0%(8(88M + 00?}5

=1$,,Q/99!Q?DDRHH
  %,,Q/Hr,   )r  r  r  r  r  r  )r   r   r   r   r   r_  ra  ro   r   rc  r\  r  r  r   r   r  r   r   r   r   s   @r*   r  r  ,  sp     "!#O$4 $L)BII )
$4 (5#4#4 (9J9J ((r,   r  )r;  r  r|  r  rH  r  )r   Nr#  r  )er   r   typingr   r   numpyrG   r   torch.utils.checkpointr   torch.nnr   r   r   activationsr
   cache_utilsr   r   
generationr   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   r   configuration_speecht5r   r   
get_loggerr   r  _HIDDEN_STATES_START_POSITIONr   r<   r+   r4   r  r   r  ndarrayre   rg   r   r   rc  r   r   r  r  r   r'  r?  rM  r  r  r  r  r  r  r  r  r(  r6  rH  re  r  r  r  r  r  r  r  r  r  r  r;  r  rP   rz  r|  r  r  r  __all__r^  r,   r*   <module>r     s     "     @ @ ! 5 ) @ 7 e 9  D , I 
		H	% !" %,, c [^ " ei0,,0250KSTYT`T`Ka04 26tc?tt t U--.	t
 t ZZtp#= ,!; 8!; 2A8BII A8J*bii *Zryy 0" "(299 %RYY %R1		 1D")) DN1")) 1h% %P<299 <2		+? ")-		+? )-X&,@ &$a2		 a2H")) 0:5 :zb5 bJ (7o (7 (7V|
- |
~"&= "J'$; 'T
#: 
@C
- C
L/&= /d3$; 3l*#: *Z8M299 8Mv:bii :z 
\
+ \

\
~ 
s
5 s

s
r 7;15#'$)"'L"L##L !!2!23L U--.	L
 L L L bii L "L  L 5eE$5$5u7H7H$HIIJL^ 
e
5 e

e
P 
t
 7 t

t
n;299 ;| 
to t
tnr,   