
    <hO                         S r SSKrSSKJr  SSKJrJr  SSKJrJ	r	  SSK
Jr  SSKJrJrJrJr  SS	KJrJr  \" 5       (       a  SSKr\" 5       (       a  SSKr " S
 S\SS9r " S S\SS9r " S S\5      rS/rg)zProcessor class for Dia    N)Path)OptionalUnion   )
AudioInputmake_list_of_audio)BatchFeature)AudioKwargsProcessingKwargsProcessorMixinUnpack)is_soundfile_availableis_torch_availablec                   N    \ rS rSr% \\S'   \\S'   \\S'   \\   \S'   \\S'   Srg)	DiaAudioKwargs"   bos_token_ideos_token_idpad_token_iddelay_pattern
generation N)	__name__
__module____qualname____firstlineno__int__annotations__listbool__static_attributes__r       ^/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/dia/processing_dia.pyr   r   "   s$    9r"   r   F)totalc                   J    \ rS rSr% \\S'   SSSS.SSS	/ S
QSSS.SS0S.rSrg)DiaProcessorKwargs*   audio_kwargsTrightF)paddingpadding_sideadd_special_tokensi   i  i  )	r      	   
                  iD  )r   r   r   r   r   sampling_ratereturn_tensorspt)text_kwargsr(   common_kwargsr   N)r   r   r   r   r   r   	_defaultsr!   r   r"   r#   r&   r&   *   sB       #"'
 !  >"
 +D1Ir"   r&   c                     ^  \ rS rSrSrSrSrSrU 4S jr\	S 5       r
  S#S\\\\   4   S	\\   S
\\   S\\   4S jjr S$SSS\\   S\\   S\S   4S jjr S$SSS\\   S\\   SS4S jjrSSS\\   S\4S jrS	\S\\\\\\\4      4   S\\   4S jr\ S%S\S\S\S\\   S\S\S   4S jj5       r\S	SS\S\S \S   SS4
S! j5       rS"rU =r$ )&DiaProcessor>   a  
Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into
a single processor. It inherits, the audio feature extraction, tokenizer, and audio encode/decode functio-
nalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for more
information.

Args:
    feature_extractor (`DiaFeatureExtractor`):
        An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input.
    tokenizer (`DiaTokenizer`):
        An instance of [`DiaTokenizer`]. The tokenizer is a required input.
    audio_tokenizer (`DacModel`):
        An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input.
DiaFeatureExtractorDiaTokenizerDacModelc                 "   > [         TU ]  XUS9  g )N)audio_tokenizer)super__init__)selffeature_extractor	tokenizerrB   	__class__s       r#   rD   DiaProcessor.__init__R   s    *Wr"   c                 v    U R                   R                  nSS/n[        [        R	                  X-   5      5      $ )z
We no longer pass the raw audio values but the codebooks encoded by the `audio_tokenizer`.
Conventions may differ between audio models due to architectural choices.
decoder_input_idsdecoder_attention_mask)rG   model_input_namesr   dictfromkeys)rE   tokenizer_input_namesaudio_tokenizer_input_namess      r#   rM   DiaProcessor.model_input_namesU   s8     !% @ @':<T&U#DMM"7"UVWWr"   textaudiooutput_labelskwargsc           
         [        5       (       d  [        S5      eUc  [        S5      eU R                  " [        40 UD6nUS   nUS   nUS   nUR	                  SS5      n	U	S:w  a"  [        U R
                  R                   S	35      e0 n
[        U[        5      (       a  U/nO=[        U[        [        45      (       a  [        S
 U 5       5      (       d  [        S5      eU R                  " U40 UD6nU
R                  U5        UR	                  SS5      nUR	                  SS5      nUR	                  SS5      nUR	                  SS5      nUR	                  SS5      nUb	  Ub  Ub  Uc  [        S5      eU(       a  U(       a  [        SU SU S35      eU
S   R                  S   n[        U5      n[!        U5      nUGbQ  [#        U5      nU R$                  " U40 UD6n[&        R(                  " U R*                  R,                  R.                  5      nUS   S   R                  S   U-  n/ n/ n[1        US   US   5       GH  u  nnU R$                  R2                  n[&        R4                  " UR7                  SS9U-  5      U-  nUU-  nUU-
  n[8        R:                  " 5          USSSU24   R=                  U R*                  R>                  5      nU R*                  RA                  U5      RB                  RE                  SS5      nSSS5        U(       d*  [8        RF                  RH                  RK                  WSS US!9n[8        RF                  RH                  RK                  WSSUS-   SSS4S US!9nUS-   U-   nUU(       a  SOS-  n[8        RL                  " S/U-  S/U-  -   [8        RN                  S"9SSS24   n URQ                  U5        URQ                  U 5        GM     [8        RR                  " USS9n[8        RR                  " USS9nObU(       aP  [8        RT                  " USU4U[8        RN                  S"9n[8        RV                  " USU-   4[8        RN                  S#9nO[        S$5      eUUR                  S   :w  a  [        S%U S&UR                  S    S'35      eUR                  S   n!U!U-
  n"U RY                  UU!UUS(S)9n#[8        RT                  " UU!U4U[8        RZ                  S*9n$UU$SS2SU"24'   U R]                  U$UUU#S+9n%U
R                  U%US,.5        U(       a  U
S-   R_                  5       SS2SS24   n&S.U&U&U:H  '   S.U&U&U:H  '   U&RE                  SS5      Ra                  UU-  S5      Rc                  5       RO                  5       U
S/'   U
S-   SS2SS24   U
S-'   U
S0   SS2SS24   U
S0'   [e        XS19$ ! , (       d  f       GN= f)2aa  
Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is
forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the
DacModel's [`~DacModel.encode`]. The `text` argument to [`~DiaTokenizer.__call__`]. Please refer
to the docstring of the above methods for more information.
zThe `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't find it in your environment. You can install torch via `pip install torch`.Nz0You need to specify the `text` input to process.r8   r(   r9   r6   r7   z% only supports `return_tensors='pt'`.c              3   @   #    U H  n[        U[        5      v   M     g 7fN)
isinstancestr).0ts     r#   	<genexpr>(DiaProcessor.__call__.<locals>.<genexpr>   s     9[VZQR*Q:L:LVZs   zAInvalid input text. Please provide a string, or a list of stringsr   r   r   r   r   TzTo enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.z9Labels with `generation` is incompatible, got generation=z, output_labels=.	input_idsr   padding_maskinput_valuesdim.      )r   r   r   rg   r   r   constant)padmodevaluedtype)sizern   z;If you try to train, you should provide audio data as well.zNNeed the same amount of samples for both text and audio, but got text samples=z and audio samples = z	 instead.Fbszseq_lennum_channelsr   revert)
fill_valuern   rT   r   r   precomputed_idx)rK   rL   rK   ilabelsrL   )datatensor_type)3r   
ValueError_merge_kwargsr&   poprH   r   rZ   r[   r   tupleallrG   updateshapelenmaxr   rF   mathprodrB   configdownsampling_ratioszip
hop_lengthceilsumtorchno_gradtodeviceencodeaudio_codes	transposenn
functionalrj   tensorlongappendcatfullonesbuild_indicesr   apply_audio_delayclonereshape
contiguousr	   )'rE   rS   rT   rU   rV   output_kwargsr8   r(   r9   r6   ry   	encodingsr   audio_bos_token_idaudio_eos_token_idaudio_pad_token_idr   
batch_sizers   	max_delayinput_audioscompression_ratemax_encoded_sequence_lenrK   rL   rb   base_pad_lencurrent_audio_lenencoded_sequence_lenpadding_lenra   num_valid_inputsattention_maskmax_seq_lenmax_audio_lenrw   prefilldelayed_decoder_input_idsrx   s'                                          r#   __call__DiaProcessor.__call___   sa    "##^ 
 <OPP**


 $M2$^4%o6&**+;TBT! 7 788]^__ dC  6DTD%=11c9[VZ9[6[6[`aaNN47;7	I %(($?)--ndC)--ndC)--ndC!%%lD9
&!)!)$k 
 -KJ<Wghugvvwx  +&,,Q/
=)&	 &u-E11%H<HL#yy)=)=)D)D)X)XY'3N'CA'F'L'LR'PTd'd$ "%'" (+<+GVdIe'f#e#55@@$(IIl.>.>2.>.F.U$VYe$e!'8<L'L$69MM ]]_!$-?.?-?"?@CCDDXDXD_D_`E $ 4 4 ; ;E B N N X XYZ\] ^I % " % 3 3 7 7!'9
Rd !8 !I "HH//33Aq+/1a#C*\n 4 	 $8!#;i#G  A: !&qcK.?1#HXBX.X`e`j`j!klprsls!t!((3&--n=9 (g< !&		*; C%*YY/E1%M" %

J<+HJ\didndn o &+ZZj!i-5PX]XbXb%c"Z[[*0033`ak`l m##4#:#:1#=">iI  -2226#i/,,%' - 
 **l3)))

 &7>M>!"$($:$:+++	 %; %
! 	*C_uvw-.446q!"u=F37F6//037F6//0#--a3;;J<UWYZeegllnDN(,-@(A!SbS&(ID$%-12J-KAsPRsF-SD)*BB] %_s   'A#W
W	rK   torch.Tensoraudio_prompt_lenreturnc                 J   U R                   " [        40 UD6nUS   nUR                  SS5      nUR                  SS5      nUR                  SS5      nUb  Ub  Uc  [        S5      eUbO  [        R
                  " X!R                  [        R                  S9nUS   R                  UR                  S   5      n	OUSS2SS2S4   U:H  R                  S	S
9n	UR                  S   USS2SS2S4   U:H  R                  S	S
9-
  S-
  n
UR                  u  pnU R                  UUUUSS9nU R                  US	S	US9R                  SS5      n/ n[        R                  " 5          [        U	R                  S   5       H  nUUSS2U	U   U
U   24   S   nUR!                  U R"                  R                  5      nU R"                  R%                  US9R&                  R)                  5       R+                  5       nUR-                  U5        M     SSS5        U$ ! , (       d  f       U$ = f)aP  
Decodes a batch of audio codebook sequences into their respective audio waveforms via the
`audio_tokenizer`. See [`~DacModel.decode`] for more information.

Args:
    decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder.
    audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning).
r(   r   Nr   r   zTo enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those.)r   rn   r   rc   re   rg   Trp   rv   rh   )N.)r   )r|   r&   r}   r{   r   r   r   r   expandr   r   r   r   r   r   ranger   rB   decodeaudio_valuescpusqueezer   )rE   rK   r   rV   r   r(   r   r   r   start_of_generation_idxend_of_generation_idxrq   rr   rs   rw   output_sequencesaudiosioutput_iaudio_is                       r#   batch_decodeDiaProcessor.batch_decode  si    **

 %^4$(($?)--ndC)--ndC%);)C}G\[  '$||,<E]E]ejeoeop&6t&<&C&CDUD[D[\]D^&_#'8Aq'AEW'W&\&\ac&\&d# ##A&*;Aq!G*DHZ*Z)_)_df)_)ggjkk 	
 &7%<%<"l,,%' - 
  11# + 2 
 )Aq/ 	 ]]_288;<+Aq2I!2LOdefOg2g,ghirs#;;t';';'B'BC..55(5KXX\\^ffhg&	 =   _ s   B*H
H"c                     UR                   S   S:w  a  [        SUR                   S    S35      eU R                  " X40 UD6S   $ )z
Decodes a single sequence of audio codebooks into the respective audio waveform via the
`audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
r   rg   z5Expecting a single output to be decoded but received z samples instead.)r   r{   r   )rE   rK   r   rV   s       r#   r   DiaProcessor.decodeS  s^     ""1%*GHYH_H_`aHbGcctu    !2OOPQRRr"   rL   c                     U R                   " [        40 UD6nUS   nUR                  SS5      nUc  [        S5      eUR                  S   [        U5      -
  $ )z0Utility function to get the audio prompt length.r(   r   NzTo enable the utility of retrieving the prompt length for Dia, we need the `delay_pattern`. You may have accidentally overwritten this.rg   )r|   r&   r}   r{   r   r   )rE   rL   rV   r   r(   r   s         r#   get_audio_prompt_len!DiaProcessor.get_audio_prompt_lend  ss     **

 %^4$(($? O  &++A.]1CCCr"   saving_pathc                 z   [        5       (       d  [        S5      e[        U5      n[        U[        [
        45      (       a  U/nO=[        U[        [        45      (       a  [        S U 5       5      (       d  [        S5      e[        U5      [        U5      :w  a  [        S5      eU R                  " [        40 UD6nUS   nUS   n[        X5       Hg  u  px[        U[        R                  5      (       a,  UR!                  5       R#                  5       R%                  5       n[&        R(                  " XU5        Mi     g )Nz/Please install `soundfile` to save audio files.c              3   L   #    U H  n[        U[        [        45      v   M     g 7frY   )rZ   r[   r   )r\   ps     r#   r^   *DiaProcessor.save_audio.<locals>.<genexpr>  s#     @qep`aAPSUY{A[A[eps   "$zAInvalid input path. Please provide a string, or a list of stringsz5The number of audio and saving paths must be the samer(   r5   )r   ImportErrorr   rZ   r[   r   r   r~   r   r{   r   r|   r&   r   r   Tensorr   floatnumpysfwrite)	rE   rT   r   rV   r   r(   r5   audio_valuer   s	            r#   
save_audioDiaProcessor.save_audioy  s	    &''OPP #5) kC;//&-K[4-88S@qep@q=q=q`aau:[))TUU**

 %^4$_5!%5NK+u||44)oo/557==?HHQ]3 6r"   rq   rr   rs   r   rt   )r   r   c                    [         R                  " U[         R                  S9n[         R                  " U[         R                  S9SSS24   R	                  X5      S   nU(       d  XeSSSS24   -
  nOXeSSSS24   -   n[         R
                  " USUS-
  5      n[         R                  " U [         R                  S9SS2SS4   R	                  XU5      n[         R                  " U[         R                  S9SSSS24   R	                  XU5      n	[         R                  " UR                  S5      UR                  S5      U	R                  S5      /SS9R                  5       n
Xj4$ )z
Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.
Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
rm   N).Nr   rg   rc   re   )	r   r   int32aranger   clampstackr   r   )rq   rr   rs   r   rt   delay_arraysequence_idxvalid_sequence_idx	batch_idxchannel_idxall_idxs              r#   r   DiaProcessor.build_indices  s:    ll=D ||G5;;?aHOOPS]^gh'dD!m*DDL'dD!m*DDL"[[q'A+FLLEKK8D$GNNs]ij	ll<u{{CD$PQMRYYZ]htu++r"$6$>$>r$BKDWDWXZD[\
 $& 	
 $$r"   r   r   rw   c           	      d   U R                   nUu  pVUR                  U5      nUR                  U5      n[        R                  " USS9u  pxn	XX4   R	                  U R                  5       5      n
US:  nXPR                  S   :  n[        R                  " X[        R                  " XU
5      5      nU$ )a  
Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.

Args:
    audio: audio tokens of shape [bsz, seq_len, num_channels]
    pad_token_id: the PAD token
    bos_token_id: the BOS token
    precomputed_idx: from `build_indices`

Returns:
    final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
rc   re   r   rg   )r   r   r   unbindviewro   r   where)rT   r   r   rw   r   r   r   r   r   r   gathered_audiomask_bosmask_padfinal_audios                 r#   r   DiaProcessor.apply_audio_delay  s    *  /#v.**V$ 6;\\'r5R2	{*<IJOOPUPZPZP\]  !#;;q>1kk(%++h^l:mnr"   r   )NFrY   )F) r   r   r   r   __doc__feature_extractor_classtokenizer_classaudio_tokenizer_classrD   propertyrM   r   r[   r   r   r   r    r   r&   r   r   r   r   r   r   r   staticmethodr~   r   r   r!   __classcell__)rH   s   @r#   r<   r<   >   s&    4$O&X X X '+(-	kCCcN#kC 
#kC  ~	kC
 +,kC` +/E)E #3-E +,	E
 
n	ET +/S)S #3-S +,	S
 
S"D .D +,D 
	D* 4 4 3d5d+;&<<= 4 +,	 4D   % % %  % Cy	 %
  % 
-	. %  %D """ " =>	"
 
" "r"   r<   )r   r   pathlibr   typingr   r   audio_utilsr   r   feature_extraction_utilsr	   processing_utilsr
   r   r   r   utilsr   r   r   	soundfiler   r   r&   r<   __all__r   r"   r#   <module>r      sr       " 9 4 U U ? [ ) (c> cL 
r"   