
"""PyTorch Dia model."""

from typing import Callable, Optional, Union

import torch
from torch import nn

from ...cache_utils import DynamicCache, EncoderDecoderCache
from ...masking_utils import create_causal_mask
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
from ..llama.modeling_llama import LlamaAttention, LlamaRMSNorm, LlamaRotaryEmbedding, eager_attention_forward
from ..phi3.modeling_phi3 import Phi3MLP
from .configuration_dia import DiaConfig, DiaDecoderConfig, DiaEncoderConfig
from .generation_dia import DiaGenerationMixin


if is_torch_flex_attn_available():
    from ...integrations.flex_attention import make_flex_block_causal_mask


logger = logging.get_logger(__name__)


@auto_docstring
class DiaPreTrainedModel(PreTrainedModel):
    config: DiaConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_flex_attn = True
    _can_compile_fullgraph = True
    main_input_name = "input_ids"
    _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]


class DiaMultiChannelEmbedding(nn.Module):
    """In order to efficiently compute the audio embedding from the 9 different channels,
    we vectorize the embedding process by using a single embedding layer and an offset.
    Example:
    - num_embeds = 4
    - vocab_size = 8
    - num_channels = 3
    We would have offsets = [0, 8, 16]
    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
    then tokens = audio_codes + offsets
                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
    This allows us to use a single embedding layer for all channels.
    """

    def __init__(self, config: DiaDecoderConfig):
        super().__init__()
        self.embed = nn.Embedding(config.vocab_size * config.num_channels, config.hidden_size)
        self.hidden_size = config.hidden_size
        self.num_channels = config.num_channels
        offsets = torch.arange(config.num_channels, dtype=torch.long) * config.vocab_size
        self.register_buffer("offsets", offsets, persistent=False)

    def forward(self, audio_codes: torch.Tensor) -> torch.Tensor:
        tokens = (audio_codes + self.offsets.to(audio_codes.device)).squeeze(1)
        embeds = self.embed(tokens).view(tokens.shape[0], tokens.shape[1], -1, self.hidden_size)
        return embeds.sum(dim=2)
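
# A minimal, self-contained sketch of the offset trick above, using the toy values
# from the docstring (illustration only, not part of the model):
#
#     import torch
#
#     vocab_size, num_channels, hidden_size = 8, 3, 16
#     embed = torch.nn.Embedding(vocab_size * num_channels, hidden_size)
#     offsets = torch.arange(num_channels) * vocab_size  # tensor([0, 8, 16])
#     audio_codes = torch.tensor([[[0, 1, 5]]])          # (batch=1, seq=1, channels=3)
#     tokens = audio_codes + offsets                     # tensor([[[0, 9, 21]]])
#     # Each channel indexes a disjoint slice of the shared table; summing the
#     # per-channel lookups gives one combined embedding per timestep, as in
#     # `DiaMultiChannelEmbedding.forward` above.
#     combined = embed(tokens).sum(dim=-2)               # (1, 1, hidden_size)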
class DiaMLP(Phi3MLP):
    pass


class DiaRMSNorm(LlamaRMSNorm):
    pass


class DiaRotaryEmbedding(LlamaRotaryEmbedding):
    pass


class DiaSelfAttention(LlamaAttention, nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
        nn.Module.__init__(self)
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = self.config.num_attention_heads
        self.num_key_value_heads = self.config.num_key_value_heads or self.num_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads)
        self.scaling = 1
        self.attention_dropout = 0.0
        self.is_causal = is_causal

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)


class DiaCrossAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.cross_hidden_size = config.cross_hidden_size
        self.num_heads = self.config.cross_num_attention_heads
        self.num_key_value_heads = self.config.cross_num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.head_dim = config.cross_head_dim
        self.scaling = 1
        self.attention_dropout = 0.0
        self.is_causal = False

        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
        self.k_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.v_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        cross_attention_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)
        cross_shape = (*cross_attention_states.shape[:-1], -1, self.head_dim)

        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        is_updated = past_key_values.is_updated.get(self.layer_idx) if past_key_values is not None else False
        if past_key_values is not None and is_updated:
            # reuse k, v from the cross-attention cache
            key_states = past_key_values.cross_attention_cache.layers[self.layer_idx].keys
            value_states = past_key_values.cross_attention_cache.layers[self.layer_idx].values
        else:
            key_states = self.k_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
            value_states = self.v_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
            if past_key_values is not None:
                key_states, value_states = past_key_values.cross_attention_cache.update(
                    key_states, value_states, self.layer_idx
                )
                past_key_values.is_updated[self.layer_idx] = True

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            scaling=self.scaling,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights
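
# Note on the caching branch in `DiaCrossAttention.forward`: the key/value states
# depend only on the (fixed) encoder output, so with an `EncoderDecoderCache` they
# are projected once on the first decoding step and replayed afterwards. A hedged
# sketch of that control flow (tensor contents are hypothetical):
#
#     cache = EncoderDecoderCache(DynamicCache(), DynamicCache())
#     # step 0: cache.is_updated.get(layer_idx) is falsy
#     #   -> k/v are projected from `cross_attention_states` and stored via
#     #      cache.cross_attention_cache.update(k, v, layer_idx)
#     # steps 1..n: cache.is_updated[layer_idx] is True
#     #   -> k/v are read back from cache.cross_attention_cache.layers[layer_idx]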
class DiaEncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: DiaEncoderConfig, layer_idx: int):
        super().__init__()
        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=False)
        self.post_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.mlp = DiaMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
        residual = hidden_states
        normed_states = self.pre_sa_norm(hidden_states)
        self_attn_output, self_attn_weights = self.self_attention(
            normed_states,
            position_embeddings=position_embeddings,
            attention_mask=attention_mask,
            **kwargs,
        )
        hidden_states = residual + self_attn_output

        residual = hidden_states
        normed_states = self.post_sa_norm(hidden_states)
        mlp_out = self.mlp(normed_states)
        hidden_states = residual + mlp_out

        return hidden_states, self_attn_weights
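
# The encoder layer above is a standard pre-norm residual block; schematically:
#
#     x = x + self_attention(pre_sa_norm(x))  # bidirectional (is_causal=False)
#     x = x + mlp(post_sa_norm(x))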
class DiaEncoder(DiaPreTrainedModel):
    def __init__(self, config: DiaEncoderConfig):
        super().__init__(config)
        self.config = config
        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
        self.layers = nn.ModuleList(
            [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.rotary_embeddings = DiaRotaryEmbedding(config)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Union[tuple, BaseModelOutput]:
        hidden_states = self.embedding(input_ids)

        position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :]
        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)

        attention_mask = self._update_full_mask(attention_mask, hidden_states)

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)

            layer_outputs = encoder_layer(
                hidden_states,
                position_embeddings=position_embeddings,
                attention_mask=attention_mask,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)
        if output_hidden_states:
            encoder_states += (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )

    def _update_full_mask(
        self,
        attention_mask: Union[torch.Tensor, None],
        inputs_embeds: torch.Tensor,
    ):
        if attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                attention_mask = attention_mask if 0 in attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(attention_mask, torch.Tensor):
                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
            else:
                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)

        return attention_mask
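
# `_update_full_mask` adapts the 2D padding mask to the attention backend. A hedged
# shape sketch (values hypothetical):
#
#     attention_mask = torch.tensor([[1, 1, 1, 0, 0]])  # (batch=1, src_len=5)
#     # sdpa/eager: expanded to (1, 1, 5, 5) with the dtype minimum at the masked
#     #             key columns (via the _prepare_4d_attention_mask* helpers)
#     # flash_attention_2: kept 2D, or dropped entirely when no 0 is present
#     # flex_attention: wrapped by make_flex_block_causal_mask(..., is_causal=False)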
class DiaDecoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=True)
        self.cross_attention = DiaCrossAttention(config, layer_idx)
        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.pre_ca_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.pre_mlp_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
        self.mlp = DiaMLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
        self_attn_cache = past_key_values
        if isinstance(self_attn_cache, EncoderDecoderCache):
            self_attn_cache = self_attn_cache.self_attention_cache

        residual = hidden_states
        normed_states = self.pre_sa_norm(hidden_states)
        self_attn_output, self_attn_weights = self.self_attention(
            normed_states,
            position_embeddings,
            attention_mask,
            past_key_values=self_attn_cache,
            cache_position=cache_position,
            **kwargs,
        )
        hidden_states = residual + self_attn_output

        residual = hidden_states
        normed_states = self.pre_ca_norm(hidden_states)
        cross_states, cross_attn_weights = self.cross_attention(
            normed_states,
            encoder_hidden_states,
            attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            **kwargs,
        )
        hidden_states = residual + cross_states

        residual = hidden_states
        normed_states = self.pre_mlp_norm(hidden_states)
        mlp_out = self.mlp(normed_states)
        hidden_states = residual + mlp_out

        return hidden_states, self_attn_weights, cross_attn_weights
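
# Why the cache is unpacked at the top of `DiaDecoderLayer.forward`: an
# `EncoderDecoderCache` bundles two caches, and the two attention blocks use
# different halves. Hedged sketch:
#
#     cache = EncoderDecoderCache(DynamicCache(), DynamicCache())
#     cache.self_attention_cache   # grows by one step per generated frame
#     cache.cross_attention_cache  # filled once from the encoder states
#
# `DiaSelfAttention` (via its Llama-style forward) expects the plain self-attention
# cache, while `DiaCrossAttention` receives the full wrapper.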
class DiaDecoder(DiaPreTrainedModel):
    """Transformer Decoder Stack using DenseGeneral."""

    def __init__(self, config: DiaDecoderConfig):
        super().__init__(config)
        self.num_channels = config.num_channels
        self.vocab_size = config.vocab_size
        self.embeddings = DiaMultiChannelEmbedding(config)
        self.rotary_embeddings = DiaRotaryEmbedding(config)
        self.layers = nn.ModuleList(
            [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        )
        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.Tensor,
        position_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

            [What are input IDs?](../glossary#input-ids)
        """
        batch_size, seq_length = input_ids.size()[:-1]
        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
        if cache_position is None:
            cache_position = torch.arange(
                past_key_values_length, past_key_values_length + seq_length, device=input_ids.device
            )
        if position_ids is None:
            position_ids = cache_position[None, :]

        hidden_states = self.embeddings(input_ids)
        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)

        if attention_mask is None and not is_torchdynamo_compiling():
            # required mask seq length can be calculated via length of past cache
            mask_seq_length = past_key_values_length + seq_length
            attention_mask = torch.ones(batch_size, mask_seq_length, device=input_ids.device)

        attention_mask = create_causal_mask(
            config=self.config,
            input_embeds=hidden_states,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )
        encoder_attention_mask = self._update_cross_attn_mask(
            encoder_hidden_states,
            encoder_attention_mask,
            hidden_states.shape[:2],
            hidden_states,
        )

        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None

        for layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            layer_outputs = layer(
                hidden_states,
                position_embeddings,
                attention_mask,
                encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                past_key_values=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )
            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attns = all_self_attns + (layer_outputs[1],)
                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        hidden_states = self.norm(hidden_states)
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
            cross_attentions=all_cross_attentions,
        )

    def _update_cross_attn_mask(
        self,
        encoder_hidden_states: Union[torch.Tensor, None],
        encoder_attention_mask: Union[torch.Tensor, None],
        input_shape: torch.Size,
        inputs_embeds: torch.Tensor,
    ):
        # expand encoder attention mask
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            if self.config._attn_implementation == "flash_attention_2":
                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
            elif self.config._attn_implementation == "sdpa":
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
                    encoder_attention_mask,
                    inputs_embeds.dtype,
                    tgt_len=input_shape[-1],
                )
            elif self.config._attn_implementation == "flex_attention":
                if isinstance(encoder_attention_mask, torch.Tensor):
                    encoder_attention_mask = make_flex_block_causal_mask(
                        encoder_attention_mask,
                        query_length=input_shape[-1],
                        is_causal=False,
                    )
            else:
                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                encoder_attention_mask = _prepare_4d_attention_mask(
                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
                )

        return encoder_attention_mask
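
# Shape walkthrough for the decoder inputs (values hypothetical): with
# batch_size=2, seq_length=10 and num_channels=9, `input_ids` is (2, 10, 9);
# `DiaMultiChannelEmbedding` folds the channel axis, so `hidden_states` is
# (2, 10, hidden_size), and `cache_position` covers
# [past_length, past_length + 10).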
@auto_docstring(
    custom_intro="""
    The bare Dia model outputting raw hidden-states without any specific head on top.
    """
)
class DiaModel(DiaPreTrainedModel):
    def __init__(self, config: DiaConfig):
        super().__init__(config)
        self.config = config
        self.encoder = DiaEncoder(config.encoder_config)
        self.decoder = DiaDecoder(config.decoder_config)
        self.post_init()

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_position_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Union[tuple, BaseModelOutput]] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, Seq2SeqModelOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        """
        if input_ids is None and encoder_outputs is None:
            raise ValueError(
                "You should either provide text ids or the cached text encodings. Neither has been found."
            )

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if self.is_gradient_checkpointing and self.training and use_cache:
            logger.warning_once(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

        if use_cache and past_key_values is None:
            past_key_values = EncoderDecoderCache(DynamicCache(), DynamicCache())

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                **kwargs,
            )
        elif not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # On default we initialize the decoder with bos tokens if nothing has been provided
        bsz, seq_len, channels = encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels
        if decoder_input_ids is None:
            decoder_input_ids = torch.full(
                size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device
            )

        # Allow for the flattened 2D layout by converting it back to 3D
        if decoder_input_ids.ndim == 2:
            decoder_input_ids = decoder_input_ids.reshape(bsz, channels, seq_len).transpose(1, 2)

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            position_ids=decoder_position_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs[0],
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
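
# A hedged sketch of the 2D -> 3D promotion above: a flattened batch of codebooks
# (batch * channels, seq) is regrouped per channel and then transposed so that the
# channel axis comes last, matching `DiaDecoder`'s expected (batch, seq, channels):
#
#     bsz, channels, seq = 2, 9, 10
#     flat = torch.zeros(bsz * channels, seq, dtype=torch.long)  # (18, 10)
#     three_d = flat.reshape(bsz, channels, -1).transpose(1, 2)  # (2, 10, 9)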
@auto_docstring(
    custom_intro="""
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    """
)
class DiaForConditionalGeneration(DiaPreTrainedModel, DiaGenerationMixin):
    base_model_prefix = "model"

    def __init__(self, config: DiaConfig):
        super().__init__(config)
        self.config = config
        self.model = DiaModel(config)

        self.num_channels = config.decoder_config.num_channels
        self.vocab_size = config.decoder_config.vocab_size
        self.logits_dense = nn.Linear(
            config.decoder_config.hidden_size, self.num_channels * self.vocab_size, bias=False
        )
        self.loss_type = "ForMaskedLM"

        self.post_init()

    def get_encoder(self):
        return self.model.get_encoder()

    def get_decoder(self):
        return self.model.get_decoder()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_position_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        encoder_outputs: Optional[Union[tuple, BaseModelOutput]] = None,
        past_key_values: Optional[EncoderDecoderCache] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> Union[tuple, Seq2SeqLMOutput]:
        r"""
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
            tened audio logits which are used to calculate the loss.

            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
            Dia to calculate embeddings and subsequent steps more efficiently.

            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
            [`DiaProcessor.__call__`] for more details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
            Indices of positions of each input sequence tokens in the position embeddings.
            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in
            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
            are ignored (masked).
        """
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_position_ids=decoder_position_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            cache_position=cache_position,
            **kwargs,
        )

        last_hidden_state = outputs[0]
        batch_size = last_hidden_state.shape[0]
        audio_logits = (
            self.logits_dense(last_hidden_state)
            .view((batch_size, -1, self.num_channels, self.vocab_size))
            .transpose(1, 2)
            .contiguous()
            .view(batch_size * self.num_channels, -1, self.vocab_size)
        )

        loss = None
        if labels is not None:
            loss = self.loss_function(logits=audio_logits, labels=labels, vocab_size=self.vocab_size, **kwargs)

        return Seq2SeqLMOutput(
            loss=loss,
            logits=audio_logits,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )
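
# Logits layout sketch (hypothetical sizes): with batch_size=2, seq=10,
# num_channels=9 and a per-channel vocab of size V, `logits_dense` yields
# (2, 10, 9 * V); the chained view/transpose above turns that into (2 * 9, 10, V),
# so the channel axis is folded into the batch and lines up with flattened
# `labels` of shape (batch * num_codebooks, seq).
#
# Hedged usage sketch (the checkpoint name is an assumption, not pinned by this
# file; see `DiaProcessor` for input preparation):
#
#     from transformers import AutoProcessor, DiaForConditionalGeneration
#
#     processor = AutoProcessor.from_pretrained("nari-labs/Dia-1.6B-0626")
#     model = DiaForConditionalGeneration.from_pretrained("nari-labs/Dia-1.6B-0626")
#     inputs = processor(text=["[S1] Hello there!"], return_tensors="pt")
#     audio_codes = model.generate(**inputs)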

__all__ = ["DiaModel", "DiaPreTrainedModel", "DiaForConditionalGeneration"]