
    <h                     *   S SK JrJrJr  S SKrS SKJr  SSKJr  SSKJ	r	J
r
Jr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJrJr  SSKJrJr  SSKJ r J!r!  SSK"J#r#  SSK$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,J-r-J.r.  SSK/J0r0  \(" 5       (       a  SSK1J2r2  \*Rf                  " \45      r5\& " S S\!5      5       r6 " S S\Rn                  5      r8 " S S\Rn                  5      r9\" S5       " S S\Rn                  5      5       r: " S S\Rn                  5      r;S  r<SCS! jr=S"\R|                  S#\?S$\R|                  4S% jr@ SDS&\Rn                  S'\R|                  S(\R|                  S)\R|                  S*\\R|                     S+\AS,\AS-\#\%   4S. jjrB " S/ S0\Rn                  5      rC " S1 S2\Rn                  5      rD " S3 S4\5      rE " S5 S6\65      rF " S7 S8\5      rG " S9 S:\65      rH\&" S;S<9 " S= S>\65      5       rI\&" S?S<9 " S@ SA\6\05      5       rJ/ SBQrKg)E    )CallableOptionalUnionN)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)use_kernel_forward_from_hub)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_torch_flex_attn_availableis_torchdynamo_compilinglogging   )	DiaConfigDiaDecoderConfigDiaEncoderConfig)DiaGenerationMixin)make_flex_block_causal_maskc                   D    \ rS rSr% \\S'   SrSrSrSr	Sr
SrSrSS/rSrg	)
DiaPreTrainedModel>   configmodelT	input_idsDiaEncoderLayerDiaDecoderLayer N)__name__
__module____qualname____firstlineno__r"   __annotations__base_model_prefixsupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attn_can_compile_fullgraphmain_input_name_no_split_modules__static_attributes__r/       \/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/dia/modeling_dia.pyr(   r(   >   s<    &*#N!!O*,=>r>   r(   c                   n   ^  \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	DiaMultiChannelEmbeddingK   a  In order to efficiently compute the audio embedding from the 9 different channels,
we vectorize the embedding process by using a single embedding layer and an offset.
Example:
- num_embeds = 4
- vocab_size = 8
- num_channels = 3
We would have offsets = [0, 8, 16]
If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
then tokens = audio_codes + offsets
            = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
This allows us to use a single embedding layer for all channels.
r*   c                 v  > [         TU ]  5         [        R                  " UR                  UR
                  -  UR                  5      U l        UR                  U l        UR
                  U l        [        R                  " UR
                  [        R                  S9UR                  -  nU R                  SUSS9  g )NdtypeoffsetsF
persistent)super__init__r   	Embedding
vocab_sizenum_channelshidden_sizeembedtorcharangelongregister_buffer)selfr*   rF   	__class__s      r?   rJ   !DiaMultiChannelEmbedding.__init__Y   s    \\&"3"3f6I6I"I6K]K]^
!--"//,,v22%**EHYHYYYEBr>   audio_codesreturnc                    XR                   R                  UR                  5      -   R                  S5      nU R	                  U5      R                  UR                  S   UR                  S   SU R                  5      nUR                  SS9$ )Nr!   r      dim)	rF   todevicesqueezerO   viewshaperN   sum)rT   rW   tokensembedss       r?   forward DiaMultiChannelEmbedding.forwarda   ss    0B0B CCLLQOF#((a+:K:KA:NPRTXTdTdezzaz  r>   )rO   rN   rM   )r0   r1   r2   r3   __doc__r#   rJ   rP   Tensorrf   r=   __classcell__rU   s   @r?   rA   rA   K   s7    C/ C!5<< !ELL ! !r>   rA   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )DiaMLPg   c                    > [         TU ]  5         Xl        [        R                  " UR
                  SUR                  -  SS9U l        [        R                  " UR                  UR
                  SS9U l        [        UR                     U l        g )Nr[   Fbias)rI   rJ   r*   r   LinearrN   intermediate_sizegate_up_proj	down_projr   
hidden_actactivation_fnrT   r*   rU   s     r?   rJ   DiaMLP.__init__h   sn    IIf&8&8!f>V>V:V]bc6#;#;V=O=OV[\#F$5$56r>   hidden_statesrX   c                     U R                  U5      nUR                  SSS9u  p2X R                  U5      -  nU R                  U5      $ )Nr[   rZ   r\   )rt   chunkrw   ru   )rT   rz   	up_statesgates       r?   rf   DiaMLP.forwardp   sH    %%m4	#//!/4 2 24 88	~~i((r>   )rw   r*   ru   rt   )
r0   r1   r2   r3   rJ   rP   FloatTensorrf   r=   rj   rk   s   @r?   rm   rm   g   s,    7)U%6%6 )5;L;L ) )r>   rm   RMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )
DiaRMSNormy   c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z)
DiaRMSNorm is equivalent to T5LayerNorm
N)rI   rJ   r   	ParameterrP   onesweightvariance_epsilon)rT   rN   epsrU   s      r?   rJ   DiaRMSNorm.__init__{   s/     	ll5::k#:; #r>   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr[   rZ   T)keepdim)	rE   r^   rP   float32powmeanrsqrtr   r   )rT   rz   input_dtypevariances       r?   rf   DiaRMSNorm.forward   sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r>   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)tupler   rb   r   rT   s    r?   
extra_reprDiaRMSNorm.extra_repr   s*    ))*+6$2G2G1HIIr>   )r   r   )gư>)	r0   r1   r2   r3   rJ   rf   r   r=   rj   rk   s   @r?   r   r   y   s    $;J Jr>   r   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )DiaRotaryEmbedding   r*   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultinv_freqFrG   )rI   rJ   hasattr
isinstancer   dictgetr   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr*   r   rope_init_fnattention_scalingrS   r   original_inv_freq)rT   r*   r_   r   rU   s       r?   rJ   DiaRotaryEmbedding.__init__   s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r>   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rZ   r!   mpscpuF)device_typeenabledr[   r\   rD   )r   floatexpandrb   r^   r_   r   r   strrP   autocast	transposecatcosr   sinrE   )
rT   xposition_idsinv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r?   rf   DiaRotaryEmbedding.forward   sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   r*   r   r   r   r   r   N)r0   r1   r2   r3   r"   rJ   rP   no_gradr   rf   r=   rj   rk   s   @r?   r   r      s6    /y / /" ]]_<  <r>   r   c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..NrZ   r[   r\   )rb   rP   r   )r   x1x2s      r?   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r>   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)	unsqueezer   )qkr   r   r   unsqueeze_dimq_embedk_embeds           r?   apply_rotary_pos_embr      sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr>   rz   n_reprX   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r!   N)rb   r   reshape)rz   r   batchnum_key_value_headsslenhead_dims         r?   	repeat_kvr      s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr>   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr[   r   rZ   )r]   rE   )ptrainingr!   )r   num_key_value_groupsrP   matmulr   rb   r   
functionalsoftmaxr   r^   rE   r   r   
contiguous)r   r   r   r   r   r   r   r   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r?   eager_attention_forwardr      s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r>   c                   0  ^  \ rS rSrSrSS\\\4   S\S\	4U 4S jjjr
  SS\R                  S\\R                  \R                  4   S	\\R                     S
\\   S\\R                      S\\   S\\R                  \R                  4   4S jjrSrU =r$ )DiaSelfAttention   =Multi-headed attention from 'Attention Is All You Need' paperr*   	layer_idx	is_causalc                   > [         TU ]  5         Xl        X l        UR                  U l        U R                  R
                  U l        U R                  R                  =(       d    U R                  U l        U R                  U R                  -  U l        [        USUR                  U R                  -  5      U l
        SU l        SU l        X0l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  U R                  -  SS9U l        [        R                  " U R                  U R                  -  U R                  SS9U l        g )Nr   r!           Frp   )rI   rJ   r*   r   rN   num_attention_heads	num_headsr   r   getattrr   r   attention_dropoutr   r   rr   q_projk_projv_projo_proj)rT   r*   r   r   rU   s       r?   rJ   DiaSelfAttention.__init__   s@   "!--88#';;#B#B#Tdnn $(NNd6N6N$N!
F4F4F$..4XY!$"ii 0 0$..4==2PW\]ii 0 0$2J2JT]]2Zafgii 0 0$2J2JT]]2Zafgii >@P@PW\]r>   rz   position_embeddingsr   past_key_valuecache_positionr   rX   c                 4   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       d  SOU R                  U R                   S.UD6u  nnUR"                  " / UQSP76 R%                  5       nU R'                  U5      nUU4$ )NrZ   r!   r[   )r   r   r  eagerr   )r   r   )rb   r   r   ra   r   r   r   r   updater   r   r*   _attn_implementationr   r   r   r   r   r   r   )rT   rz   r   r   r  r  r   input_shapehidden_shapequery_statesr   r   r   r   cache_kwargsattention_interfacer   r   s                     r?   rf   DiaSelfAttention.forward  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r>   )r   r*   r   rN   r   r   r   r   r   r   r   r   r   r   )FNN)r0   r1   r2   r3   rh   r   r$   r#   intboolrJ   rP   ri   r   r   r	   
LongTensorr   r   rf   r=   rj   rk   s   @r?   r   r      s    G^u%57G%GH ^UX ^ei ^ ^. +/59))||)) #5<<#=>)) !.	))
 !)) !!1!12)) +,)) 
u||U\\)	*)) ))r>   r   c                      ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\R                  S\
\R                     S	\
\   S
\\   S\\R                  \
\R                     4   4S jjrSrU =r$ )DiaCrossAttentioni9  r   r*   r   c                 R  > [         TU ]  5         Xl        X l        UR                  U l        UR
                  U l        U R                  R                  U l        U R                  R                  U l	        U R                  U R                  -  U l
        UR                  U l        SU l        SU l        SU l        [         R"                  " U R                  U R                  U R                  -  SS9U l        [         R"                  " U R
                  U R                  U R                  -  SS9U l        [         R"                  " U R
                  U R                  U R                  -  SS9U l        [         R"                  " U R                  U R                  -  U R                  SS9U l        g )Nr!   r   Frp   )rI   rJ   r*   r   rN   cross_hidden_sizecross_num_attention_headsr   cross_num_key_value_headsr   r   cross_head_dimr   r   r   r   r   rr   r   r   r   r   rT   r*   r   rU   s      r?   rJ   DiaCrossAttention.__init__<  s;   "!--!'!9!9>>#';;#H#H $(NNd6N6N$N!--!$ii 0 0$..4==2PW\]ii 6 68P8PSWS`S`8`glmii 6 68P8PSWS`S`8`glmii >@P@PW\]r>   rz   cross_attention_statesr   past_key_valuesr   rX   c                 n   UR                   S S n/ UQSPU R                  P7n/ UR                   S S QSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	Ub%  UR
                  R                  U R                  5      OSn
Ubb  U
(       a[  UR                  R                  U R                     R                  nUR                  R                  U R                     R                  nOU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      nUbB  UR                  R                  UUU R                  5      u  pSUR
                  U R                  '   [        nU R                   R"                  S:w  a  [$        U R                   R"                     nU" U U	UUU4SU R&                  0UD6u  pUR)                  / UQSP75      R+                  5       nU R-                  U5      nX4$ )NrZ   r!   r[   FTr  r   )rb   r   r   ra   r   
is_updatedr   r   cross_attention_cachelayerskeysvaluesr   r   r  r   r*   r  r   r   r   r   r   )rT   rz   r  r   r  r   r  r  cross_shaper	  r  r   r   r  r   r   s                   r?   rf   DiaCrossAttention.forwardO  s    $))#2.88b8$--8M.44Sb9M2Mt}}M{{=166|DNNqRSTGVGb_//33DNNChm
&:(>>EEdnnUZZJ*@@GGW^^L%;<AA+NXXYZ\]^J;;'=>CCKPZZ[\^_`L*+:+P+P+W+W NN,(
 >B**4>>:(?;;++w6"9$++:Z:Z"[$7%
 LL%
 %
! "))*<K*<*<=HHJkk+.((r>   )r   r*   r  r   rN   r   r   r   r   r   r   r   r   r   r   r  )r0   r1   r2   r3   rh   r#   r  rJ   rP   ri   r   r   r   r   r   rf   r=   rj   rk   s   @r?   r  r  9  s    G^/ ^C ^. 269=1)||1) !&1) !.	1)
 ""561) -.1) 
u||Xell33	41) 1)r>   r  c                      ^  \ rS rSrS\S\4U 4S jjr  SS\R                  S\	\
\R                  \R                  4      S\	\R                     S\\   S	\
\R                  \	\R                     4   4
S
 jjrSrU =r$ )r-   i  r*   r   c                    > [         TU ]  5         [        UR                  UR                  S9U l        [        XSS9U l        [        UR                  UR                  S9U l        [        U5      U l
        g )Nr   Fr   )rI   rJ   r   rN   norm_epspre_sa_normr   self_attentionpost_sa_normrm   mlpr  s      r?   rJ   DiaEncoderLayer.__init__  sZ    %f&8&8fooN.vER&v'9'9vO&>r>   rz   r   r   r   rX   c                     UnU R                  U5      nU R                  " U4UUS.UD6u  pxXW-   nUnU R                  U5      nU R                  U5      n	XY-   nX4$ )Nr   r   )r)  r*  r+  r,  )
rT   rz   r   r   r   residualnormed_statesself_attn_outputself_attn_weightsmlp_outs
             r?   rf   DiaEncoderLayer.forward  s     !((7.2.A.A/
 3)/
 	/
+ !3 ))-8((=) *//r>   )r,  r+  r)  r*  r  )r0   r1   r2   r3   r$   r  rJ   rP   ri   r   r   r   r   rf   r=   rj   rk   s   @r?   r-   r-     s    "/ "C " LP15	0||0 &eELL%,,,F&GH0 !.	0
 -.0 
u||Xell33	40 0r>   r-   c                      ^  \ rS rSrS\4U 4S jjr\\   SS\R                  S\
\R                     S\
\   S\
\   S	\\   S
\\\4   4S jj5       5       rS\\R                  S4   S\R                  4S jrSrU =r$ )
DiaEncoderi  r*   c           	        > [         TU ]  U5        Xl        [        R                  " UR
                  UR                  5      U l        [        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        [        UR                  UR                  S9U l        [!        U5      U l        g s  snf Nr&  )rI   rJ   r*   r   rK   rL   rN   	embedding
ModuleListrangenum_hidden_layersr-   r  r   r(  normr   rotary_embeddingsr  s      r?   rJ   DiaEncoder.__init__  s     f&7&79K9KLmmAFvG_G_A`aA`I_V/A`a
 v11vG	!3F!; bs   .CNr,   r   output_attentionsoutput_hidden_statesr   rX   c                    U R                  U5      n[        R                  " UR                  S   UR                  S9S S S 24   nU R                  Xg5      nU R                  UU5      nU(       a  SOS n	U(       a  SOS n
U R                   H1  nU(       a  X4-   n	U" U4UUS.UD6nUS   nU(       d  M)  XS   4-   n
M3     U R                  U5      nU(       a  X4-  n	[        XiU
S9$ )NrZ   r_   r/   r/  r   r!   last_hidden_staterz   
attentions)
r:  rP   rQ   rb   r_   r?  _update_full_maskr  r>  r   )rT   r,   r   rA  rB  r   rz   r   r   encoder_statesall_attentionsencoder_layerlayer_outputss                r?   rf   DiaEncoder.forward  s    y1
 ||IOOB$7	@P@PQRVXYRYZ"44]Q//

  40d![[M#!/2B!B)$7- 	M *!,M  !/3C2E!E ) 		-0..N+Vd
 	
r>   inputs_embedsc                 r   Ub  U R                   R                  S:X  a  SU;   a  UnU$ S nU$ U R                   R                  S:X  a  [        XR                  5      nU$ U R                   R                  S:X  a+  [	        U[
        R                  5      (       a
  [        USS9nU$ [        XR                  5      nU$ )Nflash_attention_2r   sdpaflex_attentionFr'  	r*   r  r   rE   r   rP   ri   r&   r   )rT   r   rN  s      r?   rH  DiaEncoder._update_full_mask  s    
 %{{//3FF343F  MQ  11V; "E^UhUh!i  115EEnell;;%@[`%aN
  "<NL_L_!`r>   )r*   r:  r  r>  r?  )NFF)r0   r1   r2   r3   r$   rJ   r   r   rP   ri   r   r  r   r   r   r   r   rf   rH  r=   rj   rk   s   @r?   r7  r7    s    	</ 	<  26,1/4.
<<.
 !..
 $D>	.

 'tn.
 -..
 
%	&.
  .
bellD01 || r>   r7  c                   |  ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\	\
\R                  \R                  4      S\	\R                     S\	\R                     S	\	\R                     S
\	\   S\	\R                     S\
\R                  \	\R                     \	\R                     4   4S jjrSrU =r$ )r.   i  r*   r   c                 t  > [         TU ]  5         UR                  U l        [	        XSS9U l        [        X5      U l        [        UR                  UR                  S9U l
        [        UR                  UR                  S9U l        [        UR                  UR                  S9U l        [        U5      U l        g )NTr'  r&  )rI   rJ   rN   	embed_dimr   r*  r  cross_attentionr   r(  r)  pre_ca_normpre_mlp_normrm   r,  r  s      r?   rJ   DiaDecoderLayer.__init__  s    ++.vDQ0C%f&8&8fooN%f&8&8fooN&v'9'9vO&>r>   rz   r   r   encoder_hidden_statesencoder_attention_maskr  r  rX   c                 d   Un	[        U	[        5      (       a  U	R                  n	Un
U R                  U5      nU R                  " UUUU	4SU0UD6u  pX-   nUn
U R                  U5      nU R                  " UU4UUS.UD6u  pX-   nUn
U R                  U5      nU R                  U5      nU
U-   nXU4$ )Nr  )r   r  )	r   r   self_attention_cacher)  r*  rY  rX  rZ  r,  )rT   rz   r   r   r\  r]  r  r  r   self_attn_cacher0  r1  r2  r3  cross_statescross_attn_weightsr4  s                    r?   rf   DiaDecoderLayer.forward  s     *o':;;-BBO ((7.2.A.A 	/
 *	/
 	/
+ !3 ((7+/+?+?!,
 2+	,

 ,
( !/ ))-8((=) 7*1CCCr>   )rX  rW  r,  rY  rZ  r)  r*  )NNNNNN)r0   r1   r2   r3   r#   r  rJ   rP   ri   r   r   r   r  rf   r=   rj   rk   s   @r?   r.   r.     s    "/ "C " LP158<9=9=59-D||-D &eELL%,,,F&GH-D !.	-D
  (5-D !) 6-D ""56-D !!1!12-D 
u||Xell3Xell5KK	L-D -Dr>   r.   c                     ^  \ rS rSrSrS\4U 4S jjr\\        SS\	R                  S\\	R                     S\\	R                     S	\\	R                     S
\\	R                     S\\   S\\   S\\   S\\	R                     S\\\4   4S jj5       5       rS	\\	R                  S4   S
\\	R                  S4   S\	R(                  S\	R                  4S jrSrU =r$ )
DiaDecoderi5  z-Transformer Decoder Stack using DenseGeneral.r*   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [	        U5      U l        [        U5      U l        [        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        [        UR                  UR                   S9U l        g s  snf r9  )rI   rJ   rM   rL   rA   
embeddingsr   r?  r   r;  r<  r=  r.   r  r   rN   r(  r>  r  s      r?   rJ   DiaDecoder.__init__8  s     "// ++26:!3F!;mmAFvG_G_A`aA`I_V/A`a
 v11vG	 bs   :B>Nr,   r   r   r\  r]  r  rA  rB  r  rX   c
           	      "   UR                  5       SS u  pUb  UR                  5       OSnU	c"  [        R                  " XU-   UR                  S9n	Uc	  U	SSS24   nU R                  U5      nU R                  X5      nUc3  [        5       (       d$  X-   n[        R                  " UUUR                  S9n[        U R                  UUU	UUS9nU R                  UUUR                  SS U5      nU(       a  SOSnU(       a  SOSnU(       a  Ub  SOSnU R                   HE  nU(       a  UU4-  nU" UUUU4UUU	S.U
D6nUS   nU(       d  M.  UUS	   4-   nUc  M<  UUS   4-   nMG     U R                  U5      nU(       a  UU4-  n[        UUUUUS
9$ )z
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
    The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.

    [What are input IDs?](../glossary#input-ids)
NrZ   r   rD  )r*   input_embedsr   r  r  r   r[   r/   )r]  r  r  r!   )rF  r  rz   rG  cross_attentions)sizeget_seq_lengthrP   rQ   r_   rg  r?  r   r   r   r*   _update_cross_attn_maskrb   r  r>  r   )rT   r,   r   r   r\  r]  r  rA  rB  r  r   
batch_size
seq_lengthpast_key_values_lengthrz   r   mask_seq_lengthall_hidden_statesall_self_attnsall_cross_attentionslayerrL  s                         r?   rf   DiaDecoder.forwardC  s   , "+!1#2!6
ETE`!?!?!Afg!"\\&(KT]TdTdN )$'2L 	2"44]Q!*B*D*D4AO"ZZ
OIL\L\]N+;;&))+%
 "&!=!=!"#	"
 #7BD0d&7<Q<]rdh[[E#!m%55!!#%		
 (> /-	 	M *!,M  !/=3C2E!E(4+?=QRCSBU+U() !, 		-0-!118+++%1
 	
r>   r  rN  c                    Ub  Ub  U R                   R                  S:X  a  SU;   a  UnU$ S nU$ U R                   R                  S:X  a  [        UUR                  US   S9nU$ U R                   R                  S:X  a/  [	        U[
        R                  5      (       a  [        UUS   SS9nU$ [        X$R                  US   S9nU$ )	NrP  r   rQ  rZ   )tgt_lenrR  F)query_lengthr   rS  )rT   r\  r]  r  rN  s        r?   rn  "DiaDecoder._update_cross_attn_mask  s     !,1G1S{{//3FFCDH^C^)?&. &%/ ei&. &%- 11V; *M*!'''O*&$ &% 115EE4ellCC-H.%0_"'.* &%	 *D*,?,?UW*& &%r>   )rg  r  r>  rM   r?  rL   )NNNNNFFN)r0   r1   r2   r3   rh   r#   rJ   r   r   rP   ri   r   r  r   r   r  r   r   r   rf   Sizern  r=   rj   rk   s   @r?   re  re  5  s`   7	H/ 	H  4815=A=A9=,1/459Z
<<Z
 u//0Z
 !.	Z

  ((9(9:Z
 !))9)9 :Z
 ""56Z
 $D>Z
 'tnZ
 !!1!12Z
 
8%?	@Z
  Z
z!&$U\\4%78!& !&ellD&8 9!& ZZ	!&
 ||!& !&r>   re  z[
    The bare Dia model outputting raw hidden-states without any specific head on top.
    )custom_introc                   r  ^  \ rS rSrS\4U 4S jjrS rS r\\	           SS\
\R                     S\
\R                     S\
\R                     S	\
\R                     S
\
\R                     S\
\\\4      S\
\   S\
\   S\
\   S\
\   S\
\R                     S\\\4   4S jj5       5       rSrU =r$ )DiaModeli  r*   c                    > [         TU ]  U5        Xl        [        UR                  5      U l        [        UR                  5      U l        U R                  5         g r   )
rI   rJ   r*   r7  encoder_configencoderre  decoder_configdecoder	post_initrx   s     r?   rJ   DiaModel.__init__  sC     !&"7"78!&"7"78r>   c                     U R                   $ r   )r  r   s    r?   get_encoderDiaModel.get_encoder      ||r>   c                     U R                   $ r   )r  r   s    r?   get_decoderDiaModel.get_decoder  r  r>   r,   r   decoder_input_idsdecoder_position_idsdecoder_attention_maskencoder_outputsr  	use_cacherA  rB  r  rX   c                    Uc  Uc  [        S5      eU	b  U	OU R                  R                  n	U
b  U
OU R                  R                  n
Ub  UOU R                  R                  nU R
                  (       a/  U R                  (       a  U(       a  [        R                  S5        SnU(       a  Uc  [        [        5       [        5       5      nUc  U R                  " SUUU	U
S.UD6nOK[        U[        5      (       d6  [        US   [        U5      S:  a  US   OS[        U5      S:  a  US   OSS	9nUS   R                  S   S
U R                  R                   R"                  pnUc7  [$        R&                  " USU4U R                  R(                  U R*                  S9nUR,                  S:X  a"  UR/                  XU5      R1                  SS5      nU R2                  " SUUUUS   UUU	U
UUS.
UD6n[5        UR6                  UR8                  UR:                  UR<                  UR>                  US   UR:                  UR<                  S9$ )a  
decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
    1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
    the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
    tened audio logits which are used to calculate the loss.

    2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
    Dia to calculate embeddings and subsequent steps more efficiently.

    If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
    `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
    [`DiaProcessor.__call__`] for more details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

    [What are position IDs?](../glossary#position-ids)
NzXYou should either provide text ids or the cached text encodings. Neither has been found.zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...F)r,   r   rA  rB  r   r!   r[   rE  rZ   )rl  
fill_valuer_   )
r,   r   r   r\  r]  r  rA  rB  r  r  )rF  r  decoder_hidden_statesdecoder_attentionsrk  encoder_last_hidden_stater\  encoder_attentionsr/   ) 
ValueErrorr*   rA  rB  r  is_gradient_checkpointingr   loggerwarning_oncer   r
   r  r   r   lenrb   r  rM   rP   fullbos_token_idr_   ndimr   r   r  r   rF  r  rz   rG  rk  )rT   r,   r   r  r  r  r  r  r  rA  rB  r  r   bszseq_lenchannelsdecoder_outputss                    r?   rf   DiaModel.forward  sX   N !8j  2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	))dmm##p "	01,.,.QO""ll #-"3%9	
 O O_==-"1!"4474H14Loa0RV14_1E1I?1-tO #2!"4":":1"=r4;;C]C]CjCjh$ %

1h'DKK4L4LUYU`U`! !!Q& 1 9 9# Q [ [\]_` a,, 
'-1"1!"4#1+/!5)
 
 "-??+;;"1"?"?.99,==&5a&8"1"?"?.99	
 		
r>   )r*   r  r  )NNNNNNNNNNN)r0   r1   r2   r3   r"   rJ   r  r  r   r   r   rP   r  r   r   r   r   r  r   rf   r=   rj   rk   s   @r?   r  r    sJ   y   15598<;?=ACG9=$(,0/359k
E,,-k
 !!1!12k
 $E$4$45	k

 'u'7'78k
 !))9)9 :k
 "%(>"?@k
 ""56k
 D>k
 $D>k
 'tnk
 !!1!12k
 
u((	)k
  k
r>   r  zl
    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
    c                     ^  \ rS rSrSrS\4U 4S jjrS rS r\	\
            SS\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\\\4      S\\   S\\   S\\   S\\   S\\R                     S\\R                     S\\\4   4S jj5       5       rSrU =r$ )DiaForConditionalGenerationiI  r+   r*   c                 v  > [         TU ]  U5        Xl        [        U5      U l        UR
                  R                  U l        UR
                  R                  U l        [        R                  " UR
                  R                  U R                  U R                  -  SS9U l        SU l        U R                  5         g )NFrp   ForMaskedLM)rI   rJ   r*   r  r+   r  rM   rL   r   rr   rN   logits_dense	loss_typer  rx   s     r?   rJ   $DiaForConditionalGeneration.__init__Q  s     f%
"11>> //::II!!--0A0ADOO0S[`
 ' 	r>   c                 6    U R                   R                  5       $ r   )r+   r  r   s    r?   r  'DiaForConditionalGeneration.get_encoder`      zz%%''r>   c                 6    U R                   R                  5       $ r   )r+   r  r   s    r?   r  'DiaForConditionalGeneration.get_decoderc  r  r>   r,   r   r  r  r  r  r  r  rA  rB  labelsr  rX   c                 X   U R                   " S	UUUUUUUUU	U
US.UD6nUS   nUR                  S   nU R                  U5      R                  USU R                  U R
                  45      R                  SS5      R                  5       R                  UU R                  -  SU R
                  5      nSnUb  U R                  " S	UXR
                  S.UD6n[        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                   S9	$ )
a   
decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
    1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
    the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
    tened audio logits which are used to calculate the loss.

    2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
    Dia to calculate embeddings and subsequent steps more efficiently.

    If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
    `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
    [`DiaProcessor.__call__`] for more details.

    [What are decoder input IDs?](../glossary#decoder-input-ids)
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.

    [What are position IDs?](../glossary#position-ids)
labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in
    `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
    are ignored (masked).
)r,   r   r  r  r  r  r  r  rA  rB  r  r   rZ   r!   r[   N)logitsr  rL   )	lossr  r  r  r  rk  r  r\  r  r/   )r+   rb   r  ra   rM   rL   r   r   loss_functionr   r  r  r  rk  r  r\  r  )rT   r,   r   r  r  r  r  r  r  rA  rB  r  r  r   outputsrF  ro  audio_logitsr  s                      r?   rf   #DiaForConditionalGeneration.forwardf  sA   X ** 
)/!5#9++/!5)
 
 $AJ&,,Q/
 /0T:r4#4#4dooFGYq!_Z\T*t000"dooF 	 %%o\&UdUdohnoD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r>   )r*   r  r  r+   rM   rL   )NNNNNNNNNNNN)r0   r1   r2   r3   r5   r"   rJ   r  r  r   r   r   rP   r  r   r   r   r   r  r   rf   r=   rj   rk   s   @r?   r  r  I  sj     y ((  15598<;?=ACG9=$(,0/3-159R
E,,-R
 !!1!12R
 $E$4$45	R

 'u'7'78R
 !))9)9 :R
 "%(>"?@R
 ""56R
 D>R
 $D>R
 'tnR
 ))*R
 !!1!12R
 
uo%	&R
  R
r>   r  )r  r(   r  )Nr!   )r   )Ltypingr   r   r   rP   r   activationsr   cache_utilsr	   r
   r   integrationsr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r    configuration_diar"   r#   r$   generation_diar%   integrations.flex_attentionr&   
get_loggerr0   r  r(   ModulerA   rm   r   r   r   r   ri   r  r   r   r   r   r  r-   r7  r.   re  r  r  __all__r/   r>   r?   <module>r     sP  , - ,   ! C C 7 / g B 9  L F &  M L .  !!J 
		H	% 	? 	? 	?!ryy !8)RYY )$ Y'J J (J(< <D(6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4>)ryy >)BG)		 G)T00 0BS# Sl8D0 8DvN&# N&b 
{
! {

{
| 
l
"46H l

l
^ Lr>   