
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.nn as nn

from transformers.utils.generic import check_model_inputs

from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
from ..auto import AutoModel
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaModel,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
    TransformersKwargs,
)
from .configuration_csm import CsmConfig, CsmDepthDecoderConfig
from .generation_csm import CsmGenerationMixin


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for the model autoregressive outputs.
    """
)
class CsmOutputWithPast(ModelOutput):
    r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction) of the backbone model.
Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss )__name__
__module____qualname____firstlineno____doc__r&   r   torchFloatTensor__annotations__r'   r(   tupler)   r*   r+   r,   r-   r.   r/   r0   __static_attributes__r1       [/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/csm/modular_csm.pyr$   r$   1   s'   < )-D(5$$
%, $FE$AEOXeE%*;*;$<=>E=AM8E%"3"3S"89:A:>Ju00#567>6:!2!23:.2%++2OS!8E%8I8I2J,K#LSKO%0A0A30F*G!HOHLhuU->->-C'DEL15M8E--.5r<   r$   c                       \ rS rSrSrg)
CsmRMSNormd   r1   Nr2   r3   r4   r5   r;   r1   r<   r=   r?   r?   d       r<   r?   c                       \ rS rSrSrg)CsmRotaryEmbeddingh   r1   NrA   r1   r<   r=   rD   rD   h   rB   r<   rD   c                       \ rS rSrSrg)CsmMLPl   r1   NrA   r1   r<   r=   rG   rG   l   rB   r<   rG   c                       \ rS rSrSrg)CsmAttentionp   r1   NrA   r1   r<   r=   rJ   rJ   p   rB   r<   rJ   c                       \ rS rSrSrg)CsmDecoderLayert   r1   NrA   r1   r<   r=   rM   rM   t   rB   r<   rM   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    c                   b   ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSr\\S.rU 4S jrS	rU =r$ )
CsmPreTrainedModelx   configmodelTrM   r(   )r)   r*   c                   > [         TU ]  U5        [        U[        5      (       a]  UR                  n[        US-
  5       H>  nUR                  R                  U   R                  SU R                  R                  S9  M@     g g )Nr   g        )meanstd)super_init_weights
isinstanceCsmCodebooksHeadnum_codebooksrangeweightdatanormal_rR   initializer_range)selfmoduler[   i	__class__s       r=   rX    CsmPreTrainedModel._init_weights   sn    f%f.//"00M=1,-""1%--3DKK<Y<Y-Z . 0r<   r1   )r2   r3   r4   r5   r   r9   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendrM   rJ   _can_record_outputsrX   r;   __classcell__rd   s   @r=   rP   rP   x   s\     &*#*+#4"5N ""&("
[ [r<   rP   c                   @  ^  \ rS rSr% \\S'   U 4S jr\\        SS\	R                  S\\	R                     S\\	R                     S\\	R                     S\\   S	\\	R                     S
\\   S\\	R                     S\\   S\\\4   4S jj5       5       rSrU =r$ )CsmDepthDecoderModel   rR   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  -  UR                  5      U l        [        R                  " UR                  UR                  SS9U l
        g NF)bias)rW   __init__nn	Embeddingr[   
vocab_sizebackbone_hidden_sizeembed_tokensLinearhidden_sizeinputs_embeds_projectorra   rR   rd   s     r=   rw   CsmDepthDecoderModel.__init__   s]     LL&*>*>ARAR*RU[UpUpq')yy1L1LfN`N`gl'm$r<   	input_idsbackbone_last_hidden_stateattention_maskposition_idsr(   inputs_embeds	use_cachecache_positionkwargsreturnc	                 (   Ub:  [         R                  R                  5       (       d  [        R	                  S5        SnUSL USL-  (       a  [        S5      eU(       a  Uc
  [        5       nUci  Ub  UR                  5       OSn
Ub  UR                  S   OUR                  S   nUb  UR                  OUR                  n[         R                  " XU-   US9nUc  [         R                  " US-
  SS9nXR                  -  nU R                  X-   5      nUS   S:H  nUb	  X&SS2S4'   O?[         R                  R                  5       (       d  U(       a  [        R                  S5        U R                  U5      n[!        U R"                  UUUUUS	9nUnUR%                  S5      nU R'                  UU5      nU R(                  SU R"                  R*                    H  nU" U4UUUUUUS
.U	D6nM     U R-                  U5      n[/        UU(       a  US9$ SS9$ )a*  
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
        """
        if position_ids is not None:
            if not torch.compiler.is_compiling():
                logger.warning_once(
                    "Custom `position_ids` were provided but will be ignored. CSM depth decoder automatically "
                    "determines position_ids from `cache_position` and as it requires them to be identical across "
                    "the batch, the provided position_ids will be ignored."
                )
            position_ids = None

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            inputs_seq_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_seq_length, device=device)

        if inputs_embeds is None:
            # each codebook position indexes its own slice of the flattened embedding table
            codebook_idxs = torch.clamp(cache_position - 1, min=0)
            offset = codebook_idxs * self.vocab_size
            inputs_embeds = self.embed_tokens(input_ids + offset)

            input_ids_are_first_codebook = cache_position[0] == 0
            if backbone_last_hidden_state is not None:
                inputs_embeds[:, 0] = backbone_last_hidden_state
            elif not torch.compiler.is_compiling() and input_ids_are_first_codebook:
                logger.warning(
                    "When the first codebook token is provided, `backbone_last_hidden_state` should also be "
                    "provided for correct inference."
                )

        inputs_embeds = self.inputs_embeds_projector(inputs_embeds)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
        )

        hidden_states = inputs_embeds

        # the depth decoder requires position_ids to be identical across the batch, hence cache_position is used
        position_ids = cache_position.unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


class CsmCodebooksHead(nn.Module):
    def __init__(self, hidden_size, num_codebooks, vocab_size):
        super().__init__()
        self.num_codebooks = num_codebooks
        self.weight = nn.Parameter(torch.empty(self.num_codebooks - 1, hidden_size, vocab_size))

    def forward(self, hidden_states, cache_position=None):
        if cache_position is None:
            seq_length = hidden_states.shape[1]
            codebook_weight = self.weight[torch.arange(seq_length)]
        else:
            # position 0 is produced by the backbone model, so position i uses weight[i - 1]
            codebook_idxs = cache_position - 1
            codebook_weight = self.weight[codebook_idxs]

        hidden_states = [
            nn.functional.linear(hidden_states[:, codebook_idx, :], codebook_weight[codebook_idx].T)
            for codebook_idx in range(codebook_weight.shape[0])
        ]
        hidden_states = torch.stack(hidden_states, dim=1)

        return hidden_states


@auto_docstring(
    custom_intro="""
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen as a position-specific language modeling head, allowing the use of a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    """
)
class CsmDepthDecoderForCausalLM(LlamaForCausalLM, GenerationMixin):
    _tied_weights_keys = None
    _tp_plan = None
    _pp_plan = None

    def __init__(self, config):
        super().__init__(config)
        del self.lm_head
        self.codebooks_head = CsmCodebooksHead(config.hidden_size, config.num_codebooks, config.vocab_size)
        self.model = CsmDepthDecoderModel(config)

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values, attention_mask, inputs_embeds, cache_position, **kwargs
        )

        # `backbone_last_hidden_state` is only used on the first generation step, when the first
        # codebook token (generated by the backbone model) is fed to the depth decoder
        is_first_generation_step = model_inputs["cache_position"][0] == 0
        if not is_first_generation_step:
            model_inputs.pop("backbone_last_hidden_state")

        # the depth decoder infers position_ids from cache_position
        model_inputs.pop("position_ids")

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        backbone_last_hidden_state: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        outputs = self.model(
            input_ids=input_ids,
            backbone_last_hidden_state=backbone_last_hidden_state,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]

        if isinstance(logits_to_keep, int):
            if logits_to_keep == 0:
                # skip index 0 logits: they correspond to the position fed with the backbone last hidden state
                slice_indices = slice(1, None)
            else:
                slice_indices = slice(-logits_to_keep, None)
        else:
            slice_indices = logits_to_keep

        logits = self.codebooks_head(
            hidden_states[:, slice_indices, :], cache_position[slice_indices] if cache_position is not None else None
        )
        logits = logits.contiguous()

        loss = None
        if labels is not None:
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_function(
                logits=logits, labels=None, vocab_size=self.config.vocab_size, shift_labels=shift_labels, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class CsmBackboneModelEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_audio_tokens = nn.Embedding((config.num_codebooks * config.vocab_size), config.hidden_size)
        self.register_buffer(
            "audio_tokens_offsets", torch.arange(config.num_codebooks) * config.vocab_size, persistent=False
        )

    def forward(self, input_ids):
        input_embeds = self.embed_audio_tokens(input_ids + self.audio_tokens_offsets)
        input_embeds = input_embeds.sum(dim=2)
        return input_embeds


class CsmBackboneModel(LlamaModel):
    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = CsmBackboneModelEmbeddings(config)

    @check_model_inputs
    @auto_docstring
    def forward(self, **super_kwargs):
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
    1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
    requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

    2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
        """
        return super().forward(**super_kwargs)
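

# The two sketches below are illustrative only and are not part of the modeling code. With made-up
# sizes, they demonstrate two mechanisms implemented above and below: the per-position linear heads
# of `CsmCodebooksHead`, and the right-shifted depth decoder inputs that
# `CsmForConditionalGeneration.forward` builds during training.
def _codebooks_head_sketch():  # pragma: no cover - documentation-only
    head = CsmCodebooksHead(hidden_size=8, num_codebooks=3, vocab_size=11)
    hidden_states = torch.randn(2, 2, 8)  # (batch_size, positions, hidden_size)
    # position i uses head.weight[i - 1], since position 0 is produced by the backbone model
    logits = head(hidden_states, cache_position=torch.tensor([1, 2]))
    assert logits.shape == (2, 2, 11)  # one vocab distribution per position, each from its own head


def _depth_decoder_inputs_sketch():  # pragma: no cover - documentation-only
    num_codebooks = 4  # made up; real checkpoints use more codebooks
    frame_labels = torch.tensor([[5, 6, 7, 8]])  # codebook targets for one audio frame
    # drop the last codebook and pad position 0 with a placeholder, which the depth decoder
    # replaces with the backbone's last hidden state
    depth_decoder_input_ids = nn.functional.pad(frame_labels[..., : num_codebooks - 1], (1, 0), value=0)
    assert depth_decoder_input_ids.tolist() == [[0, 5, 6, 7]]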


@auto_docstring(
    custom_intro="""
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    """
)
class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
    _tied_weights_keys = [
        "backbone_model.embed_tokens.embed_audio_tokens.weight",
        "depth_decoder.model.embed_tokens.weight",
    ]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.embed_text_tokens = nn.Embedding(config.text_vocab_size, config.hidden_size)
        self.backbone_model = CsmBackboneModel._from_config(config)
        self.depth_decoder = CsmDepthDecoderForCausalLM._from_config(config.depth_decoder_config)
        self.codec_model = AutoModel.from_config(config.codec_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.backbone_model.embed_tokens

    def set_input_embeddings(self, value):
        self.backbone_model.embed_tokens = value

    def _tie_weights(self):
        if self.config.tie_codebooks_embeddings:
            self._tie_or_clone_weights(
                self.backbone_model.embed_tokens.embed_audio_tokens,
                self.depth_decoder.model.embed_tokens,
            )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        if kwargs.get("output_loading_info", False):
            model, loading_info = super().from_pretrained(*args, **kwargs)
        else:
            model = super().from_pretrained(*args, **kwargs)

        # move the depth decoder generation attributes (stored with a `depth_decoder_` prefix in
        # the model generation config) onto the depth decoder generation config
        prefix = "depth_decoder_"
        prefix_len = len(prefix)
        depth_decoder_attrs = {
            attr[prefix_len:]: value
            for attr, value in vars(model.generation_config).items()
            if attr.startswith(prefix)
        }
        vars(model.depth_decoder.generation_config).update({"_from_model_config": False, **depth_decoder_attrs})
        for attr in depth_decoder_attrs:
            delattr(model.generation_config, prefix + attr)

        if "output_loading_info" in kwargs:
            return model, loading_info
        return model

    def save_pretrained(self, *args, **kwargs):
        # store the depth decoder generation attributes in the model generation config with a
        # `depth_decoder_` prefix so that they are saved alongside the backbone attributes
        prefix = "depth_decoder_"
        depth_decoder_attrs = self.depth_decoder.generation_config.to_diff_dict()
        depth_decoder_attrs.pop("transformers_version", None)
        for attr, value in depth_decoder_attrs.items():
            setattr(self.generation_config, prefix + attr, value)

        super().save_pretrained(*args, **kwargs)

    def _merge_input_ids_with_input_values(
        self,
        input_ids: Optional[torch.Tensor] = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
    ):
        r"""
Merges the input_ids and input_values to produce a single inputs_embeds tensor:
1 - Infers the codec model on the input_values to retrieve codebook tokens.
2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
3 - If labels are provided, expands them to match codebook dimensions and places the target codebook tokens in the expanded labels tensor.

Args:
    input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
        The input ids to embed.
    input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
        The audio input values to embed.
    input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
        The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        """
        inputs_embeds = self.embed_text_tokens(input_ids)

        if input_values is not None:
            # infer input_values_mask from the cutoffs
            input_values_cutoffs = nn.functional.pad(input_values_cutoffs, (1, 0))
            audio_lengths = input_values_cutoffs[input_values_cutoffs >= 0].diff()
            audio_lengths = audio_lengths[audio_lengths > 0]
            input_values_mask = torch.arange(input_values_cutoffs.max(), device=input_values.device).expand(
                len(audio_lengths), -1
            )
            input_values_mask = input_values_mask < audio_lengths.unsqueeze(1)

            with torch.no_grad():
                audio_tokens_list = []
                for batch_input_values, batch_input_values_cutoffs in zip(input_values, input_values_cutoffs):
                    batch_input_values_cutoffs = batch_input_values_cutoffs[batch_input_values_cutoffs >= 0]
                    for i in range(batch_input_values_cutoffs.shape[0] - 1):
                        start_idx = batch_input_values_cutoffs[i]
                        end_idx = batch_input_values_cutoffs[i + 1]
                        audio_batch = batch_input_values[..., start_idx:end_idx]
                        codec_outputs = self.codec_model.encode(audio_batch.unsqueeze(0))
                        codebook_ids = codec_outputs.audio_codes.transpose(1, -1)
                        audio_tokens_list.append(codebook_ids[0])

                max_audio_frames = max(el.shape[0] for el in audio_tokens_list)
                batched_audio_token_ids = torch.stack(
                    [nn.functional.pad(el, (0, 0, 0, max_audio_frames - el.shape[0])) for el in audio_tokens_list]
                )
                audio_codes_mask = self.codec_model.get_audio_codes_mask(input_values_mask)

            audio_token_id = self.config.audio_token_id
            audio_token_mask = input_ids == audio_token_id

            audio_embeds = self.backbone_model.embed_tokens(batched_audio_token_ids)
            inputs_embeds[audio_token_mask] = audio_embeds[audio_codes_mask]

            # same for the audio eos token
            audio_eos_frame_ids = (
                torch.ones((1, 1, self.config.num_codebooks), device=input_ids.device, dtype=torch.long)
                * self.config.codebook_eos_token_id
            )
            audio_eos_embeds = self.backbone_model.embed_tokens(audio_eos_frame_ids).squeeze(1)

            audio_eos_token_mask = input_ids == self.config.audio_eos_token_id
            inputs_embeds[audio_eos_token_mask] = audio_eos_embeds.repeat(audio_eos_token_mask.sum(), 1)

            # if labels are provided, expand them to match codebook dimensions
            if labels is not None:
                labels_expanded = labels.unsqueeze(-1).repeat(1, 1, self.config.num_codebooks)
                labels_expanded[audio_token_mask] = batched_audio_token_ids[audio_codes_mask]
                labels_expanded[audio_eos_token_mask] = audio_eos_frame_ids
                # frames labeled -101 are used by the backbone model only: mask them out for the depth decoder
                depth_decoder_ignore_frames_idxs = (labels == -101).nonzero(as_tuple=True)
                labels_expanded[depth_decoder_ignore_frames_idxs[0], depth_decoder_ignore_frames_idxs[1], 1:] = -100
                labels = labels_expanded

        return {"inputs_embeds": inputs_embeds, "labels": labels}

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        # merge text tokens and audio codebook tokens on the first forward pass, when input_ids
        # is still of shape (batch_size, sequence_length)
        if input_ids is not None and input_ids.ndim == 2 and model_inputs.get("inputs_embeds") is None:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids=input_ids,
                input_values=kwargs.get("input_values"),
                input_values_cutoffs=kwargs.get("input_values_cutoffs"),
                labels=kwargs.get("labels"),
            )
            model_inputs.update(
                {"inputs_embeds": merged_inputs["inputs_embeds"], "labels": merged_inputs["labels"], "input_ids": None}
            )

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CsmOutputWithPast]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
    1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
    requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

    2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
    Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
    If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
    where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
    the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
    Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
    - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
    - `-100` will be ignored in the loss computation
    - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

    Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
logits_to_keep (`int` or `torch.Tensor`, *optional*):
    Kept for compatibility. Does not support values other than:
    1. `0`, which is equivalent to keeping all logits, used in the training regime
    2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

Example:

```python
>>> import torch
>>> from transformers import CsmForConditionalGeneration, AutoProcessor
>>> from datasets import load_dataset, Audio

>>> model_id = "sesame/csm-1b"
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

>>> processor = AutoProcessor.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
>>> # ensure the audio is 24kHz
>>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

>>> conversation = []
>>> # prepare a conversation with text and corresponding audio
>>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
...     conversation.append(
...         {
...             "role": f"{speaker_id}",
...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
...         }
...     )

>>> inputs = processor.apply_chat_template(
...     conversation,
...     tokenize=True,
...     return_dict=True,
...     output_labels=True,
... ).to(torch_device)

>>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
>>> output = model(**inputs)
>>> output.loss.backward()
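
>>> # Inference sketch (hedged: `output_audio=True` and `processor.save_audio` come from the
>>> # Csm generation and processor utilities; exact arguments may evolve across versions):
>>> gen_inputs = processor.apply_chat_template(conversation, tokenize=True, return_dict=True).to(torch_device)
>>> audio = model.generate(**gen_inputs, output_audio=True)
>>> processor.save_audio(audio, "output.wav")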
```
        """
        if input_ids is not None and input_ids.ndim == 2:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids, input_values, input_values_cutoffs, labels
            )
            inputs_embeds = merged_inputs["inputs_embeds"]
            labels = merged_inputs["labels"]
            input_ids = None

        backbone_outputs = self.backbone_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        backbone_hidden_states = backbone_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        backbone_logits = self.lm_head(backbone_hidden_states[:, slice_indices, :])

        loss = None
        backbone_loss = None
        depth_decoder_loss = None
        depth_decoder_outputs = None
        if labels is not None:
            # the backbone model is trained on the first codebook only
            backbone_labels = labels[:, :, 0]
            backbone_loss = self.loss_function(
                logits=backbone_logits, labels=backbone_labels, vocab_size=self.config.vocab_size, **kwargs
            )

            # the depth decoder is trained on the frames whose codebook labels are not all -100
            train_mask = ~(labels[:, :, 1:] == -100).all(dim=-1)
            depth_decoder_input_ids = labels[train_mask][..., : self.config.num_codebooks - 1]
            # add a placeholder in position 0 that will be replaced by the backbone_last_hidden_state
            depth_decoder_input_ids = nn.functional.pad(depth_decoder_input_ids, (1, 0), value=0)

            train_idxs = train_mask.nonzero(as_tuple=True)
            backbone_last_hidden_states = backbone_hidden_states[train_idxs[0], train_idxs[1] - 1, :]
            depth_decoder_labels = labels[train_mask]

            depth_decoder_outputs = self.depth_decoder(
                input_ids=depth_decoder_input_ids,
                backbone_last_hidden_state=backbone_last_hidden_states,
                use_cache=use_cache,
                return_dict=True,
                labels=depth_decoder_labels,
                **kwargs,
            )

            depth_decoder_loss = depth_decoder_outputs.loss
            loss = backbone_loss + depth_decoder_loss

        return CsmOutputWithPast(
            loss=loss,
            backbone_loss=backbone_loss,
            depth_decoder_loss=depth_decoder_loss,
            logits=backbone_logits,
            past_key_values=backbone_outputs.past_key_values,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
            depth_decoder_logits=depth_decoder_outputs.logits if depth_decoder_outputs is not None else None,
            depth_decoder_past_key_values=depth_decoder_outputs.past_key_values
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_hidden_states=depth_decoder_outputs.hidden_states
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_attentions=depth_decoder_outputs.attentions if depth_decoder_outputs is not None else None,
        )


__all__ = [
    "CsmPreTrainedModel",
    "CsmBackboneModel",
    "CsmDepthDecoderModel",
    "CsmDepthDecoderForCausalLM",
    "CsmForConditionalGeneration",
]