
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.nn as nn

from transformers.utils.generic import check_model_inputs

from ...cache_utils import Cache, DynamicCache
from ...generation import GenerationMixin
from ...masking_utils import create_causal_mask
from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...processing_utils import Unpack
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
from ..auto import AutoModel
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaMLP,
    LlamaModel,
    LlamaRMSNorm,
    LlamaRotaryEmbedding,
    TransformersKwargs,
)
from .configuration_csm import CsmConfig, CsmDepthDecoderConfig
from .generation_csm import CsmGenerationMixin


logger = logging.get_logger(__name__)


@dataclass
@auto_docstring(
    custom_intro="""
    Base class for the model autoregressive outputs.
    """
)
class CsmOutputWithPast(ModelOutput):
    r"""
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction).
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

    Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
    `past_key_values` input) to speed up sequential decoding.
depth_decoder_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction) of the depth decoder model.
depth_decoder_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
    Prediction scores of the depth decoder (scores for each vocabulary token before SoftMax).
depth_decoder_past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
    Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
    `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
depth_decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
    Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
    one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

    Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
depth_decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
    Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
    sequence_length)`.
backbone_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
    Language modeling loss (for next-token prediction) of the backbone model.
Nlosslogitspast_key_values.hidden_states
attentionsdepth_decoder_lossdepth_decoder_logitsdepth_decoder_past_key_valuesdepth_decoder_hidden_statesdepth_decoder_attentionsbackbone_loss )__name__
__module____qualname____firstlineno____doc__r&   r   torchFloatTensor__annotations__r'   r(   tupler)   r*   r+   r,   r-   r.   r/   r0   __static_attributes__r1       [/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/csm/modular_csm.pyr$   r$   1   s'   < )-D(5$$
%, $FE$AEOXeE%*;*;$<=>E=AM8E%"3"3S"89:A:>Ju00#567>6:!2!23:.2%++2OS!8E%8I8I2J,K#LSKO%0A0A30F*G!HOHLhuU->->-C'DEL15M8E--.5r<   r$   c                       \ rS rSrSrg)
CsmRMSNormd   r1   Nr2   r3   r4   r5   r;   r1   r<   r=   r?   r?   d       r<   r?   c                       \ rS rSrSrg)CsmRotaryEmbeddingh   r1   NrA   r1   r<   r=   rD   rD   h   rB   r<   rD   c                       \ rS rSrSrg)CsmMLPl   r1   NrA   r1   r<   r=   rG   rG   l   rB   r<   rG   c                       \ rS rSrSrg)CsmAttentionp   r1   NrA   r1   r<   r=   rJ   rJ   p   rB   r<   rJ   c                       \ rS rSrSrg)CsmDecoderLayert   r1   NrA   r1   r<   r=   rM   rM   t   rB   r<   rM   z[
    The bare Csm Model outputting raw hidden-states without any specific head on top.
    c                   b   ^  \ rS rSr% \\S'   SrSrS/rS/r	Sr
SrSrSr\\S.rU 4S jrS	rU =r$ )
CsmPreTrainedModelx   configmodelTrM   r(   )r)   r*   c                   > [         TU ]  U5        [        U[        5      (       a]  UR                  n[        US-
  5       H>  nUR                  R                  U   R                  SU R                  R                  S9  M@     g g )Nr   g        )meanstd)super_init_weights
isinstanceCsmCodebooksHeadnum_codebooksrangeweightdatanormal_rR   initializer_range)selfmoduler[   i	__class__s       r=   rX    CsmPreTrainedModel._init_weights   sn    f%f.//"00M=1,-""1%--3DKK<Y<Y-Z . 0r<   r1   )r2   r3   r4   r5   r   r9   base_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraph_supports_attention_backendrM   rJ   _can_record_outputsrX   r;   __classcell__rd   s   @r=   rP   rP   x   s\     &*#*+#4"5N ""&("
[ [r<   rP   c                   @  ^  \ rS rSr% \\S'   U 4S jr\\        SS\	R                  S\\	R                     S\\	R                     S\\	R                     S\\   S	\\	R                     S
\\   S\\	R                     S\\   S\\\4   4S jj5       5       rSrU =r$ )CsmDepthDecoderModel   rR   c                    > [         TU ]  U5        [        R                  " UR                  UR
                  -  UR                  5      U l        [        R                  " UR                  UR                  SS9U l
        g NF)bias)rW   __init__nn	Embeddingr[   
vocab_sizebackbone_hidden_sizeembed_tokensLinearhidden_sizeinputs_embeds_projectorra   rR   rd   s     r=   rw   CsmDepthDecoderModel.__init__   s]     LL&*>*>ARAR*RU[UpUpq')yy1L1LfN`N`gl'm$r<   	input_idsbackbone_last_hidden_stateattention_maskposition_idsr(   inputs_embeds	use_cachecache_positionkwargsreturnc	                 (   Ub:  [         R                  R                  5       (       d  [        R	                  S5        SnUSL USL-  (       a  [        S5      eU(       a  Uc
  [        5       nUci  Ub  UR                  5       OSn
Ub  UR                  S   OUR                  S   nUb  UR                  OUR                  n[         R                  " XU-   US9nUc  [         R                  " US-
  SS9nXR                  -  nU R                  X-   5      nUS   S:H  nUb	  X&SS2S4'   O?[         R                  R                  5       (       d  U(       a  [        R                  S5        U R                  U5      n[!        U R"                  UUUUUS	9nUnUR%                  S5      nU R'                  UU5      nU R(                  SU R"                  R*                    H  nU" U4UUUUUUS
.U	D6nM     U R-                  U5      n[/        UU(       a  US9$ SS9$ )a*  
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
        """
        if position_ids is not None:
            if not torch.compiler.is_compiling():
                logger.warning_once(
                    "Custom `position_ids` were provided but will be ignored. CSM depth decoder automatically "
                    "determines position_ids from `cache_position` and as it requires them to be identical across "
                    "the batch, the provided position_ids will be ignored."
                )
            position_ids = None

        if (input_ids is None) ^ (inputs_embeds is not None):
            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

        if use_cache and past_key_values is None:
            past_key_values = DynamicCache()

        if cache_position is None:
            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
            inputs_seq_length = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
            device = input_ids.device if input_ids is not None else inputs_embeds.device
            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_seq_length, device=device)

        if inputs_embeds is None:
            # each codebook position indexes its own slice of the flattened embedding table
            codebook_idxs = torch.clamp(cache_position - 1, min=0)
            offset = codebook_idxs * self.vocab_size
            inputs_embeds = self.embed_tokens(input_ids + offset)

            input_ids_are_first_codebook = cache_position[0] == 0
            if backbone_last_hidden_state is not None:
                inputs_embeds[:, 0] = backbone_last_hidden_state
            elif not torch.compiler.is_compiling() and input_ids_are_first_codebook:
                logger.warning(
                    "When the first codebook token is provided, `backbone_last_hidden_state` should also be "
                    "provided for correct inference."
                )

        inputs_embeds = self.inputs_embeds_projector(inputs_embeds)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
        )

        hidden_states = inputs_embeds

        # the depth decoder requires position_ids to be identical across the batch, hence cache_position is used
        position_ids = cache_position.unsqueeze(0)
        position_embeddings = self.rotary_emb(hidden_states, position_ids)

        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values if use_cache else None,
        )


class CsmCodebooksHead(nn.Module):
    def __init__(self, hidden_size, num_codebooks, vocab_size):
        super().__init__()
        self.num_codebooks = num_codebooks
        self.weight = nn.Parameter(torch.empty(self.num_codebooks - 1, hidden_size, vocab_size))

    def forward(self, hidden_states, cache_position=None):
        if cache_position is None:
            seq_length = hidden_states.shape[1]
            codebook_weight = self.weight[torch.arange(seq_length)]
        else:
            # position 0 is produced by the backbone model, so position i uses weight[i - 1]
            codebook_idxs = cache_position - 1
            codebook_weight = self.weight[codebook_idxs]

        hidden_states = [
            nn.functional.linear(hidden_states[:, codebook_idx, :], codebook_weight[codebook_idx].T)
            for codebook_idx in range(codebook_weight.shape[0])
        ]
        hidden_states = torch.stack(hidden_states, dim=1)

        return hidden_states


@auto_docstring(
    custom_intro="""
    The CsmDepthDecoder Model transformer, with a [`CsmCodebooksHead`] on top,
    which can be seen as a position-specific language modeling head, allowing the use of a different linear layer for each codebook
    (e.g. position 0 is the first codebook and uses the first codebook head, etc.)
    """
)
class CsmDepthDecoderForCausalLM(LlamaForCausalLM, GenerationMixin):
    _tied_weights_keys = None
    _tp_plan = None
    _pp_plan = None

    def __init__(self, config):
        super().__init__(config)
        del self.lm_head
        self.codebooks_head = CsmCodebooksHead(config.hidden_size, config.num_codebooks, config.vocab_size)
        self.model = CsmDepthDecoderModel(config)

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values, attention_mask, inputs_embeds, cache_position, **kwargs
        )

        # `backbone_last_hidden_state` is only used on the first generation step, when the first
        # codebook token (generated by the backbone model) is fed to the depth decoder
        is_first_generation_step = model_inputs["cache_position"][0] == 0
        if not is_first_generation_step:
            model_inputs.pop("backbone_last_hidden_state")

        # the depth decoder infers position_ids from cache_position
        model_inputs.pop("position_ids")

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        backbone_last_hidden_state: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CausalLMOutputWithPast]:
        r"""
backbone_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, backbone_hidden_size)`, *optional*):
    The last hidden state of the backbone model. Such input is required when the first codebook token (the one generated by the backbone model)
    is provided in the `input_ids` argument.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
    config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
    (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        """
        outputs = self.model(
            input_ids=input_ids,
            backbone_last_hidden_state=backbone_last_hidden_state,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        hidden_states = outputs[0]

        if isinstance(logits_to_keep, int):
            if logits_to_keep == 0:
                # skip index 0 logits: they correspond to the position fed with the backbone last hidden state
                slice_indices = slice(1, None)
            else:
                slice_indices = slice(-logits_to_keep, None)
        else:
            slice_indices = logits_to_keep

        logits = self.codebooks_head(
            hidden_states[:, slice_indices, :], cache_position[slice_indices] if cache_position is not None else None
        )
        logits = logits.contiguous()

        loss = None
        if labels is not None:
            shift_labels = labels[..., 1:].contiguous()
            loss = self.loss_function(
                logits=logits, labels=None, vocab_size=self.config.vocab_size, shift_labels=shift_labels, **kwargs
            )

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class CsmBackboneModelEmbeddings(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_audio_tokens = nn.Embedding((config.num_codebooks * config.vocab_size), config.hidden_size)
        self.register_buffer(
            "audio_tokens_offsets", torch.arange(config.num_codebooks) * config.vocab_size, persistent=False
        )

    def forward(self, input_ids):
        input_embeds = self.embed_audio_tokens(input_ids + self.audio_tokens_offsets)
        input_embeds = input_embeds.sum(dim=2)
        return input_embeds


class CsmBackboneModel(LlamaModel):
    def __init__(self, config):
        super().__init__(config)
        self.embed_tokens = CsmBackboneModelEmbeddings(config)

    @check_model_inputs
    @auto_docstring
    def forward(self, **super_kwargs):
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
    1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
    requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

    2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
        """
        return super().forward(**super_kwargs)
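

# The two sketches below are illustrative only and are not part of the modeling code. With made-up
# sizes, they demonstrate two mechanisms implemented above and below: the per-position linear heads
# of `CsmCodebooksHead`, and the right-shifted depth decoder inputs that
# `CsmForConditionalGeneration.forward` builds during training.
def _codebooks_head_sketch():  # pragma: no cover - documentation-only
    head = CsmCodebooksHead(hidden_size=8, num_codebooks=3, vocab_size=11)
    hidden_states = torch.randn(2, 2, 8)  # (batch_size, positions, hidden_size)
    # position i uses head.weight[i - 1], since position 0 is produced by the backbone model
    logits = head(hidden_states, cache_position=torch.tensor([1, 2]))
    assert logits.shape == (2, 2, 11)  # one vocab distribution per position, each from its own head


def _depth_decoder_inputs_sketch():  # pragma: no cover - documentation-only
    num_codebooks = 4  # made up; real checkpoints use more codebooks
    frame_labels = torch.tensor([[5, 6, 7, 8]])  # codebook targets for one audio frame
    # drop the last codebook and pad position 0 with a placeholder, which the depth decoder
    # replaces with the backbone's last hidden state
    depth_decoder_input_ids = nn.functional.pad(frame_labels[..., : num_codebooks - 1], (1, 0), value=0)
    assert depth_decoder_input_ids.tolist() == [[0, 5, 6, 7]]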


@auto_docstring(
    custom_intro="""
    The Csm model consists of two llama-like auto-regressive transformer models: a backbone model that predicts the first codebook token and a depth decoder that predicts the other codebook tokens.
    """
)
class CsmForConditionalGeneration(CsmPreTrainedModel, CsmGenerationMixin):
    _tied_weights_keys = [
        "backbone_model.embed_tokens.embed_audio_tokens.weight",
        "depth_decoder.model.embed_tokens.weight",
    ]

    def __init__(self, config):
        super().__init__(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.embed_text_tokens = nn.Embedding(config.text_vocab_size, config.hidden_size)
        self.backbone_model = CsmBackboneModel._from_config(config)
        self.depth_decoder = CsmDepthDecoderForCausalLM._from_config(config.depth_decoder_config)
        self.codec_model = AutoModel.from_config(config.codec_config)
        self.post_init()

    def get_input_embeddings(self):
        return self.backbone_model.embed_tokens

    def set_input_embeddings(self, value):
        self.backbone_model.embed_tokens = value

    def _tie_weights(self):
        if self.config.tie_codebooks_embeddings:
            self._tie_or_clone_weights(
                self.backbone_model.embed_tokens.embed_audio_tokens,
                self.depth_decoder.model.embed_tokens,
            )

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        if kwargs.get("output_loading_info", False):
            model, loading_info = super().from_pretrained(*args, **kwargs)
        else:
            model = super().from_pretrained(*args, **kwargs)

        # move the depth decoder generation attributes (stored with a `depth_decoder_` prefix in
        # the model generation config) onto the depth decoder generation config
        prefix = "depth_decoder_"
        prefix_len = len(prefix)
        depth_decoder_attrs = {
            attr[prefix_len:]: value
            for attr, value in vars(model.generation_config).items()
            if attr.startswith(prefix)
        }
        vars(model.depth_decoder.generation_config).update({"_from_model_config": False, **depth_decoder_attrs})
        for attr in depth_decoder_attrs:
            delattr(model.generation_config, prefix + attr)

        if "output_loading_info" in kwargs:
            return model, loading_info
        return model

    def save_pretrained(self, *args, **kwargs):
        # store the depth decoder generation attributes in the model generation config with a
        # `depth_decoder_` prefix so that they are saved alongside the backbone attributes
        prefix = "depth_decoder_"
        depth_decoder_attrs = self.depth_decoder.generation_config.to_diff_dict()
        depth_decoder_attrs.pop("transformers_version", None)
        for attr, value in depth_decoder_attrs.items():
            setattr(self.generation_config, prefix + attr, value)

        super().save_pretrained(*args, **kwargs)

    def _merge_input_ids_with_input_values(
        self,
        input_ids: Optional[torch.Tensor] = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
    ):
        r"""
Merges the input_ids and input_values to produce a single inputs_embeds tensor:
1 - Infers the codec model on the input_values to retrieve codebook tokens.
2 - Embeds codebook tokens and places them at the correct positions in the inputs_embeds tensor.
3 - If labels are provided, expands them to match codebook dimensions and places the target codebook tokens in the expanded labels tensor.

Args:
    input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
        The input ids to embed.
    input_values (`torch.Tensor` of shape `(batch_size, channels, audio_sequence_length)`):
        The audio input values to embed.
    input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`):
        The cutoffs of the audio input values relative to its batch index, padded with -1 when no audio.
        """
        inputs_embeds = self.embed_text_tokens(input_ids)

        if input_values is not None:
            # infer input_values_mask from the cutoffs
            input_values_cutoffs = nn.functional.pad(input_values_cutoffs, (1, 0))
            audio_lengths = input_values_cutoffs[input_values_cutoffs >= 0].diff()
            audio_lengths = audio_lengths[audio_lengths > 0]
            input_values_mask = torch.arange(input_values_cutoffs.max(), device=input_values.device).expand(
                len(audio_lengths), -1
            )
            input_values_mask = input_values_mask < audio_lengths.unsqueeze(1)

            with torch.no_grad():
                audio_tokens_list = []
                for batch_input_values, batch_input_values_cutoffs in zip(input_values, input_values_cutoffs):
                    batch_input_values_cutoffs = batch_input_values_cutoffs[batch_input_values_cutoffs >= 0]
                    for i in range(batch_input_values_cutoffs.shape[0] - 1):
                        start_idx = batch_input_values_cutoffs[i]
                        end_idx = batch_input_values_cutoffs[i + 1]
                        audio_batch = batch_input_values[..., start_idx:end_idx]
                        codec_outputs = self.codec_model.encode(audio_batch.unsqueeze(0))
                        codebook_ids = codec_outputs.audio_codes.transpose(1, -1)
                        audio_tokens_list.append(codebook_ids[0])

                max_audio_frames = max(el.shape[0] for el in audio_tokens_list)
                batched_audio_token_ids = torch.stack(
                    [nn.functional.pad(el, (0, 0, 0, max_audio_frames - el.shape[0])) for el in audio_tokens_list]
                )
                audio_codes_mask = self.codec_model.get_audio_codes_mask(input_values_mask)

            audio_token_id = self.config.audio_token_id
            audio_token_mask = input_ids == audio_token_id

            audio_embeds = self.backbone_model.embed_tokens(batched_audio_token_ids)
            inputs_embeds[audio_token_mask] = audio_embeds[audio_codes_mask]

            # same for the audio eos token
            audio_eos_frame_ids = (
                torch.ones((1, 1, self.config.num_codebooks), device=input_ids.device, dtype=torch.long)
                * self.config.codebook_eos_token_id
            )
            audio_eos_embeds = self.backbone_model.embed_tokens(audio_eos_frame_ids).squeeze(1)

            audio_eos_token_mask = input_ids == self.config.audio_eos_token_id
            inputs_embeds[audio_eos_token_mask] = audio_eos_embeds.repeat(audio_eos_token_mask.sum(), 1)

            # if labels are provided, expand them to match codebook dimensions
            if labels is not None:
                labels_expanded = labels.unsqueeze(-1).repeat(1, 1, self.config.num_codebooks)
                labels_expanded[audio_token_mask] = batched_audio_token_ids[audio_codes_mask]
                labels_expanded[audio_eos_token_mask] = audio_eos_frame_ids
                # frames labeled -101 are used by the backbone model only: mask them out for the depth decoder
                depth_decoder_ignore_frames_idxs = (labels == -101).nonzero(as_tuple=True)
                labels_expanded[depth_decoder_ignore_frames_idxs[0], depth_decoder_ignore_frames_idxs[1], 1:] = -100
                labels = labels_expanded

        return {"inputs_embeds": inputs_embeds, "labels": labels}

    def prepare_inputs_for_generation(
        self,
        input_ids: torch.LongTensor,
        past_key_values: Optional[Cache] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ):
        model_inputs = super().prepare_inputs_for_generation(
            input_ids=input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            **kwargs,
        )

        # merge text tokens and audio codebook tokens on the first forward pass, when input_ids
        # is still of shape (batch_size, sequence_length)
        if input_ids is not None and input_ids.ndim == 2 and model_inputs.get("inputs_embeds") is None:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids=input_ids,
                input_values=kwargs.get("input_values"),
                input_values_cutoffs=kwargs.get("input_values_cutoffs"),
                labels=kwargs.get("labels"),
            )
            model_inputs.update(
                {"inputs_embeds": merged_inputs["inputs_embeds"], "labels": merged_inputs["labels"], "input_ids": None}
            )

        return model_inputs

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        input_values: Optional[torch.Tensor] = None,
        input_values_cutoffs: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[tuple, CsmOutputWithPast]:
        r"""
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks) or (batch_size, sequence_length)`):
    1. (batch_size, sequence_length): corresponds to the input sequence prepared with the processor from the text prompt. Such input
    requires `input_values` to be provided so that audio can be encoded in codebook tokens and then merged with the text tokens.

    2. (batch_size, sequence_length, num_codebooks): codebook tokens generated during the autoregressive decoding. Such input is not meant to be used by end users.

    Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
    [`PreTrainedTokenizer.__call__`] for details.

    [What are input IDs?](../glossary#input-ids)
input_values_cutoffs (`torch.Tensor` of shape `(batch_size, max_num_audio)`, *optional*):
    Specify the end positions of audio segments within each batch entry, relative to the concatenated audio input.
    If a batch entry has fewer segments than the maximum, it is padded with -1. For example, in a batch of 2 sequences
    where the first contains 2 audio segments of length l1, and the second contains 1 audio segment of length l2,
    the input_values_cutoffs would be: [[l1, 2 * l1], [l2, -1]].
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the masked language modeling loss. Indices should be in `[config.audio_token_id, -100, -101]`.
    Requires targeted `input_values` to be provided as audio tokens will be inferred from it using the `codec_model`.
    - `config.audio_token_id` indicates an audio frame (considering sequence length elements as frames)
    - `-100` will be ignored in the loss computation
    - `-101` indicates the audio frame will be used only for the backbone model (using the first codebook token as labels)

    Such labels can be prepared using `output_labels=True` when calling [`CsmProcessor`].
logits_to_keep (`int` or `torch.Tensor`, *optional*):
    Kept for compatibility. Does not support values other than:
    1. `0`, which is equivalent to keeping all logits, used in the training regime
    2. `1`, which is equivalent to keeping only the last logit, used in the generation regime

Example:

```python
>>> import torch
>>> from transformers import CsmForConditionalGeneration, AutoProcessor
>>> from datasets import load_dataset, Audio

>>> model_id = "sesame/csm-1b"
>>> torch_device = "cuda" if torch.cuda.is_available() else "cpu"

>>> processor = AutoProcessor.from_pretrained(model_id)

>>> ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
>>> # ensure the audio is 24kHz
>>> ds = ds.cast_column("audio", Audio(sampling_rate=24000))

>>> conversation = []
>>> # prepare a conversation with text and corresponding audio
>>> for text, audio, speaker_id in zip(ds[:4]["text"], ds[:4]["audio"], ds[:4]["speaker_id"]):
...     conversation.append(
...         {
...             "role": f"{speaker_id}",
...             "content": [{"type": "text", "text": text}, {"type": "audio", "path": audio["array"]}],
...         }
...     )

>>> inputs = processor.apply_chat_template(
...     conversation,
...     tokenize=True,
...     return_dict=True,
...     output_labels=True,
... ).to(torch_device)

>>> model = CsmForConditionalGeneration.from_pretrained(model_id, device_map=torch_device)
>>> output = model(**inputs)
>>> output.loss.backward()
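
>>> # Inference sketch (hedged: `output_audio=True` and `processor.save_audio` come from the
>>> # Csm generation and processor utilities; exact arguments may evolve across versions):
>>> gen_inputs = processor.apply_chat_template(conversation, tokenize=True, return_dict=True).to(torch_device)
>>> audio = model.generate(**gen_inputs, output_audio=True)
>>> processor.save_audio(audio, "output.wav")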
```
        """
        if input_ids is not None and input_ids.ndim == 2:
            merged_inputs = self._merge_input_ids_with_input_values(
                input_ids, input_values, input_values_cutoffs, labels
            )
            inputs_embeds = merged_inputs["inputs_embeds"]
            labels = merged_inputs["labels"]
            input_ids = None

        backbone_outputs = self.backbone_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        backbone_hidden_states = backbone_outputs[0]
        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
        backbone_logits = self.lm_head(backbone_hidden_states[:, slice_indices, :])

        loss = None
        backbone_loss = None
        depth_decoder_loss = None
        depth_decoder_outputs = None
        if labels is not None:
            # the backbone model is trained on the first codebook only
            backbone_labels = labels[:, :, 0]
            backbone_loss = self.loss_function(
                logits=backbone_logits, labels=backbone_labels, vocab_size=self.config.vocab_size, **kwargs
            )

            # the depth decoder is trained on the frames whose codebook labels are not all -100
            train_mask = ~(labels[:, :, 1:] == -100).all(dim=-1)
            depth_decoder_input_ids = labels[train_mask][..., : self.config.num_codebooks - 1]
            # add a placeholder in position 0 that will be replaced by the backbone_last_hidden_state
            depth_decoder_input_ids = nn.functional.pad(depth_decoder_input_ids, (1, 0), value=0)

            train_idxs = train_mask.nonzero(as_tuple=True)
            backbone_last_hidden_states = backbone_hidden_states[train_idxs[0], train_idxs[1] - 1, :]
            depth_decoder_labels = labels[train_mask]

            depth_decoder_outputs = self.depth_decoder(
                input_ids=depth_decoder_input_ids,
                backbone_last_hidden_state=backbone_last_hidden_states,
                use_cache=use_cache,
                return_dict=True,
                labels=depth_decoder_labels,
                **kwargs,
            )

            depth_decoder_loss = depth_decoder_outputs.loss
            loss = backbone_loss + depth_decoder_loss

        return CsmOutputWithPast(
            loss=loss,
            backbone_loss=backbone_loss,
            depth_decoder_loss=depth_decoder_loss,
            logits=backbone_logits,
            past_key_values=backbone_outputs.past_key_values,
            hidden_states=backbone_outputs.hidden_states,
            attentions=backbone_outputs.attentions,
            depth_decoder_logits=depth_decoder_outputs.logits if depth_decoder_outputs is not None else None,
            depth_decoder_past_key_values=depth_decoder_outputs.past_key_values
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_hidden_states=depth_decoder_outputs.hidden_states
            if depth_decoder_outputs is not None
            else None,
            depth_decoder_attentions=depth_decoder_outputs.attentions if depth_decoder_outputs is not None else None,
        )


__all__ = [
    "CsmPreTrainedModel",
    "CsmBackboneModel",
    "CsmDepthDecoderModel",
    "CsmDepthDecoderForCausalLM",
    "CsmForConditionalGeneration",
]