
    <h                        S SK JrJrJr  S SKrS SKrS SKJr  S SK	J
r
Jr  SSKJr  SSKJrJrJr  SSKJr  SSKJr  SS	KJrJr  SS
KJr  SSKJr  SSKJrJrJ r J!r!J"r"  SSK#J$r$J%r%  SSK&J'r'J(r(  SSK)J*r*  SSK+J,r,J-r-J.r.  SSK/J0r0   " S S\Rb                  5      r2 " S S\Rb                  5      r3S\Rh                  S\5S\Rh                  4S jr6 SDS\Rb                  S\Rh                  S\Rh                  S\Rh                  S\\Rh                     S \7S!\7S"\*\,   4S# jjr8S$ r9SES% jr: " S& S'\Rb                  5      r; " S( S)\Rb                  5      r< " S* S+\5      r= " S, S-\5      r>\- " S. S/\(5      5       r? " S0 S1\?5      r@\- " S2 S3\?5      5       rA  SFS4\B\5\54   S5\7S6\5S\\R                     S7\5S\R                  4S8 jjrE\- " S9 S:\?5      5       rFS;\Rh                  S<\5S=\54S> jrG\-" S?S@9 " SA SB\?\5      5       rH/ SCQrIg)G    )CallableOptionalUnionN)OutputRecordercheck_model_inputs   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)create_causal_mask)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tuple   )MoonshineConfigc                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineEncoderMLP2   c                 
  > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  5      U l	        [
        R                  " UR                  UR                  5      U l
        g Nsuper__init__configr	   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr*   
hidden_act	__class__s      h/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/moonshine/modeling_moonshine.pyr)   MoonshineEncoderMLP.__init__3   s\    #J/99V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r&   )r0   r+   r1   )r3   r9   s     r6   forwardMoonshineEncoderMLP.forward:   s4    /**=9/r8   r+   r*   r0   r1   
__name__
__module____qualname____firstlineno__r)   torchTensorr<   __static_attributes____classcell__r5   s   @r6   r#   r#   2   s)    KU\\ ell  r8   r#   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )MoonshineDecoderMLPA   c                   > [         TU ]  5         Xl        [        U   U l        [
        R                  " UR                  UR                  S-  5      U l	        [
        R                  " UR                  UR                  5      U l
        g )N   r'   r2   s      r6   r)   MoonshineDecoderMLP.__init__B   sa    #J/99V//1I1IA1MN99V55v7I7IJr8   r9   r:   c                     U R                  U5      nUR                  SSS9u  pU R                  U5      U-  nU R                  U5      nU$ )NrM   dim)r0   chunkr+   r1   )r3   r9   gates      r6   r<   MoonshineDecoderMLP.forwardI   sQ    /+11!1<**40=@/r8   r>   r?   rH   s   @r6   rJ   rJ   A   s)    KU\\ ell  r8   rJ   r9   n_repr:   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r    N)shapeexpandreshape)r9   rV   batchnum_key_value_headsslenhead_dims         r6   	repeat_kvr_   Q   s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr8   modulequerykeyvalueattention_maskscalingdropoutkwargsc                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )NrM   r   rP   )rR   dtype)ptrainingr    )r_   num_key_value_groupsrD   matmul	transposerX   r,   
functionalsoftmaxfloat32torj   rf   rl   
contiguous)r`   ra   rb   rc   rd   re   rf   rg   
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r6   eager_attention_forwardrz   ]   s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r8   c                 x    U SSSS24   nU SSSS24   n[         R                  " U* U4SS9R                  S5      $ )	z*Rotates half the hidden dims of the input..r   NrM   r    rP   rQ   ri   )rD   stackflatten)xx1x2s      r6   rotate_halfr   w   sJ    	
319B	
319B;;Ryb)11"55r8   c                    UR                  U5      nUR                  U5      nUSSUR                  S   S-  24   R                  SSS9nUSSUR                  S   S-  24   R                  SSS9nUR                  S   nU SSU24   U SUS24   pUSSU24   USUS24   pXr-  [        U5      U-  -   nX-  [        U	5      U-  -   n[        R
                  " X/SS9n[        R
                  " X/SS9nX4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
.NrP   rM   rQ   )	unsqueezerX   repeat_interleaver   rD   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r6   apply_rotary_pos_embr   ~   s6   ( --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6c;J;&'3
+;)<6 {{51C78G{{51C78G ii)r2Gii)r2Gr8   c                   |  ^  \ rS rSrSrS\S\S\S\S\4
U 4S jjr     SS	\	R                  S
\\\	R                  \	R                  4      S\\	R                     S\\   S\\	R                     S\\	R                     S\\   S\\	R                  \\	R                     \\\	R                        4   4S jjrSrU =r$ )MoonshineAttention   z=Multi-headed attention from 'Attention Is All You Need' paperr*   	layer_idx	is_causalnum_attention_headsr\   c                   > [         TU ]  5         UR                  XES.5        Xl        X l        [        USUR                  UR                  -  5      U l        UR                  UR                  -  U l
        U R                  S-  U l        UR                  U l        X0l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  UR                  U R                  -  UR                   S9U l        [        R                  " UR                  U R                  -  UR                  SS9U l        U R                  R*                  bA  U R                  R*                  nX`R                  U-   S-
  U-  -  nXpR                  -
  U l        g SU l        g )N)r   r\   r^   g      ࿩biasFr    r   )r(   r)   updater*   r   getattrr.   r   r^   r\   rm   re   attention_dropoutr   r,   r-   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)	r3   r*   r   r   r   r\   target_multipletarget_head_dimr5   s	           r6   r)   MoonshineAttention.__init__   s    	.Ano"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9"ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JFL^L^ejk ;;22>"kkEEO---/2QTU2UZi1ijO$3mm$CD!$%D!r8   r9   position_embeddingsrd   past_key_valuecache_positionkey_value_statesrg   r:   c                    UR                   S S u  pU R                  U5      R                  XU R                  R                  U R
                  5      R                  SS5      n
US LnUb^  UR                  R                  U R                  5      nU(       a&  SUR                  U R                  '   UR                  nOUR                  nUb  UOUnU(       aU  U(       aN  W(       aG  UR                  U R                     R                  nUR                  U R                     R                  nOU R                  U5      R                  USU R                  R                  U R
                  5      R                  SS5      nU R!                  U5      R                  USU R                  R                  U R
                  5      R                  SS5      nU(       a$  Ub!  UR#                  XU R                  SU05      u  pU(       d<  Uu  nn[%        XUU5      u  pUb%  UUUS.nUR#                  XU R                  U5      u  p[&        nU R                  R(                  S:w  a  [*        U R                  R(                     nU R,                  =(       a    US L =(       a    U	S:  nU R.                  S:  a  [0        R2                  R4                  R7                  U
SU R.                  45      n
[0        R2                  R4                  R7                  USU R.                  45      n[0        R2                  R4                  R7                  USU R.                  45      nU" U U
UUU4U R8                  (       d  S	OU R:                  U R<                  US
.UD6u  nnU R.                  S:  a  USS U R.                  * 24   nUR?                  XS5      RA                  5       nU RC                  U5      nUU4$ )NrP   r    rM   Tr   )r   r   r   eagerr           )rf   re   r   .)"rX   r   viewr*   r\   r^   ro   
is_updatedgetr   cross_attention_cacheself_attention_cachelayerskeysvaluesr   r   r   r   rz   _attn_implementationr   r   r   rD   r,   rp   padrl   r   re   rZ   rt   r   )r3   r9   r   rd   r   r   r   rg   bszq_lenquery_statesis_cross_attentionr   current_statesru   rv   r   r   cache_kwargsattention_interfacer   ry   rw   s                          r6   r<   MoonshineAttention.forward   sq    #(("-
 KK&++C8W8WY]YfYfgqqrsuvw 	 .T9%'2266t~~FJ!<@))$..9!/!E!E!/!D!D .>-I)}.Z'..t~~>CCJ)00@GGL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "n&@+9+@+@dnn?OQ_>`,(
 "*HC';LVY[^'_$L)'*3.Y+9+@+@dnnl,(
 )@;;++w6"9$++:Z:Z"[NNK~'=K%!)	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#b9DDFkk+.L((r8   )r   r*   r^   r   r   r   r   rm   r   r   re   r   )NNNNN)r@   rA   rB   rC   __doc__r!   intboolr)   rD   rE   r   tupler
   
LongTensorr   r   r<   rF   rG   rH   s   @r6   r   r      s   G#&#& #& 	#&
 !#& !#&P LP15*.5937U)||U) &eELL%,,,F&GHU) !.	U)
 !U) !!1!12U) #5<<0U) -.U) 
u||Xell3XeELL>Q5RR	SU) U)r8   r   c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )MoonshineRotaryEmbeddingi(  r*   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r(   r)   hasattr
isinstancer   dictr   r   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr*   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r3   r*   devicer   r5   s       r6   r)   !MoonshineRotaryEmbedding.__init__)  s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r8   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rP   r    mpscpuF)device_typeenabledrM   rQ   rj   )r   floatrY   rX   rs   r   r   r   strrD   autocastro   r   r   r   r   rj   )
r3   r~   r   inv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r6   r<    MoonshineRotaryEmbedding.forward:  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)r   r*   r   r   r   r   r   r&   )r@   rA   rB   rC   r!   r)   rD   no_gradr   r<   rF   rG   rH   s   @r6   r   r   (  s6    / / /" ]]_<  <r8   r   c                   8  ^  \ rS rSrS\S\4U 4S jjr      SS\R                  S\	\R                     S\	\R                     S\	\   S	\	\   S
\	\R                     S\	\\R                  \R                  4      S\\   S\\R                     4S jjrSrU =r$ )MoonshineEncoderLayeriJ  r*   r   c                 T  > [         TU ]  5         UR                  U l        [        UUSUR                  UR
                  S9U l        [        XR                  5      U l	        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        g )NFr*   r   r   r   r\   r   )r(   r)   r.   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr#   encoder_hidden_actmlpr,   	LayerNorminput_layernormpost_attention_layernormr3   r*   r   r5   s      r6   r)   MoonshineEncoderLayer.__init__K  s    !--+ & B B & B B
 'v/H/HI!||F,>,>UK(*V5G5Ge(T%r8   r9   rd   r   r   	use_cacher   r   rg   r:   c                     Un	U R                  U5      nU R                  " SUUUUUUUS.UD6u  pX-   nUn	U R                  U5      nU R                  U5      nX-   nU$ )Nr9   rd   r   r   r   r   r    )r   r   r   r   )r3   r9   rd   r   r   r   r   r   rg   residual_s              r6   r<   MoonshineEncoderLayer.forward[  s     !,,];>> 	
')%)) 3	
 	
 !0 !55mD/ 0r8   )r.   r   r   r   r   )NNNFNN)r@   rA   rB   rC   r!   r   r)   rD   rE   r   r   r
   r   r   r   r   r<   rF   rG   rH   s   @r6   r   r   J  s    U U3 U& 2637*.$)59KO|| !. u//0	
 ! D> !!1!12 &eELL%,,,F&GH +, 
u||	 r8   r   c                      ^  \ rS rSrSS\S\\   4U 4S jjjr          SS\R                  S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\   S\\   S\\R                     S\\\R                  \R                  4      S\\\R                  \R                  4      S\\   S\\R                   \\\R                   \R                   4      4   4S jjrSrU =r$ )MoonshineDecoderLayeri}  r*   r   c                   > [         TU ]  5         UR                  U l        [        UUSUR                  UR
                  S9U l        [        UUSUR                  UR
                  S9U l        [        XR                  5      U l
        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        [        R                  " UR                  SS9U l        g )NTr   Fr   )r(   r)   r.   r   decoder_num_attention_headsdecoder_num_key_value_headsr   encoder_attnrJ   decoder_hidden_actr   r,   r   r   r   final_layernormr   s      r6   r)   MoonshineDecoderLayer.__init__~  s    !--+ & B B & B B
 / & B B & B B
 'v/H/HI!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr8   r9   rd   encoder_hidden_statesencoder_attention_maskr   encoder_position_idsr   r   r   r   encoder_position_embeddingsrg   r:   c                    UnU R                  U5      nU R                  " SUUUUUU	U
S.UD6u  pX-   nUb,  UnU R                  U5      nU R                  UUUUUS9u  pX-   nUnU R	                  U5      nU R                  U5      nX-   nU$ )Nr   )r9   r   rd   r   r   r   )r   r   r   r  r  r   )r3   r9   rd   r
  r  r   r  r   r   r   r   r  rg   r   r   s                  r6   r<   MoonshineDecoderLayer.forward  s     !,,];>> 	
')%)) 3	
 	
 !0 ,$H 99-HM#00+!65-#  1  M %4M ,,];/ 0r8   )r  r  r.   r   r   r   r   r&   )
NNNNNNFNNN)r@   rA   rB   rC   r!   r   r   r)   rD   rE   r   r
   r   r   r   r   FloatTensorr<   rF   rG   rH   s   @r6   r  r  }  st   L L8C= L L6 268<9=37;?*.$)59KOSW.||. !..  (5	.
 !) 6. u//0. 'u'7'78. !. D>. !!1!12. &eELL%,,,F&GH. &.eELL%,,4N.O%P. +,. 
u  (51B1BEDUDU1U+V"WW	X. .r8   r  c                   b    \ rS rSr% \\S'   SrSrSrSS/r	Sr
SrSrS\R                  4S	 jrS
rg)MoonshinePreTrainedModeli  r*   modelinput_valuesTr   r  input_lengthsc                 ~    [        US-
  S-  S-   5      n[        US-
  S-  S-   5      n[        US-
  S-  S-   5      nU$ )z8
Computes the output length of the convolutional layers
   @   r       r   rM   )r   )r3   r  output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r6    _get_feat_extract_output_lengths9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r8   r   N)r@   rA   rB   rC   r!   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_supports_sdpa_can_compile_fullgraphrD   r   r  rF   r   r8   r6   r  r    sH    $O&*#02IJN!#e>N>N #r8   r  c            
          ^  \ rS rSrSrSr\\S.rS\	4U 4S jjr
S\R                  4S jrS	\R                  4S
 jr\ SS\R"                  S\\R&                     S\\   S\4S jj5       rSrU =r$ )MoonshineEncoderi  z
Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

Args:
    config: MoonshineConfig
r  )
attentionsr9   r*   c           	      J  > [         TU ]  U5        Xl        UR                  n[        R
                  " SUSSSS9U l        [        R
                  " USU-  SSS	9U l        [        R
                  " SU-  USSS	9U l        [        R                  " SUS
S9U l
        [        US9U l        [        R                  " [        UR                  5       Vs/ sH  n[!        X5      PM     sn5      U l        [        R$                  " USS9U l        SU l        U R+                  5         g s  snf )Nr    r  r  F)kernel_sizestrider   rM   r  r   )r+  r,  gh㈵>)
num_groupsnum_channelsepsr*   r   )r(   r)   r*   r.   r,   Conv1dconv1conv2conv3	GroupNorm	groupnormr   
rotary_emb
ModuleListrangeencoder_num_hidden_layersr   r   r   
layer_normgradient_checkpointing	post_init)r3   r*   	embed_dimidxr5   s       r6   r)   MoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTU2&Amm;@AaAa;bc;bC"6/;bc
 ,,yu=&+#	 ds   D r:   c                     U R                   $ r&   r2  r3   s    r6   get_input_embeddings%MoonshineEncoder.get_input_embeddings  s    zzr8   rc   c                     Xl         g r&   rB  r3   rc   s     r6   set_input_embeddings%MoonshineEncoder.set_input_embeddings  s    
r8   rd   rg   c                    UR                  S5      n[        R                  R                  U R	                  U5      5      nU R                  U5      n[        R                  R                  U R                  U5      5      n[        R                  R                  U R                  U5      5      nUR                  SSS5      nUb  U R                  UR                  S   5      nSnUSSSU24   SSU24   nU R                  R                  S:X  a  US	:H  R                  5       (       a  UOSnOEU R                  R                  S
:X  a  [        X$R                   5      nO[#        X$R                   5      n[$        R&                  " SUR                  S   UR(                  S9R                  S5      nU R+                  XG5      nU R,                   H  n	U	" U4UUUS.UD6nM     U R/                  U5      n[1        US9$ )a  
Args:
    input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
        Float values of the raw speech waveform. Raw speech waveform can be
        obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
        `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
        the soundfile library (`pip install soundfile`). To prepare the array into
        `input_values`, the [`AutoFeatureExtractor`] should be used for padding
        and conversion into a tensor of type `torch.FloatTensor`.
    attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
        Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
        [What are attention masks?](../glossary#attention-mask)
r    r   rM   NrP     .flash_attention_2r   sdpar   )rd   r   r   )last_hidden_state)r   r,   rp   tanhr2  r6  gelur3  r4  permuter  rX   r*   r   anyr   rj   r   rD   aranger   r7  r   r;  r   )
r3   r  rd   rg   r9   mask_lendownsample_strider   r   encoder_layers
             r6   r<   MoonshineEncoder.forward  s   , $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN{{//3FF4Bc4I3N3N3P3PVZ11V;!D^UhUh!i!;NL_L_!`||A}':':1'=mFZFZ[eefgh"oomJ![[M)-)$7	
 M ) 6&+
 	
r8   )	r*   r2  r3  r4  r<  r6  r;  r   r7  r&   )r@   rA   rB   rC   r   r!  r   r   _can_record_outputsr!   r)   r,   ModulerD  rH  r   rD   r  r   rE   r   r   r   r<   rF   rG   rH   s   @r6   r(  r(    s     %O(.
 $bii "))   268
''8
 !.8
 +,	8

 
!8
 8
r8   r(  c                     ^  \ rS rSrSr\" \SSS9\\" \SSS9S.rS\	4U 4S	 jjr
\         SS\\R                     S
\\R                     S\\R                     S\\   S\\R"                     S\\   S\\R                     S\\R"                     S\\R                     S\\   S\\\4   4S jj5       rSrU =r$ )MoonshineDecoderiA  	input_idsr    r   )index
layer_namer  )r)  r9   cross_attentionsr*   c           	        > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " UR                  UR                  U R                  5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l        [
        R                  " UR                  SS9U l        [!        US9U l        SU l        U R'                  5         g s  snf )NFr   r0  )r(   r)   pad_token_idpadding_idx
vocab_sizer,   	Embeddingr.   embed_tokensr8  r9  decoder_num_hidden_layersr  r   r   normr   r7  r<  r=  )r3   r*   r?  r5   s      r6   r)   MoonshineDecoder.__init__J  s     !.. ++LL):):F<N<NPTP`P`amm;@AaAa;bc;bC"6/;bc
 LL!3!3%@	2&A&+# 	 ds   C?rd   r   past_key_valuesinputs_embedsr   r   r
  r  rg   r:   c
                    USL USL-  (       a  [        S5      eUc  U R                  U5      nU(       a"  Uc  [        5       n[        5       n[        X5      nUcD  Ub  UR	                  5       OSn[
        R                  " XUR                  S   -   UR                  S9nUc  UR                  S5      n[        U R                  UUUUUS9nUnU R                  X5      nU	b  UR                  S   nSnU	S	SSU24   S	SU24   n	U R                  R                  S
:X  a  U	S:H  R                  5       (       a  U	OSn	OaU R                  R                  S:X  a$  [        XR                   UR                  S   5      n	O#[#        XR                   UR                  S   5      n	U R$                   H  nU" UUU4U	UUUUUS.U
D6nM     U R'                  U5      n[)        UU(       a  US9$ SS9$ )a\  
encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
    Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
    of the decoder.
encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
    - 1 for tokens that are **not masked**,
    - 0 for tokens that are **masked**.
    [What are attention masks?](../glossary#attention-mask)
Nz:You must specify exactly one of input_ids or inputs_embedsr   r    rN  )r*   input_embedsrd   r   rj  r   ri   rK  .rL  r   rM  )r  r   r   r   r   r   )rO  rj  )
ValueErrorrf  r   r   get_seq_lengthrD   rT  rX   r   r   r   r*   r7  r   rS  r   rj   r   r   rh  r   )r3   r]  rd   r   rj  rk  r   r   r
  r  rg   r   r   past_seen_tokensrx   r9   r   rU  rV  decoder_layers                       r6   r<   MoonshineDecoder.forwardZ  s7   0 -t";<YZZ  --i8M0#/> $0N!12F^O!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L(;;&))+%
 &"oomJ!-,2226H *%;CATCTAT<T%UVY[d\d[dVd%e"{{//3FFDZ^aDaCfCfChCh)?nr&11V;)L*,?,?ATATUWAX*& *D*,?,?ATATUWAX*& "[[M)% (>).#-$7 M ) 		-08+/8O
 	
>B
 	
r8   )rf  r<  r   rh  rc  r7  rd  )	NNNNNNNNN)r@   rA   rB   rC   r!  r   r   r  rY  r!   r)   r   r   rD   r   rE   r
   r  r   r   r   r   r   r   r<   rF   rG   rH   s   @r6   r\  r\  A  sF   !O$%7q[Y.*+=QSab    151537+/59$(59=A9=Y
E,,-Y
 !.Y
 u//0	Y

 "%Y
   1 12Y
 D>Y
 !!1!12Y
  ((9(9:Y
 !) 6Y
 +,Y
 
u--	.Y
 Y
r8   r\  rX   	mask_probmask_length	min_masksc           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ sH  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r    z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr    r   )r   max)input_lengthnum_masked_spanepsilonrt  rs  ru  sequence_lengths     r6   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr8   NrP   r   r   F)replace)rn  nprandomranditemdetachsumtolistr9  zerosr   choicerT  lenconcatenateonesint32appendarraybroadcast_torZ   ry  put_along_axis)rX   rs  rt  rd   ru  
batch_sizer~  r   r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrz  r{  spec_aug_mask_idxdummy_mask_idxoffsetsr|  r}  s    `` `            @@r6   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I/c                     ^  \ rS rSrS\4U 4S jjrS rS rS rS r	S r
 SS	\R                  S
\\R                     4S jjr\\          SS\\R                     S
\\R                     S\\R                     S\\R                     S\\\\R                           S\\\\\R                     4      S\\\R                        S\\\R                        S\\   S\\R                     S\\   S\4S jj5       5       rSrU =r$ )MoonshineModeli.  r*   c                    > [         TU ]  U5        [        U5      U l        [	        U5      U l        U R                  5         g r&   )r(   r)   r(  encoderr\  decoderr=  r3   r*   r5   s     r6   r)   MoonshineModel.__init__0  s2     '/'/r8   c                 .    U R                   R                  $ r&   r  rf  rC  s    r6   rD  #MoonshineModel.get_input_embeddings8  s    ||(((r8   c                 $    XR                   l        g r&   r  rG  s     r6   rH  #MoonshineModel.set_input_embeddings;  s    $)!r8   c                     U R                   $ r&   )r  rC  s    r6   get_encoderMoonshineModel.get_encoder>      ||r8   c                     U R                   $ r&   )r  rC  s    r6   get_decoderMoonshineModel.get_decoderA  r  r8   c                 8    U R                   R                  5         g)z
Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
not be updated during training.
N)r  _freeze_parametersrC  s    r6   freeze_encoderMoonshineModel.freeze_encoderD  s    
 	'')r8   input_featuresrd   c                 2   [        U R                  SS5      (       d  U$ UR                  5       u  p4nU R                  R                  S:  a  U R                  (       a  [        X54U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " XaR                  [        R                  S9nUSS2S4   R                  SUS5      nSX'   U R                  R                  S:  a  U R                  (       az  [        X44U R                  R                  U R                  R                  U R                  R                  S9n[        R                  " XqR                  [        R                  S9nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTr   )rs  rt  rd   ru  )r   rj   NrP   )rs  rt  ru  )r   r*   sizemask_time_probrl   r  mask_time_lengthmask_time_min_masksrD   tensorr   r   rY   mask_feature_probmask_feature_lengthmask_feature_min_masks)r3   r  rd   r  r.   r}  mask_time_indicesmask_feature_indicess           r6   _mask_input_features#MoonshineModel._mask_input_featuresK  sN    t{{$8$??!! 4B3F3F3H0
;;%%)dmm 5-++44 KK88-++99! !&->G\G\didndn o 1!T' : A A"kSU V01N-;;((1,#8)++77 KK;;++<<	$  $)<<0DMbMbjojtjt#u 34N0r8   r  decoder_input_idsdecoder_attention_maskencoder_outputsrj  decoder_inputs_embedsdecoder_position_idsr   r   rg   r:   c                 >   Uc  U R                   " U4SU0UD6nU R                  " SUUUUR                  UUUU	U
S.	UD6n[        UR                  UR                  UR
                  UR                  UR                  UR                  UR
                  UR                  S9$ )a9  
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
    Float values of the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
    `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
    the soundfile library (`pip install soundfile`). To prepare the array into
    `input_values`, the [`AutoFeatureExtractor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`.
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

Example:

```python
>>> import torch
>>> from transformers import AutoFeatureExtractor, MoonshineModel
>>> from datasets import load_dataset

>>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
>>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
>>> input_values = inputs.input_values
>>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
>>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
>>> list(last_hidden_state.shape)
[1, 2, 288]
```
rd   )	r]  rd   r  r
  rj  rk  r   r   r   )rO  rj  decoder_hidden_statesdecoder_attentionsr`  encoder_last_hidden_stater
  encoder_attentionsr   )r  r  rO  r   rj  r9   r)  r`  )r3   r  rd   r  r  r  rj  r  r  r   r   rg   decoder_outputss                r6   r<   MoonshineModel.forwardv  s    \ "/3||L/rYg/rkq/rOEI\\ F
'1#1"1"C"C+/-)F
 F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r8   )r  r  r&   )
NNNNNNNNNN)r@   rA   rB   rC   r!   r)   rD  rH  r  r  r  rD   r  r   r   r  r   r   r   r   r   r   r   r   r   r<   rF   rG   rH   s   @r6   r  r  .  s    )** 6:)))) !!1!12)V  59598<=AEIZ^DHBF$(59E
u001E
 !!1!12E
 $E$4$45	E

 !))9)9 :E
 "%e.?.?(@"ABE
 "%(;U5CTCT=U(U"VWE
  (e.?.?(@AE
 'uU-=-='>?E
 D>E
 !!1!12E
 +,E
 
E
  E
r8   r  r]  rb  decoder_start_token_idc                     U R                  U R                  5      nU SS2SS24   R                  5       USS2SS24'   X#SS2S4'   Uc  [        S5      eUR	                  US:H  U5        U$ )z)
Shift input ids one token to the right.
NrP   r    r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosrX   clonern  masked_fill_)r]  rb  r  shifted_input_idss       r6   shift_tokens_rightr    sz     "++IOO<(CRC0668ae4adLMM""#4#<lKr8   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                     ^  \ rS rSrS/rS\4U 4S jjrS rS rS r	S r
S	\R                  4S
 jr\\           SS\\R$                     S\\R&                     S\\R&                     S\\R&                     S\\\\R$                           S\\\\\R$                     4      S\\\R$                        S\\\R&                        S\\   S\\R&                     S\\R&                     S\\   S	\4S jj5       5       rSrU =r$ )!MoonshineForConditionalGenerationi  zproj_out.weightr*   c                    > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  UR                  SS9U l        U R                  5         g )NFr   )
r(   r)   r  r  r,   r-   r.   rd  proj_outr=  r  s     r6   r)   *MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r8   c                 6    U R                   R                  5       $ r&   )r  r  rC  s    r6   r  -MoonshineForConditionalGeneration.get_encoder      zz%%''r8   c                 6    U R                   R                  5       $ r&   )r  r  rC  s    r6   r  -MoonshineForConditionalGeneration.get_decoder  r  r8   c                     U R                   $ r&   r  rC  s    r6   get_output_embeddings7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r8   c                     Xl         g r&   r  )r3   new_embeddingss     r6   set_output_embeddings7MoonshineForConditionalGeneration.set_output_embeddings  s    &r8   r:   c                 6    U R                   R                  5       $ r&   )r  rD  rC  s    r6   rD  6MoonshineForConditionalGeneration.get_input_embeddings  s    zz..00r8   r  rd   r  r  r  rj  r  r  r   r   labelsrg   c                    Ub:  Uc7  Uc4  [        XR                  R                  U R                  R                  5      nU R                  " U4UUUUUUUU	U
S.	UD6nU R                  UR                  5      nSnUb$  U R                  XU R                  R                  S9n[        UUUR                  UR                  UR                  UR                  UR                  UR                  UR                   S9	$ )ag  
input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
    Float values of the raw speech waveform. Raw speech waveform can be
    obtained by loading a `.flac` or `.wav` audio file into an array of type `list[float]`, a
    `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec libary (`pip install torchcodec`) or
    the soundfile library (`pip install soundfile`). To prepare the array into
    `input_values`, the [`AutoFeatureExtractor`] should be used for padding
    and conversion into a tensor of type `torch.FloatTensor`.
decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
    Indices of positions of each input sequence tokens in the position embeddings.
    Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`

Example:

```python
>>> import torch
>>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
>>> from datasets import load_dataset

>>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
>>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

>>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
>>> input_values = inputs.input_values

>>> generated_ids = model.generate(input_values, max_new_tokens=100)

>>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
>>> transcription
'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
```N)	rd   r  r  r  rj  r  r  r   r   )logitsr  rd  )	lossr  rj  r  r  r`  r  r
  r  )r  r*   rb  r  r  r  rO  loss_functionrd  r   rj  r  r  r`  r  r
  r  )r3   r  rd   r  r  r  rj  r  r  r   r   r  rg   outputsr  r  s                   r6   r<   )MoonshineForConditionalGeneration.forward  s
   f  (-B-J$6KK44dkk6X6X%! '+jj'
)/+#9+"7!5)'
 '
 w889%%Vt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r8   )r  r  )NNNNNNNNNNN)r@   rA   rB   rC   _tied_weights_keysr!   r)   r  r  r  r  r,   rZ  rD  r   r   r   rD   r  r   r   r   r   r   r   r   r   r<   rF   rG   rH   s   @r6   r  r    s    ,, (('1bii 1  59598<=AEIZ^DHBF$(59-1T
u001T
 !!1!12T
 $E$4$45	T

 !))9)9 :T
 "%e.?.?(@"ABT
 "%(;U5CTCT=U(U"VWT
  (e.?.?(@AT
 'uU-=-='>?T
 D>T
 !!1!12T
 ))*T
 +,T
 
T
  T
r8   r  )r  r  r  )r   )Nr    )Nr   )Jtypingr   r   r   numpyr  rD   torch.nnr,   transformers.utils.genericr   r   activationsr	   cache_utilsr
   r   r   
generationr   masking_utilsr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   configuration_moonshiner!   rZ  r#   rJ   rE   r   r_   r   rz   r   r   r   r   r   r  r  r(  r\  r   r   ndarrayr  r  r  r  __all__r   r8   r6   <module>r     sz  * - ,    I ! C C ) / g B 9  L F & I I 4")) "))  	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%46'T}) })@<ryy <D06 0fG6 GT # # #._
/ _
D r
/ r
 r
r 26tc?tt t U--.	t
 t ZZtn N
- N
 N
b%,, c [^   
p
(@/ p

p
f ^r8   