
    <hQ                     6   S SK r S SKrS SKJrJrJr  S SKrS SKrS SKJ	r	  S SK
Jr  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJr  SSKJrJr  SSKJr  SSKJ r J!r!  SSK"J#r#  \!RH                  " \%5      r& " S S\5      r' " S S\5      r( " S S\5      r) " S S\	RT                  5      r+ " S S\	RT                  5      r, " S S\	RT                  5      r- " S S\	RT                  5      r.   SBS\	RT                  S \R^                  S!\R^                  S"\R^                  S#\\R^                     S$\\0   S%\0S&\\R^                     4S' jjr1 " S( S)\	RT                  5      r2 " S* S+\	RT                  5      r3 " S, S-\5      r4 " S. S/\	RT                  5      r5\  " S0 S1\5      5       r6  SCS2\7\8\84   S3\0S4\8S#\\Rr                     S5\8S6\Rt                  4S7 jjr;\  " S8 S9\65      5       r<Sr=\ " S:S;9 " S< S=\65      5       r>\ " S>S;9 " S? S@\65      5       r?/ SAQr@g)D    N)CallableOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputSequenceClassifierOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging   )	SEWConfigc                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SEWNoLayerNormConvLayer.   c                 b  > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        g )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr	   feat_extract_activation
activationselfconfiglayer_id	__class__s      \/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/sew/modeling_sew.pyr!    SEWNoLayerNormConvLayer.__init__/   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@    c                 J    U R                  U5      nU R                  U5      nU$ N)r)   r+   r-   hidden_statess     r1   forwardSEWNoLayerNormConvLayer.forward=   s$    		-06r3   )r+   r)   r#   r$   r   __name__
__module____qualname____firstlineno__r!   r8   __static_attributes____classcell__r0   s   @r1   r   r   .   s    A r3   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SEWLayerNormConvLayerC   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r   r   T)elementwise_affine)r    r!   r"   r#   r$   r   r%   r&   r'   r(   r)   	LayerNorm
layer_normr	   r*   r+   r,   s      r1   r!   SEWLayerNormConvLayer.__init__D   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r3   c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )N)r)   	transposerI   r+   r6   s     r1   r8   SEWLayerNormConvLayer.forwardS   sV    		-0%//B76%//B76r3   r+   r)   r#   rI   r$   r:   r;   rB   s   @r1   rD   rD   C   s    A r3   rD   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )SEWGroupNormConvLayer^   c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   r   r   T)
num_groupsnum_channelsaffine)r    r!   r"   r#   r$   r   r%   r&   r'   r(   r)   r	   r*   r+   	GroupNormrI   r,   s      r1   r!   SEWGroupNormConvLayer.__init___   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr3   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r5   )r)   rI   r+   r6   s     r1   r8   SEWGroupNormConvLayer.forwardo   s2    		-066r3   rP   r:   r;   rB   s   @r1   rR   rR   ^   s    r  r3   rR   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SEWPositionalConvEmbeddingv   c           	        > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  UR                  S9U l        [        R                  R                  n[        [        R                  R                  S5      (       a$  [        R                  R                  R                  n[        5       (       Ga%  SS KnUR                  R!                  U R                  R"                  SS9   U" U R                  SSS9U l        S S S 5        [        U R                  S5      (       aU  U R                  R                  R"                  R$                  nU R                  R                  R"                  R&                  nO,U R                  R(                  nU R                  R*                  nUR                  R-                  X5        UR                  R-                  X5        OU" U R                  SSS9U l        [/        UR
                  5      U l        [2        UR4                     U l        g ! , (       d  f       GN,= f)	N   )r   paddinggroupsr   weight_normr   modifier_rankweight)namedimparametrizations)r    r!   r   r%   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupssqueeze_factorr)   utilsrc   hasattrri   r
   	deepspeedzeroGatheredParametersrf   	original0	original1weight_gweight_vregister_external_parameterSEWSamePadLayerra   r	   r*   r+   )r-   r.   rc   rp   ru   rv   r0   s         r1   r!   #SEWPositionalConvEmbedding.__init__w   s   II6622a777((
	 hh**288,,m<<((33??K%''224993C3CST2U'		aH	 Vtyy"4559955<<FF9955<<FF99--99--NN66tFNN66tF#DIIH!DDI&v'E'EF !?!?@ VUs   I
I"c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r5   )r)   ra   r+   r6   s     r1   r8   "SEWPositionalConvEmbedding.forward   s2    		-0]36r3   )r+   r)   ra   r;   rB   s   @r1   r]   r]   v   s     AD r3   r]   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )rx      c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g )Nr`   r   r   )r    r!   num_pad_remove)r-   rk   r0   s     r1   r!   SEWSamePadLayer.__init__   s)    #:Q#>!#Car3   c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   r   r6   s     r1   r8   SEWSamePadLayer.forward   s6    ")!Q0F43F3F2F0F*FGMr3   r   r;   rB   s   @r1   rx   rx      s    K r3   rx   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SEWUpsampling   c                    > [         TU ]  5         [        R                  " UR                  UR                  UR
                  -  5      U l        [        UR                     U l	        UR
                  U l        g r5   )
r    r!   r   Linearrj   rm   
projectionr	   r*   r+   r-   r.   r0   s     r1   r!   SEWUpsampling.__init__   sW    ))F$6$68J8JVMbMb8bc !?!?@$33r3   c                 &   U R                  U5      nU R                  U5      nU R                  S:  a^  UR                  5       u  p#nX0R                  -  nX@R                  -  nUR	                  X#U R                  U5      nUR	                  X%U5      nU$ )Nr   )r   r+   rm   sizereshape)r-   r7   bszsrc_lensrc_embed_dimtgt_lentgt_embed_dims          r1   r8   SEWUpsampling.forward   s    66"*7*<*<*>'C- 3 33G)-@-@@M)11#@S@SUbcM)11#NMr3   )r+   r   rm   r;   rB   s   @r1   r   r      s    4 r3   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )SEWFeatureEncoder   z.Construct the features from raw audio waveformc           	        > [         TU ]  5         UR                  S:X  a?  [        USS9/[	        UR
                  S-
  5       Vs/ sH  n[        XS-   S9PM     sn-   nOUUR                  S:X  a,  [	        UR
                  5       Vs/ sH  n[        XS9PM     nnO[        SUR                   S35      e[        R                  " U5      U l        SU l        S	U l        g s  snf s  snf )
Ngroupr   )r/   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r    r!   feat_extract_normrR   rangenum_feat_extract_layersr   rD   
ValueErrorr   
ModuleListconv_layersgradient_checkpointing_requires_grad)r-   r.   ir   r0   s       r1   r!   SEWFeatureEncoder.__init__   s    ##w.0!DEINvOmOmpqOqIrIIrA'Q?IrI K %%0NSTZTrTrNstNs0DNsKtK01I1I0JJst  ==5&+#"I us   CC#c                 N    U R                  5        H
  nSUl        M     SU l        g NF)
parametersrequires_gradr   r-   params     r1   _freeze_parameters$SEWFeatureEncoder._freeze_parameters   s#    __&E"'E '#r3   c                     US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H  nU" U5      nM     U$ )NT)r   trainingr   r   )r-   input_valuesr7   
conv_layers       r1   r8   SEWFeatureEncoder.forward   sK    $QW- 4==*.M'**J&}5M + r3   )r   r   r   )
r<   r=   r>   r?   __doc__r!   r   r8   r@   rA   rB   s   @r1   r   r      s    8#"$

 
r3   r   modulequerykeyvalueattention_maskscalingdropout	head_maskc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  n	Ub  X-   n	[        R
                  R                  U	SS9n	Ub  XR                  SSSS5      -  n	[        R
                  R                  XU R                  S9n	[        R                  " X5      n
U
R                  SS5      R                  5       n
X4$ )NrM         r`   r   rh   r   )pr   )r   torchmatmulrN   r   
functionalsoftmaxviewr   r   
contiguous)r   r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs              r1   eager_attention_forwardr      s     **R.D(<<}}Q':;gEL!#4==((2(>L#nnQAq&AA==((6??([L,,|3K''1-88:K$$r3   c                   Z  ^  \ rS rSrSr     SS\S\S\S\S\S\S	\\	   4U 4S
 jjjr
    SS\R                  S\\R                     S\\R                     S\\R                     S\\   S\\   S\\R                  \\R                     \\\R                        4   4S jjrSrU =r$ )SEWAttentioni  z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr   
is_decoderr   	is_causalr.   c                   > [         TU ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   )r   )r    r!   r   r   r   head_dimr.   r   r   r   r   r   r   k_projv_projq_projout_proj)	r-   r   r   r   r   r   r   r.   r0   s	           r1   r!   SEWAttention.__init__	  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr3   r7   key_value_statesr   layer_head_maskoutput_attentionsr   returnc                     USLnUR                   SS u  pU(       a  UR                   S   OU	n
XSU R                  4nXSU R                  4nU R                  U5      R                  " U6 R	                  SS5      nU(       a  UOUnU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UUUU4U R                  (       d  SOU R                  U R                  UUS.UD6u  nnUR                  XS5      R                  5       nU R!                  U5      nUUS4$ )z#Input shape: Batch x Time x ChannelNrM   r   r`   eager        )r   r   r   r   )shaper   r   r   rN   r   r   r   r.   _attn_implementationr   r   r   r   r   r   r   )r-   r7   r   r   r   r   r   is_cross_attentionr   r   r   q_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer   r   s                       r1   r8   SEWAttention.forward(  s    .T9 %**3B//A"((+wr4==9DMM: {{=166FPPQRTUV-?)][[055~FPPQRTUV
{{>277HRRSTVWX(?;;++w6"9$++:Z:Z"[$7%
  $}}C$,,LL/%%
 %
!\ "))#;FFHmmK0L$..r3   )r.   r   r   r   r   r   r   r   r   r   r   r   )r   FTFN)NNNF)r<   r=   r>   r?   r   intfloatboolr   r   r!   r   Tensorr   r   tupler8   r@   rA   rB   s   @r1   r   r     s    G  &*CC C 	C
 C C C #C CD 481526,13/||3/ #5<<03/ !.	3/
 "%,,/3/ $D>3/ -.3/ 
u||Xell3XeELL>Q5RR	S3/ 3/r3   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )SEWFeedForwardi^  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                   5      U l        g r5   )r    r!   r   Dropoutactivation_dropoutintermediate_dropoutr   rj   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr	   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     r1   r!   SEWFeedForward.__init___  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''--'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r3   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r5   )r   r   r   r   r   r6   s     r1   r8   SEWFeedForward.forwardl  sX    //>00?11-@))-8++M:r3   )r   r   r   r   r   r;   rB   s   @r1   r   r   ^  s    @ r3   r   c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )SEWEncoderLayeriv  c                   > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        g )NF)r   r   r   r   r.   eps)r    r!   r   rj   num_attention_headsattention_dropout	attentionr   r   r   r   rH   layer_norm_epsrI   r   feed_forwardfinal_layer_normr   s     r1   r!   SEWEncoderLayer.__init__w  s    %((00,,
 zz&"7"78,,v'9'9v?T?TU*62 "V-?-?VEZEZ [r3   c                     UnU R                  XUS9u  pnU R                  U5      nXA-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nU4nU(       a  Xu4-  nU$ )Nr   r   )r  r   rI   r	  r
  )r-   r7   r   r   attn_residualr   _outputss           r1   r8   SEWEncoderLayer.forward  s    %)-L] *8 *
&Q ]3%56%(9(9-(HH--m< "&Gr3   )r  r   r	  r
  rI   r   r;   rB   s   @r1   r  r  v  s    \ r3   r  c                   :   ^  \ rS rSrU 4S jr    SS jrSrU =r$ )
SEWEncoderi  c                   > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  5      U l        [
        R                  " UR                  UR                  S9U l        [
        R                  " UR                  5      U l        [
        R                   " [#        UR$                  5       Vs/ sH  n['        U5      PM     sn5      U l        [+        U5      U l        SU l        g s  snf )Nr  F)r    r!   r.   r]   pos_conv_embedr   	AvgPool1drm   poolrH   rj   r  rI   r   r   r   r   r   num_hidden_layersr  layersr   upsampler   )r-   r.   r  r0   s      r1   r!   SEWEncoder.__init__  s    8@LL!6!68M8MN	,,v'9'9v?T?TUzz&"7"78mmeFLdLdFe$fFe_V%<Fe$fg%f-&+# %gs   Dc           	         U(       a  SOS nU(       a  SOS nUGb  UR                  S5      R                  SSUR                  S   5      nU R                  R                  S:X  a  SX) '   Ub  SU;   a  UOS nGO_SX) '   UR                  5       R                  S5      n	XR                  R                  -  n
UR                  S   U R                  R                  -  n[        R                  " SXR                  S9R                  SS5      R                  U
R                  S   S5      nXR                  SS5      :  R                  5       nS	US S 2S S S S 24   R                  UR                  S
9-
  nU[        R                  " UR                  5      R                   -  nUR                  UR                  S   SUR                  S   UR                  S   5      nUR                  S   nUR#                  SS5      nU R%                  U5      nU R'                  U5      n[!        UR)                  S5      UR)                  S5      5      nUSS U24   USS U24   -   nUR#                  SS5      nU R+                  U5      nU R-                  U5      n[/        5       =(       d    [1        U 5      nU R2                   H  nU(       a  Xa4-   n[        R4                  " / 5      nU R6                  =(       a    UU R                  R8                  :  nU(       a  U(       a  U" XUS9nUS   nU(       a  SnU(       d  M}  UWS   4-   nM     U(       a  Xa4-   nU R;                  U5      nUR                  S   U:  a3  [<        R>                  RA                  USSSXR                  S   -
  45      nU(       d  [C        S XU4 5       5      $ [E        UUUS9$ )N rM   r   r`   flash_attention_2r   r   device      ?dtype.r  NNc              3   ,   #    U H  oc  M  Uv   M     g 7fr5   r  ).0vs     r1   	<genexpr>%SEWEncoder.forward.<locals>.<genexpr>  s     m$[q$[s   	last_hidden_stater7   
attentions)#	unsqueezerepeatr   r.   r   longsumrm   r   aranger   r   expandtor#  finfominrN   r  r  r   rI   r   r
   r   r  randr   	layerdropr  r   r   padr   r   )r-   r7   r   r   output_hidden_statesreturn_dictall_hidden_statesall_self_attentionsexpand_attention_maskinput_lengthsoutput_lengthsmax_encoder_lengthattention_idsn_input_timestepsposition_embeddingspooled_hidden_states
min_lengthsynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                         r1   r8   SEWEncoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!{{//3FF8;454B4NSTXfSfmq 9<45!/!4!4!6 ; ;B ?!.++2L2L!L%2%8%8%;t{{?Y?Y%Y"LL$6?T?TUT!R[VN003R8 
 #02E2Eb!2L"L!R!R!T "%~atQ6F'G'J'JQ^QdQd'J'e!e!/%++m>Q>Q2R2V2V!V!/!6!6"((+Q0D0DR0H.J^J^_aJb" *//2%//15"11-@#yy7,11"57K7P7PQS7TU
,S+:+-=>ATUXZe[eZeUeAff%//156]302R6LT6R[[E#$58H$H! #(**R.!]]Z/BT[[EZEZ/ZN![ %!Te! !.a 0 ,  &9]1=M<O&O#' !*   14D Dm4q!$55MM--maAGX[n[nop[qGq=rsMm]GZ$[mmm++*
 	
r3   )r.   r   r   rI   r  r  r  r  )NFFTr;   rB   s   @r1   r  r    s"    	, "W
 W
r3   r  c                       \ rS rSr% \\S'   SrSrSrSr	Sr
SrS rS\\R                  \4   4S	 jrS
\S\R                  4S jrSrg)SEWPreTrainedModeli   r.   sewr   TFc           
         [        U[        5      (       a  [        R                  R	                  UR
                  R                  SS[        R                  " SUR
                  R                  S   UR
                  R                  -  -  5      -  S9  [        R                  R                  UR
                  R                  S5        GO)[        U[        R                  5      (       a:  UR                  R                  R	                  SU R                  R                   S9  GO[        U[        R"                  [        R$                  45      (       aK  UR                  R                  R'                  5         UR                  R                  R)                  S5        GOV[        U[        R*                  5      (       Ga6  [-        5       (       a  SSKn[1        US5      (       a~  [1        US	5      (       am  UR2                  R5                  UR6                  UR8                  /SS
9   [        R                  R;                  UR                  R                  5        SSS5        OUR2                  R5                  UR                  SS
9   [        R                  R;                  UR                  R                  5        SSS5        O3[        R                  R;                  UR                  R                  5        [        U[        R                  [        R*                  45      (       a3  UR                  b%  UR                  R                  R'                  5         ggg! , (       d  f       Nq= f! , (       d  f       N= f)zInitialize the weightsr   r`   r   )meanstdr   r!  Nrv   ru   rd   )r   r]   r   initnormal_r)   rf   mathsqrtr   in_channels	constant_r   r   datar.   initializer_rangerH   rX   zero_fill_r%   r
   rp   ro   rq   rr   rv   ru   kaiming_normal_)r-   r   rp   s      r1   _init_weights SEWPreTrainedModel._init_weights
  sB   f899GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2		** MM&&CT[[5R5R&Sr|| <==KK""$MM$$S)		**)++ 6:..76:3N3N"::FOOV__;]mn:o//0B0BC po #::6==XY:Z//0B0BC [Z ''(:(:;fryy"))455&++:QKK""$ ;R5 po [Zs   4M 4M$
M!$
M2r>  c                     S n[        U R                  R                  U R                  R                  5       H  u  p4U" XU5      nM     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder   )r   div)input_lengthr   r   s      r1   _conv_out_lengthMSEWPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length/  s      99\7wWZ[[[r3   )zipr.   r&   r'   )r-   r>  rd  r   r   s        r1    _get_feat_extract_output_lengths3SEWPreTrainedModel._get_feat_extract_output_lengths*  sG    
	\
 $'t{{'>'>@W@W#XK,]PM $Y r3   feature_vector_lengthr   c                    U R                  UR                  S5      5      R                  [        R                  5      nUR
                  S   n[        R                  " XA4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )NrM   r   )r#  r   r   r  )rg  r0  r3  r   r/  r   zerosr#  r   r1  flipcumsumr   )r-   ri  r   r?  
batch_sizes        r1   "_get_feature_vector_attention_mask5SEWPreTrainedModel._get_feature_vector_attention_mask9  s    >>~?Q?QRT?UVYYZ_ZdZde#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr3   r  N)r<   r=   r>   r?   r   __annotations__base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr\  r   r   
LongTensorr   rg  ro  r@   r  r3   r1   rL  rL     sg    $O&*#N%@eEDTDTVYDY>Z 
 
]b]m]m 
r3   rL  r   	mask_probmask_length	min_masksr   c           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ sH  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr   r   )r   max)rc  num_masked_spanepsilonrz  ry  r{  sequence_lengths     r1   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_spanl  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr3   NrM   r"  r   F)replace)r   nprandomr6  itemdetachr0  tolistr   rk  r   choicer1  lenconcatenateonesint32appendarraybroadcast_tor   r  put_along_axis)r   ry  rz  r   r{  rn  r  r  r>  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrc  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  r  s    `` `            @@r1   _compute_mask_indicesr  F  s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I/c                   >  ^  \ rS rSrS\4U 4S jjr  SS\R                  S\\R                     S\\R                     4S jjr
\     SS\\R                     S\\R                     S\\R                     S	\\   S
\\   S\\   S\\\4   4S jj5       rSrU =r$ )SEWModeli  r.   c                   > [         TU ]  U5        Xl        [        U5      U l        [
        R                  " UR                  S   UR                  S9U l	        UR                  S   UR                  :g  U l        U R                  (       a3  [
        R                  " UR                  S   UR                  5      U l        [
        R                  " UR                  5      U l        UR"                  S:  d  UR$                  S:  aG  [
        R&                  " [(        R*                  " UR                  5      R-                  5       5      U l        [1        U5      U l        U R5                  5         g )NrM   r  r   )r    r!   r.   r   feature_extractorr   rH   r"   r  rI   rj   project_featuresr   feature_projectionr   feat_proj_dropoutfeature_dropoutmask_time_probmask_feature_prob	Parameterr   r   uniform_masked_spec_embedr  encoder	post_initr   s     r1   r!   SEWModel.__init__  s     !26!:,,vr':@U@UV & 3v7I7I I  &(ii0CVEWEW&XD#!zz&*B*BC  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"!&) 	r3   r7   mask_time_indicesr   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTNr   )ry  rz  r   r{  )r   r#  )ry  rz  r{  rM   )getattrr.   r   r  r3  r#  r  r   r  mask_time_lengthmask_time_min_masksr   tensorr   r   r  mask_feature_lengthmask_feature_min_masksr2  )r-   r7   r  r   rn  r  rj   mask_feature_indicess           r1   _mask_hidden_statesSEWModel._mask_hidden_states  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r3   r   r   r9  r:  r   c                 b   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nUR                  SS5      nU R                  U5      nU R                  (       a  U R                  U5      nU R                  U5      nUb  U R                  UR                  S   U5      nU R                  XS9nU R                  UUUUUS9n	U	S   nU(       d	  U4U	SS -   $ [        UU	R                  U	R                   S9$ )a  
mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
    masked extracted features in *config.proj_codevector_dim* space.
Nr   r`   )r  r   r   r9  r:  r   r*  )r.   r   r9  use_return_dictr  rN   rI   r  r  r  ro  r   r  r  r   r7   r,  )
r-   r   r   r  r   r9  r:  extract_featuresr7   encoder_outputss
             r1   r8   SEWModel.forward  sR    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;??+;<  #667GH,,-=>%!DD]EXEXYZE[]klN000d,,)/!5# ' 
 (*!#oab&999+)77&11
 	
r3   )r.   r  r  r  r  rI   r  r  r$  NNNNN)r<   r=   r>   r?   r   r!   r   FloatTensorr   rx  r  r   r   r   r   r   r   r8   r@   rA   rB   s   @r1   r  r    s    y . :>59	,((, $E$5$56, !!1!12	,\  269=,0/3&*3
u||,3
 !.3
 $E$5$56	3

 $D>3
 'tn3
 d^3
 
uo%	&3
 3
r3   r  zk
    SEW Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                      ^  \ rS rSrSS\\   4U 4S jjjrS rS rS r	S r
\     SS\\R                     S	\\R                     S
\\   S\\   S\\   S\\R                     S\\\4   4S jj5       rSrU =r$ )	SEWForCTCi;  target_langc                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        X l        UR                  c  [        SU R                   S35      e[        US5      (       a  UR                  (       a  UR                  OUR                  n[        R                   " X1R                  5      U l        U R%                  5         g)a  
target_lang (`str`, *optional*):
    Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
    adapter.<lang>.bin. Only relevant when using an instance of [`SEWForCTC`] with adapters. Uses 'eng' by
    default.
NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `SEWForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r    r!   r  rM  r   r   final_dropoutr   r  
vocab_sizer   r0   ro   r  output_hidden_sizerj   r   lm_headr  )r-   r.   r  r  r0   s       r1   r!   SEWForCTC.__init__A  s     	 F#zz&"6"67&$00@ AH H  *1)G)GFL^L^F%%djdvdv 	 yy!35F5FG 	r3   c                     U R                   nUb'  [        U R                  SS5      c  [        SU S35      eUc.  [        U R                  SS5      b  [        R                  S5        gUb  U R                  USS9  gg)a  
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.

This method is **not** supposed to be called by the user and is prone to be changed in the future.
Nadapter_attn_dimzCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  r  r.   r   loggerinfoload_adapter)r-   r  s     r1   tie_weightsSEWForCTC.tie_weights^  s     &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r3   c                 Z    [         R                  " S[        5        U R                  5         g)
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr-   s    r1   freeze_feature_extractor"SEWForCTC.freeze_feature_extractors  '    
 	Q	

 	##%r3   c                 L    U R                   R                  R                  5         gr  NrM  r  r   r  s    r1   r   SEWForCTC.freeze_feature_encoder      
 	""557r3   c                 T    U R                   R                  5        H
  nSUl        M     gz
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
FNrM  r   r   r   s     r1   freeze_base_modelSEWForCTC.freeze_base_model  #    
 XX((*E"'E +r3   r   r   r   r9  r:  labelsr   c                    Ub  UOU R                   R                  nUbJ  UR                  5       U R                   R                  :  a"  [	        SU R                   R                   35      eU R                  UUUUUS9nUS   nU R                  U5      nU R                  U5      n	Sn
UGbX  Ub  UO"[        R                  " U[        R                  S9nU R                  UR                  S5      5      R                  [        R                  5      nUS:  nUR                  S5      nUR                  U5      n[        R                   R#                  U	S[        R$                  S9R'                  SS5      n[        R(                  R*                  R-                  S	S
9   [        R                   R/                  UUUUU R                   R0                  U R                   R2                  U R                   R4                  S9n
SSS5        U(       d  U	4U[6        S -   nU
b  U
4U-   $ U$ [9        XUR:                  UR<                  S9$ ! , (       d  f       NL= f)a  
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
    Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
    the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
    All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
    config.vocab_size - 1]`.
Nz$Label values must be <= vocab_size: r  r   r"  rM   )rh   r#  r   F)enabled)blank	reductionzero_infinitylosslogitsr7   r,  )r.   r  r  r  r   rM  r   r  r   	ones_liker/  rg  r0  r3  masked_selectr   r   log_softmaxfloat32rN   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r7   r,  )r-   r   r   r   r9  r:  r  r  r7   r  r  r>  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r1   r8   SEWForCTC.forward  s   " &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]](()/!5#  
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; Y)F)G!HHF)-)9TGf$EvEG4I4IV]VhVh
 	
 ;:s   A H??
I)r   r  rM  r  r5   r  )r<   r=   r>   r?   r   r   r!   r  r  r  r  r   r   r   r   r   r   r   r8   r@   rA   rB   s   @r1   r  r  ;  s    HSM  :<*
&8(  26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
r3   r  z
    SEW Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                      ^  \ rS rSrU 4S jrS rS rS r\     SS\	\
R                     S\	\
R                     S\	\   S	\	\   S
\	\   S\	\
R                     S\\\4   4S jj5       rSrU =r$ )SEWForSequenceClassificationi  c                 "  > [         TU ]  U5        [        US5      (       a  UR                  (       a  [	        S5      e[        U5      U l        UR                  S-   nUR                  (       a2  [        R                  " [        R                  " U5      U-  5      U l        [        R                  " UR                  UR                   5      U l        [        R                  " UR                   UR$                  5      U l        U R)                  5         g )Nr  zZSequence classification does not support the use of SEW adapters (config.add_adapter=True)r   )r    r!   ro   r  r   r  rM  r  use_weighted_layer_sumr   r  r   r  layer_weightsr   rj   classifier_proj_size	projector
num_labels
classifierr  )r-   r.   
num_layersr0   s      r1   r!   %SEWForSequenceClassification.__init__  s     6=))f.@.@l  F#--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r3   c                 Z    [         R                  " S[        5        U R                  5         g)z
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
r  Nr  r  s    r1   r  5SEWForSequenceClassification.freeze_feature_extractor  r  r3   c                 L    U R                   R                  R                  5         gr  r  r  s    r1   r  3SEWForSequenceClassification.freeze_feature_encoder  r  r3   c                 T    U R                   R                  5        H
  nSUl        M     gr  r  r   s     r1   r  .SEWForSequenceClassification.freeze_base_model  r  r3   r   r   r   r9  r:  r  r   c                 0   Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      nUc  UR                  SS9n
OU R                  UR                   S   U5      nUR#                  S5      R%                  SSUR                   S   5      nS	X) '   UR                  SS9UR                  SS9R                  SS5      -  n
U R'                  U
5      nSnUbF  [)        5       nU" UR                  SU R                   R*                  5      UR                  S5      5      nU(       d  U4U[        S -   nUb  U4U-   $ U$ [-        UUUR.                  UR0                  S
9$ )a  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
    into a tensor of type `torch.FloatTensor`. See [`SEWProcessor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NTr  r   r   rM   r   r`   r   r  )r.   r  r  rM  r  r   stackr   r   r   r  r   r0  r
  rO  ro  r   r-  r.  r  r   r  r   r7   r,  )r-   r   r   r   r9  r:  r  r  r7   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  s                    r1   r8   $SEWForSequenceClassification.forward	  s   . &1%<k$++B]B]'+{{'I'ItOc(()/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M./)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r3   )r  r  r
  rM  r  )r<   r=   r>   r?   r!   r  r  r  r   r   r   r   r   r   r   r   r8   r@   rA   rB   s   @r1   r  r    s    "
&8(  26,0/3&*)-B
u||,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
r3   r  )r  r  r  rL  )Nr   Nr   )ArS  r  typingr   r   r   numpyr  r   r   torch.nnr   activationsr	   integrations.deepspeedr
   integrations.fsdpr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   rn   r   r   configuration_sewr   
get_loggerr<   r  r   rD   rR   Moduler]   rx   r   r   r   r   r   r   r   r  r  rL  r   r   rx  ndarrayr  r  r  r  r  __all__r  r3   r1   <module>r-     s  ,   , ,    % ! @ 7 B 9 Y Y F & , ( 
		H	%8 *6 66 0( (Vbii BII ,#		 #X  $(,%II%<<% 
% <<	%
 U\\*% e_% % %%<U/299 U/pRYY 0!0 !Hc
 c
L B B BR 26tc?tt t U--.	t
 t ZZtn w
! w
 w
t !"  
S
" S

S
l p
#5 p
p
f Zr3   