
    <h                        S r SSKrSSKrSSKJr  SSKJrJrJr  SSK	r
SSKrSSKrSSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJr  SSKJrJrJ r J!r!J"r"J#r#J$r$  SSK%J&r&J'r'  SSK(J)r)  SSK*J+r+J,r,J-r-J.r.J/r/J0r0J1r1J2r2  SSK3J4r4  Sr5Sr6\0" 5       (       a  SSK7J8r9  \1" 5       (       a  SSK:J;r;  \2Rx                  " \=5      r>Sr?\\," SS9 " S S\+5      5       5       r@  SmS\A\B\B4   S\CS\BS \\R                     S!\BS"\
R                  4S# jjrF SnS$\AS%\BS&\\
R                     4S' jjrG " S( S)\5      rH " S* S+\5      rI " S, S-\5      rJ " S. S/\R                  5      rL " S0 S1\R                  5      rM " S2 S3\R                  5      rN " S4 S5\N5      rO " S6 S7\R                  5      rP   SoS8\R                  S9\R                  S:\R                  S;\R                  S \\R                     S<\\C   S=\CS>\\R                     4S? jjrR " S@ SA\R                  5      rS " SB SC\R                  5      rT " SD SE\5      rU " SF SG\5      rV " SH SI\R                  5      rW " SJ SK\R                  5      rX " SL SM\R                  5      rY " SN SO\R                  5      rZ " SP SQ\R                  5      r[ " SR SS\R                  5      r\\, " ST SU\'5      5       r]\, " SV SW\]5      5       r^\," SXS9 " SY SZ\]5      5       r_\, " S[ S\\]5      5       r`\," S]S9 " S^ S_\]5      5       ra\," S`S9 " Sa Sb\]5      5       rb\, " Sc Sd\]5      5       rc " Se Sf\R                  5      rd " Sg Sh\R                  5      re\," SiS9 " Sj Sk\]5      5       rf/ SlQrgg)pzPyTorch Wav2Vec2 model.    N)	dataclass)CallableOptionalUnion)nn)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputCausalLMOutputMaskedLMOutputSequenceClassifierOutputTokenClassifierOutputWav2Vec2BaseModelOutputXVectorOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputauto_docstringcached_filecheck_torch_load_is_safeis_peft_availableis_safetensors_availableis_torch_flex_attn_availablelogging   )Wav2Vec2Configzadapter.{}.binzadapter.{}.safetensors)	load_file)make_flex_block_causal_mask   za
    Output type of [`Wav2Vec2ForPreTraining`], with potential hidden states and attentions.
    )custom_introc                   f   \ rS rSr% SrSr\\R                     \	S'   Sr
\\R                     \	S'   Sr\\R                     \	S'   Sr\\R                     \	S'   Sr\\\R                        \	S'   Sr\\\R                        \	S	'   Sr\\R                     \	S
'   Sr\\R                     \	S'   Srg)Wav2Vec2ForPreTrainingOutputN   a  
loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
    Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
    paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
    Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
    projected quantized states.
projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
    Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
    target vectors for contrastive loss.
codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
    The perplexity of the codevector distribution, used to measure the diversity of the codebook.
contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
    The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
    The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentionscontrastive_lossdiversity_loss )__name__
__module____qualname____firstlineno____doc__r,   r   torchFloatTensor__annotations__r-   r.   r/   r0   tupler1   r2   r3   __static_attributes__r4       f/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.pyr*   r*   N   s    $ )-D(5$$
%,48hu0018>B):): ;B9=8E$5$56=8<M8E%"3"345<59Ju0012948hu001826NHU../6r?   r*   shape	mask_probmask_lengthattention_mask	min_masksreturnc           	        ^^^^^ U u  nmTS:  a  [        S5      eTT:  a  [        ST ST S35      e[        R                  R                  S5      R	                  5       mUUUUU4S jnUb-  UR                  5       R                  S5      R                  5       O[        U5       Vs/ sH  nTPM     snn[        R                  " UT4[        S	9n	/ n
U" T5      nUS
:X  a  U	$ U H  nU" U5      n[        R                  R                  [        R                  " UTS-
  -
  5      USS9n[        U5      S
:X  a  TS-
  nOUS
   n[        R                  " U[        R                  " X-
  [        R                   S	9U-  /5      nU
R#                  U5        M     [        R$                  " U
5      n
[        R&                  " U
SS2SS2S4   X[T45      n
U
R)                  X[T-  5      n
[        R                  " T5      SSSS24   n[        R&                  " UX[T45      R)                  X[T-  5      nU
U-   n
U
R+                  5       TS-
  :  a  TS-
  XTS-
  :  '   [        R,                  " XSS5        U	$ s  snf )a2  
Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
CPU as part of the preprocessing during training.

Args:
    shape: The shape for which to compute masks. This should be of a tuple of size 2 where
           the first element is the batch size and the second element is the length of the axis to span.
    mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                independently generated mask spans of length `mask_length` is computed by
                `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                actual percentage will be smaller.
    mask_length: size of the mask
    min_masks: minimum number of masked spans
    attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                    each batch dimension.
r#   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                    > [        TU -  T-  T-   5      n[        UT5      nUT-  T:  a  TT-  nU TS-
  -
  U:  a  [        U TS-
  -
  S5      nU$ )z;Given input length, compute how many spans should be maskedr#   r   )intmax)input_lengthnum_masked_spanepsilonrC   rB   rE   sequence_lengths     r@   compute_num_masked_span6_compute_mask_indices.<locals>.compute_num_masked_span   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr?   Ndtyper   F)replace)
ValueErrornprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaperK   put_along_axis)rA   rB   rC   rD   rE   
batch_sizerP   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrL   rM   spec_aug_mask_idxdummy_mask_idxoffsetsrN   rO   s    `` `            @@r@   _compute_mask_indicesru   q   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89'8!o'89  HHj/:$GM1/Ba%1,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;(MUWU]U] ^ao op
 	!!"34/ &2 "45 1a:&+(V ,33JVa@ab ii$T4]3Goog
'UV^^+5G ,g5 /A"55GVYZGZ!0CCD mB?w :s   (I/features_shapenum_negativesmask_time_indicesc                 L   U u  p4[         R                  " U5      n[         R                  " X4U4[         R                  S9nUb  UR	                  [
        5      O[         R                  " U [
        S9n[        U5       H  nX'   R                  5       S-
  nXRU      n	[         R                  " [         R                  " US-   5      SS2S4   US-   U45      n
[         R                  R                  SXS-   U4S9nXU
:  ==   S-  ss'   X   Xg   X'   '   Xg==   Xt-  -  ss'   M     U$ )z6
Sample `num_negatives` vectors from feature vectors.
)rA   rT   NrS   r#   r   )size)rW   rb   r_   rf   astyper`   re   r^   r\   ri   rX   randint)rv   rw   rx   rl   rO   sequence_length_rangesampled_negative_indices	batch_idxhighmapped_masked_indicesfeature_indicessampled_indicess               r@   _sample_negative_indicesr      s5    #1J IIo6  "xxzM.Zbdbjbjk +<*G  &RWWUckoMp  :&	 +//1A5 5	6R S//"))D1H*=ag*FPQS`Hab))++At!8]:S+T?:;q@; MbLr +,=,HI 	!+y/JJ+ ' $#r?   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Wav2Vec2NoLayerNormConvLayeri  c                 b  > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        g )Nr   r#   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr
   feat_extract_activation
activationselfconfiglayer_id	__class__s      r@   r   %Wav2Vec2NoLayerNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r?   c                 J    U R                  U5      nU R                  U5      nU$ N)r   r   r   r0   s     r@   forward$Wav2Vec2NoLayerNormConvLayer.forward  s$    		-06r?   )r   r   r   r   r   r5   r6   r7   r8   r   r   r>   __classcell__r   s   @r@   r   r     s    A r?   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Wav2Vec2LayerNormConvLayeri!  c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [
        R                  " U R                  SS9U l        [        UR                     U l        g )Nr   r#   r   T)elementwise_affine)r   r   r   r   r   r   r   r   r   r   r   	LayerNorm
layer_normr
   r   r   r   s      r@   r   #Wav2Vec2LayerNormConvLayer.__init__"  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r?   c                     U R                  U5      nUR                  SS5      nU R                  U5      nUR                  SS5      nU R                  U5      nU$ )NrR   )r   	transposer   r   r   s     r@   r   "Wav2Vec2LayerNormConvLayer.forward1  sV    		-0%//B76%//B76r?   r   r   r   r   r   r   r   r   s   @r@   r   r   !  s    A r?   r   c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )Wav2Vec2GroupNormConvLayeri<  c                   > [         TU ]  5         US:  a  UR                  US-
     OSU l        UR                  U   U l        [
        R                  " U R                  U R                  UR                  U   UR                  U   UR                  S9U l
        [        UR                     U l        [
        R                  " U R                  U R                  SS9U l        g )Nr   r#   r   T)
num_groupsnum_channelsaffine)r   r   r   r   r   r   r   r   r   r   r   r
   r   r   	GroupNormr   r   s      r@   r   #Wav2Vec2GroupNormConvLayer.__init__=  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr?   c                 l    U R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r   r   r   r   s     r@   r   "Wav2Vec2GroupNormConvLayer.forwardM  s2    		-066r?   r   r   r   r   s   @r@   r   r   <  s    r  r?   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Wav2Vec2PositionalConvEmbeddingiT  c                   > [         TU ]  5         [        R                  " UR                  UR                  UR
                  UR
                  S-  UR                  S9U l        [        R                  R                  n[        [        R                  R                  S5      (       a$  [        R                  R                  R                  n[        5       (       Ga%  SS KnUR                  R                  U R                  R                   SS9   U" U R                  SSS9U l        S S S 5        [        U R                  S5      (       aU  U R                  R                  R                   R"                  nU R                  R                  R                   R$                  nO,U R                  R&                  nU R                  R(                  nUR                  R+                  X5        UR                  R+                  X5        OU" U R                  SSS9U l        [-        UR
                  5      U l        [0        UR2                     U l        g ! , (       d  f       GN,= f)	Nr'   )r   paddinggroupsweight_normr   )modifier_rankweight)namedimparametrizations)r   r   r   r   hidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsr   utilsr   hasattrr   r   	deepspeedzeroGatheredParametersr   	original0	original1weight_gweight_vregister_external_parameterWav2Vec2SamePadLayerr   r
   r   r   )r   r   r   r   r   r   r   s         r@   r   (Wav2Vec2PositionalConvEmbedding.__init__U  s   II6622a777
	 hh**288,,m<<((33??K%''224993C3CST2U'		aH	 Vtyy"4559955<<FF9955<<FF99--99--NN66tFNN66tF#DIIH!DDI+F,J,JK !?!?@ VUs   I
Ic                     UR                  SS5      nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SS5      nU$ Nr#   r'   )r   r   r   r   r   s     r@   r   'Wav2Vec2PositionalConvEmbedding.forwardv  sV    %//15		-0]36%//15r?   )r   r   r   r   r   s   @r@   r   r   T  s    AB r?   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r   i  c                 R   > [         TU ]  5         US-  S:X  a  SU l        g SU l        g )Nr'   r   r#   )r   r   num_pad_remove)r   r   r   s     r@   r   Wav2Vec2SamePadLayer.__init__  s)    #:Q#>!#Car?   c                 X    U R                   S:  a  US S 2S S 2S U R                   * 24   nU$ Nr   r   r   s     r@   r   Wav2Vec2SamePadLayer.forward  s6    ")!Q0F43F3F2F0F*FGMr?   r   r   r   s   @r@   r   r     s    K r?   r   c                   8   ^  \ rS rSrSrU 4S jrS rS rSrU =r	$ )Wav2Vec2FeatureEncoderi  z.Construct the features from raw audio waveformc           	        > [         TU ]  5         UR                  S:X  a?  [        USS9/[	        UR
                  S-
  5       Vs/ sH  n[        XS-   S9PM     sn-   nOUUR                  S:X  a,  [	        UR
                  5       Vs/ sH  n[        XS9PM     nnO[        SUR                   S35      e[        R                  " U5      U l        SU l        S	U l        g s  snf s  snf )
Ngroupr   )r   r#   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r   r   feat_extract_normr   r^   num_feat_extract_layersr   r   rV   r   
ModuleListconv_layersgradient_checkpointing_requires_grad)r   r   ir   r   s       r@   r   Wav2Vec2FeatureEncoder.__init__  s    ##w.5fqIJNSTZTrTruvTvNwNNw,V!eDNwN K %%0HMfNlNlHmHm1*6>Hm  K 01I1I0JJst  ==5&+#"Ns   CC#c                 N    U R                  5        H
  nSUl        M     SU l        g NF)
parametersrequires_gradr   r   params     r@   _freeze_parameters)Wav2Vec2FeatureEncoder._freeze_parameters  s#    __&E"'E '#r?   c                     US S 2S 4   nU R                   (       a  U R                  (       a  SUl        U R                   H  nU" U5      nM     U$ )NT)r   trainingr   r   )r   input_valuesr0   
conv_layers       r@   r   Wav2Vec2FeatureEncoder.forward  sK    $QW- 4==*.M'**J&}5M + r?   )r   r   r   )
r5   r6   r7   r8   r9   r   r   r   r>   r   r   s   @r@   r   r     s    8#&$

 
r?   r   c                   (   ^  \ rS rSrU 4S jrSrU =r$ )Wav2Vec2FeatureExtractori  c                    > [         TU ]  U5        [        R                  " SU R                  R
                   SU R                  R                  S   R
                   S3[        5        g )NzThe class `zD` has been depreciated and will be removed in Transformers v5. Use `r   z
` instead.)r   r   warningswarnr   r5   	__bases__FutureWarningr   r   r   s     r@   r   !Wav2Vec2FeatureExtractor.__init__  s[     $..112 3NN,,Q/889E 		
r?   r4   )r5   r6   r7   r8   r   r>   r   r   s   @r@   r   r     s    
 
r?   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Wav2Vec2FeatureProjectioni  c                 4  > [         TU ]  5         [        R                  " UR                  S   UR
                  S9U l        [        R                  " UR                  S   UR                  5      U l	        [        R                  " UR                  5      U l        g )NrR   eps)r   r   r   r   r   layer_norm_epsr   Linearr   
projectionDropoutfeat_proj_dropoutdropoutr  s     r@   r   "Wav2Vec2FeatureProjection.__init__  sf    ,,vr':@U@UV))FOOB$79K9KLzz&":":;r?   c                 n    U R                  U5      nU R                  U5      nU R                  U5      nX4$ r   )r   r  r  )r   r0   norm_hidden_statess      r@   r   !Wav2Vec2FeatureProjection.forward  s7    !__];(:;]300r?   )r  r   r  r   r   s   @r@   r  r    s    <1 1r?   r  modulequerykeyvaluescalingr  	head_maskc                    Uc  UR                  S5      S-  n[        R                  " XR                  SS5      5      U-  n	Ub  X-   n	[        R
                  R                  U	SS9n	Ub  XR                  SSSS5      -  n	[        R
                  R                  XU R                  S9n	[        R                  " X5      n
U
R                  SS5      R                  5       n
X4$ )NrR         r'   r	   r   r#   )pr   )rz   r:   matmulr   r   
functionalsoftmaxviewr  r   
contiguous)r  r  r  r  rD   r  r  r  kwargsattn_weightsattn_outputs              r@   eager_attention_forwardr&    s     **R.D(<<}}Q':;gEL!#4==((2(>L#nnQAq&AA==((6??([L,,|3K''1-88:K$$r?   c                   Z  ^  \ rS rSrSr     SS\S\S\S\S\S\S	\\	   4U 4S
 jjjr
    SS\R                  S\\R                     S\\R                     S\\R                     S\\   S\\   S\\R                  \\R                     \\\R                        4   4S jjrSrU =r$ )Wav2Vec2Attentioni  z=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr  
is_decoderr   	is_causalr   c                   > [         TU ]  5         Xl        X l        X0l        X-  U l        Xpl        U R
                  U-  U R                  :w  a  [        SU R                   SU S35      eU R
                  S-  U l        X@l	        X`l
        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        [        R                  " XUS9U l        g )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r  )r   )r   r   r)  r*  r  head_dimr   rV   r  r+  r,  r   r  k_projv_projq_projout_proj)	r   r)  r*  r  r+  r   r,  r   r   s	           r@   r   Wav2Vec2Attention.__init__  s     	""!.MMI%$..8MdnnM]$YKr3  }}d*$"ii	4@ii	4@ii	4@		)TBr?   r0   key_value_statesrD   layer_head_maskoutput_attentionsr#  rF   c                     USLnUR                   SS u  pU(       a  UR                   S   OU	n
XSU R                  4nXSU R                  4nU R                  U5      R                  " U6 R	                  SS5      nU(       a  UOUnU R                  U5      R                  " U6 R	                  SS5      nU R                  U5      R                  " U6 R	                  SS5      n[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UUUU4U R                  (       d  SOU R                  U R                  UUS.UD6u  nnUR                  XS5      R                  5       nU R!                  U5      nUUS4$ )z#Input shape: Batch x Time x ChannelNrR   r#   r'   eager        )r  r  r6  r  )rA   r.  r1  r!  r   r/  r0  r&  r   _attn_implementationr   r   r  r  rj   r"  r2  )r   r0   r4  rD   r5  r6  r#  is_cross_attentionbsztgt_lensrc_lenq_input_shapekv_input_shapequery_statescurrent_states
key_statesvalue_statesattention_interfacer%  r$  s                       r@   r   Wav2Vec2Attention.forward  s    .T9 %**3B//A"((+wr4==9DMM: {{=166FPPQRTUV-?)][[055~FPPQRTUV
{{>277HRRSTVWX(?;;++w6"9$++:Z:Z"[$7%
  $}}C$,,LL/%%
 %
!\ "))#;FFHmmK0L$..r?   )r   r  r)  r.  r,  r+  r/  r*  r2  r1  r  r0  )r9  FTFN)NNNF)r5   r6   r7   r8   r9   rJ   floatr`   r   r$   r   r:   Tensorr   r   r=   r   r>   r   r   s   @r@   r(  r(    s    G  +/CC C 	C
 C C C (C CD 481526,13/||3/ #5<<03/ !.	3/
 "%,,/3/ $D>3/ -.3/ 
u||Xell3XeELL>Q5RR	S3/ 3/r?   r(  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Wav2Vec2FeedForwardiE  c                   > [         TU ]  5         [        R                  " UR                  5      U l        [        R                  " UR                  UR                  5      U l	        [        UR                  [        5      (       a  [        UR                     U l        OUR                  U l        [        R                  " UR                  UR                  5      U l        [        R                  " UR                   5      U l        g r   )r   r   r   r  activation_dropoutintermediate_dropoutr  r   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr
   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr  s     r@   r   Wav2Vec2FeedForward.__init__F  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''--'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?r?   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU R	                  U5      nU$ r   )rO  rS  rM  rT  rV  r   s     r@   r   Wav2Vec2FeedForward.forwardS  sX    //>00?11-@))-8++M:r?   )rS  rO  rM  rT  rV  r   r   s   @r@   rJ  rJ  E  s    @ r?   rJ  c                   2   ^  \ rS rSrU 4S jrSS jrSrU =r$ )Wav2Vec2EncoderLayeri]  c                   > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        g )NFr)  r*  r  r+  r   r  )r   r   r(  r   num_attention_headsattention_dropout	attentionr   r  rU  r  r   r
  r   rJ  feed_forwardfinal_layer_normr  s     r@   r   Wav2Vec2EncoderLayer.__init__^  s    *((00,,
 zz&"7"78,,v'9'9v?T?TU/7 "V-?-?VEZEZ [r?   c                     UnU R                  XUS9u  pnU R                  U5      nXA-   nU R                  U5      nXR                  U5      -   nU R	                  U5      nU4nU(       a  Xu4-  nU$ NrD   r6  )r`  r  r   ra  rb  r   r0   rD   r6  attn_residualr$  rm   outputss           r@   r   Wav2Vec2EncoderLayer.forwardm  s    %)-L] *8 *
&Q ]3%56%(9(9-(HH--m< "&Gr?   )r`  r  ra  rb  r   r   r   r   s   @r@   r[  r[  ]  s    \ r?   r[  c                   t   ^  \ rS rSrU 4S jr  SS\R                  S\\R                     S\4S jjr	Sr
U =r$ )	#Wav2Vec2EncoderLayerStableLayerNormi  c                   > [         TU ]  5         [        UR                  UR                  UR
                  SUS9U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  S9U l        [        U5      U l        [        R                  " UR                  UR                  S9U l        [#        USS 5      b  [%        U5      U l        g S U l        g )NFr]  r  adapter_attn_dim)r   r   r(  r   r^  r_  r`  r   r  rU  r  r   r
  r   rJ  ra  rb  getattrWav2Vec2AttnAdapterLayeradapter_layerr  s     r@   r   ,Wav2Vec2EncoderLayerStableLayerNorm.__init__  s    *((00,,
 zz&"7"78,,v'9'9v?T?TU/7 "V-?-?VEZEZ [6-t4@!9&!AD!%Dr?   r0   rD   r6  c                    UnU R                  U5      nU R                  XUS9u  pnU R                  U5      nXA-   nXR                  U R	                  U5      5      -   nU R
                  b  XR                  U5      -   nU4nU(       a  Xu4-  nU$ re  )r   r`  r  ra  rb  rq  rg  s           r@   r   +Wav2Vec2EncoderLayerStableLayerNorm.forward  s     &6)-L] *8 *
&Q ]3%5%(9(9$:O:OP]:^(__)),>,>},MMM "&Gr?   )rq  r`  r  ra  rb  r   r   )r5   r6   r7   r8   r   r:   rH  r   r`   r   r>   r   r   s   @r@   rl  rl    sC    &, 26"'	|| !.  	 r?   rl  c                      ^  \ rS rSrU 4S jr    SS\R                  S\\R                     S\	S\	S\	4
S	 jjr
S\\R                  S4   S
\R                  4S jrSrU =r$ )Wav2Vec2Encoderi  c                   > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[!        U5      PM     sn5      U l        SU l        g s  snf Nr  F)r   r   r   r   pos_conv_embedr   r   r   r
  r   r  rU  r  r   r^   num_hidden_layersr[  layersr   r   r   rm   r   s      r@   r   Wav2Vec2Encoder.__init__  s    =fE,,v'9'9v?T?TUzz&"7"78mm5QWQiQiKj$kKja%9&%AKj$kl&+# %l    CNr0   rD   r6  output_hidden_statesreturn_dictc                    U(       a  SOS nU(       a  SOS nUb4  UR                  S5      R                  SSUR                  S   5      nSX) '   U R                  UU5      nU R	                  U5      n	X-   nU R                  U5      nU R                  U5      n[        5       =(       d    [        U 5      n
U R                   H  nU(       a  Xa4-   n[        R                  " / 5      nU R                  =(       a    XR                  R                  :  nU(       a  U
(       a  U" XUS9nUS   nU(       a  SnU(       d  M|  UWS   4-   nM     U(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [!        UUUS	9$ )
Nr4   rR   r#   r'   r   rf  NNc              3   ,   #    U H  oc  M  Uv   M     g 7fr   r4   .0vs     r@   	<genexpr>*Wav2Vec2Encoder.forward.<locals>.<genexpr>       m$[q$[   	last_hidden_stater0   r1   )	unsqueezerepeatrA   _update_full_maskry  r   r  r   r   r{  r:   rY   r   r   	layerdropr=   r   r   r0   rD   r6  r  r  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusr   dropout_probabilityskip_the_layerlayer_outputss                  r@   r   Wav2Vec2Encoder.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M01//

 #11-@%;6]302R6LT6R[[E#$58H$H! #(**R.!]]Z/B[[EZEZ/ZN![ %!Te! !.a 0 ,  &9]1=M<O&O#' !*   14D Dm]GZ$[mmm++*
 	
r?   inputs_embedsc                 r   Ub  U R                   R                  S:X  a  SU;   a  UnU$ S nU$ U R                   R                  S:X  a  [        XR                  5      nU$ U R                   R                  S:X  a+  [	        U[
        R                  5      (       a
  [        USS9nU$ [        XR                  5      nU$ Nflash_attention_2r   sdpaflex_attentionF)r,  	r   r:  r   rT   rP  r:   rH  r&   r   r   rD   r  s      r@   r  !Wav2Vec2Encoder._update_full_mask      
 %{{//3FF343F  MQ  11V; "E^UhUh!i  115EEnell;;%@[`%aN
  "<NL_L_!`r?   r   r  r   r   r{  ry  NFFT)r5   r6   r7   r8   r   r:   tensorr   rH  r`   r   r   r  r>   r   r   s   @r@   rv  rv    s    , 26"'%* :
||:
 !.:
  	:

 #:
 :
zellD01 || r?   rv  c                   ~   ^  \ rS rSrU 4S jr    S	S jrS\\R                  S4   S\R                  4S jr	Sr
U =r$ )
Wav2Vec2EncoderStableLayerNormi  c                   > [         TU ]  5         Xl        [        U5      U l        [
        R                  " UR                  UR                  S9U l	        [
        R                  " UR                  5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[!        U5      PM     sn5      U l        SU l        g s  snf rx  )r   r   r   r   ry  r   r   r   r
  r   r  rU  r  r   r^   rz  rl  r{  r   r|  s      r@   r   'Wav2Vec2EncoderStableLayerNorm.__init__  s    =fE,,v'9'9v?T?TUzz&"7"78mmBGH`H`BabBaQ08Bab
 ',# cr~  Nc                    U(       a  SOS nU(       a  SOS nUb4  UR                  S5      R                  SSUR                  S   5      nSX) '   U R                  UU5      nU R	                  U5      n	X-   nU R                  U5      n[        5       =(       d    [        U 5      n
U R                   H  nU(       a  Xa4-   n[        R                  " / 5      nU R                  =(       a    XR                  R                  :  nU(       a  U
(       a  U" XUS9nUS   nU(       a  SnU(       d  M|  UWS   4-   nM     U R                  U5      nU(       a  Xa4-   nU(       d  [        S XU4 5       5      $ [!        UUUS	9$ )
Nr4   rR   r#   r'   r   rf  r  c              3   ,   #    U H  oc  M  Uv   M     g 7fr   r4   r  s     r@   r  9Wav2Vec2EncoderStableLayerNorm.forward.<locals>.<genexpr>P  r  r  r  )r  r  rA   r  ry  r  r   r   r{  r:   rY   r   r   r  r   r=   r   r  s                  r@   r   &Wav2Vec2EncoderStableLayerNorm.forward  s    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M01//

 #11-@%;]302R6LT6R[[E#$58H$H! #(**R.!]]Z/B[[EZEZ/ZN![ !&!Te! !.a 0 ,  &9]1=M<O&O#) !, 6 14D Dm]GZ$[mmm++*
 	
r?   rD   r  c                 r   Ub  U R                   R                  S:X  a  SU;   a  UnU$ S nU$ U R                   R                  S:X  a  [        XR                  5      nU$ U R                   R                  S:X  a+  [	        U[
        R                  5      (       a
  [        USS9nU$ [        XR                  5      nU$ r  r  r  s      r@   r  0Wav2Vec2EncoderStableLayerNorm._update_full_maskX  r  r?   r  r  )r5   r6   r7   r8   r   r   r   r:   rH  r  r>   r   r   s   @r@   r  r    sJ    	, "<
~ellD01 || r?   r  c                   J   ^  \ rS rSrSrU 4S jr\SS j5       rSS jrSr	U =r
$ )Wav2Vec2GumbelVectorQuantizerio  z
Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
c                 8  > [         TU ]  5         UR                  U l        UR                  U l        UR                  U R                  -  S:w  a&  [        SUR                   SU R                   S35      e[        R                  " [        R                  " SU R                  U R
                  -  UR                  U R                  -  5      5      U l        [        R                  " UR                  S   U R                  U R
                  -  5      U l        SU l        g )Nr   z`config.codevector_dim z5 must be divisible by `config.num_codevector_groups` z for concatenationr#   rR   r'   )r   r   num_codevector_groupsr   num_codevectors_per_groupnum_varscodevector_dimrV   r   	Parameterr:   r;   codevectorsr  r   weight_projtemperaturer  s     r@   r   &Wav2Vec2GumbelVectorQuantizer.__init__u  s     6688  4??2a7)&*?*?)@ A559__4EEWY  <<a4==!@&BWBW[_[j[jBjk
 99V__R%8$//DMM:YZ r?   c           	         Ub}  UR                  5       S S 2S S 4   R                  U R                  5      n[        R                  " X [        R
                  " U 5      5      n U R                  SS9UR                  5       -  nOU R                  SS9n[        R                  " [        R                  " U[        R                  " US-   5      -  SS9* 5      R                  5       nU$ )Nr   r  gHz>rR   )
flattenexpandrA   r:   where
zeros_liker\   meanexplog)probsmaskmask_extendedmarginal_probs
perplexitys        r@   _compute_perplexity1Wav2Vec2GumbelVectorQuantizer._compute_perplexity  s     LLN1dD=9@@MMKKe6F6Fu6MNE"YY1Y-
:N"ZZAZ.NYY		.599^VZEZ;[*[ac ddeiik
r?   c                    UR                   u  p4nU R                  U5      nUR                  X4-  U R                  -  S5      nU R                  (       a  [
        R                  R                  UR                  5       U R                  SS9R                  U5      n[        R                  " UR                  X4-  U R                  S5      R                  5       SS9nU R                  Xr5      nO{UR                  SS9n	UR                  UR                   5      R!                  SU	R                  SS5      S5      nUR                  X4-  U R                  S5      nU R                  Xb5      nUR                  X4-  S5      nUR#                  S5      U R$                  -  n
U
R                  X4-  U R                  U R&                  S5      nUR)                  S5      R                  X4S5      nX4$ )NrR   T)tauhardr  r#         ?r   )rA   r  r!  r   r   r   r  gumbel_softmaxrG  r  type_asr:   r   r  argmax	new_zerosscatter_r  r  r  r\   )r   r0   rx   rl   rO   r   codevector_probscodevector_soft_distr  codevector_idxcodevectors_per_groupr  s               r@   r   %Wav2Vec2GumbelVectorQuantizer.forward  s   3@3F3F0
[ ((7%**:+G$//+Y[]^==!}};;##%4+;+;$  <  gm$ 
 $)=="":#?RTU[[]ce$  112FZJ +11b19N,66}7J7JKTTN''A.   044Z5QSWSbSbdfg112BVJ+001MrR 0 : :2 >AQAQ Q+001Mt`d`m`moqr!oob)..zBO&&r?   )r  r   r  r  r  r   )r5   r6   r7   r8   r9   r   staticmethodr  r   r>   r   r   s   @r@   r  r  o  s+    
( 	 	#' #'r?   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )Wav2Vec2Adapteri  c                   >^ [         TU ]  5         TR                  TR                  :w  aV  [        R
                  " TR                  TR                  5      U l        [        R                  " TR                  5      U l        OS =U l        U l        [        R                  " U4S j[        TR                  5       5       5      U l        TR                  U l        g )Nc              3   8   >#    U H  n[        T5      v   M     g 7fr   )Wav2Vec2AdapterLayer)r  rm   r   s     r@   r  +Wav2Vec2Adapter.__init__.<locals>.<genexpr>  s     #kJjQ$8$@$@Jjs   )r   r   output_hidden_sizer   r   r  projr   proj_layer_normr   r^   num_adapter_layersr{  r  r  s    `r@   r   Wav2Vec2Adapter.__init__  s     $$(:(::		&"4"4f6O6OPDI#%<<0I0I#JD /33DI,mm#k%PVPiPiJj#kk))r?   c                 |   U R                   b/  U R                  b"  U R                  U5      nU R                  U5      nUR                  SS5      nU R                   HK  n[        R
                  R                  5       nU R                  (       a  X0R                  :  d  MC  U" U5      nMM     UR                  SS5      nU$ r   )r  r  r   r{  rW   rX   r   r  )r   r0   r   layerdrop_probs       r@   r   Wav2Vec2Adapter.forward  s    99 T%9%9%E IIm4M 00?M%//15[[EYY--/N==^nn%D %m 4 !
 &//15r?   )r  r{  r  r  r   r   s   @r@   r  r    s    * r?   r  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )r  i  c                    > [         TU ]  5         [        R                  " UR                  SUR                  -  UR
                  UR                  SS9U l        g )Nr'   r#   )r   r   )r   r   r   r   r  adapter_kernel_sizeadapter_strider   r  s     r@   r   Wav2Vec2AdapterLayer.__init__  sJ    II%%)))&&((
	r?   c                 d    U R                  U5      n[        R                  R                  USS9nU$ )Nr#   r  )r   r   r  glur   s     r@   r   Wav2Vec2AdapterLayer.forward  s/    		-0))-Q)?r?   )r   r   r   s   @r@   r  r    s    
 r?   r  c                   J   ^  \ rS rSrU 4S jrS\R                  4S jrSrU =r	$ )rp  i  c                   > [         TU ]  5         UR                  U l        UR                  U l        [        R                  " U R
                  5      U l        [        R                  " U R
                  U R                  5      U l
        [        R                  " 5       U l        [        R                  " U R                  U R
                  5      U l        g)z
Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
up training throughput.
N)r   r   rn  	input_dimr   
hidden_dimr   r   normr  linear_1ReLUact_fnlinear_2r  s     r@   r   !Wav2Vec2AttnAdapterLayer.__init__  s    
 	00 ,,LL1			$//4>>Bggi		$..$//Br?   r0   c                     U R                  U5      nU R                  U5      nU R                  U5      nU R                  U5      nU$ r   )r  r  r  r  r   s     r@   r    Wav2Vec2AttnAdapterLayer.forward  s@    		-0m4M2m4r?   )r  r  r  r  r  r  )
r5   r6   r7   r8   r   r:   r;   r   r>   r   r   s   @r@   rp  rp    s     CU%6%6  r?   rp  c                       \ rS rSr% \\S'   SrSrSrSr	Sr
SrS r SS\\R                  \4   S	\\   4S
 jjr SS\S\R                  4S jjrS rS rSS\4S jjrSrg)Wav2Vec2PreTrainedModeli  r   wav2vec2r   Tc           
      t   [        U[        5      (       aW  UR                  R                  5         UR                  R                  5         SUR                  l        SUR                  l        g[        U[        5      (       a  UR                  R                  R                  R                  SSS9  UR                  R                  R                  R                  5         [        R                  R                  UR                   5        g[        U["        5      (       a  [        R                  R                  UR$                  R                  SS[&        R(                  " SUR$                  R*                  S   UR$                  R,                  -  -  5      -  S9  [        R                  R/                  UR$                  R                  S5        g[        U[0        5      (       a  [&        R(                  " SUR2                  R4                  -  5      n[        R                  R                  UR2                  R                  U* US9  [        R                  R                  UR2                  R                  U* US9  g[        U[        R6                  5      (       ak  UR                  R                  R                  SU R8                  R:                  S9  UR                  b%  UR                  R                  R                  5         gg[        U[        R<                  [        R>                  45      (       aJ  UR                  R                  R                  5         UR                  R                  RA                  S	5        g[        U[        RB                  5      (       a  [        R                  RE                  UR                  5        UR                  bh  [&        R(                  " URF                  UR,                  UR*                  S   -  -  5      n[        R                  R                  UR                  U* US9  ggg)
zInitialize the weightsTr9  r#   )r  stdr   r'   )abNr  )$rP  Wav2Vec2ForPreTrainingproject_hidreset_parameters	project_q_is_hf_initializedr  r  r   datanormal_r   zero_r   inituniform_r  r   r   mathsqrtr   in_channels	constant_r  r  in_featuresr  r   initializer_ranger   r   fill_r   kaiming_normal_r   )r   r  ks      r@   _init_weights%Wav2Vec2PreTrainedModel._init_weights  s    f455//1--/48F126F/ =>>%%**222C##((..0GGV//0 ?@@GGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 9::		!f//;;;<AGGV..55!qAGGV..33rQ?		**MM&&CT[[5R5R&S{{&  &&( 'r|| <==KK""$MM$$S)		**GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r?   Nrn   add_adapterc                 d   Uc  U R                   R                  OUnS n[        U R                   R                  U R                   R                  5       H  u  pEU" XU5      nM     U(       aD  [        U R                   R                  5       H!  nU" USU R                   R                  5      nM#     U$ )z8
Computes the output length of the convolutional layers
c                 8    [         R                  " X-
  USS9S-   $ )Nfloor)rounding_moder#   )r:   divrL   r   r   s      r@   _conv_out_lengthRWav2Vec2PreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length?  s      99\7wWZ[[[r?   r#   )r   r  zipr   r   r^   r  r  )r   rn   r  r!  r   r   rm   s          r@    _get_feat_extract_output_lengths8Wav2Vec2PreTrainedModel._get_feat_extract_output_lengths6  s     2=1Ddkk--+	\
 $'t{{'>'>@W@W#XK,]PM $Y 4;;99: 04;;C]C] ^ ; r?   feature_vector_lengthrD   c                    UR                  SS9S S 2S4   nU R                  XCS9nUR                  [        R                  5      nUR
                  S   n[        R                  " Xa4UR                  UR                  S9nSU[        R                  " UR
                  S   UR                  S9US-
  4'   UR                  S/5      R                  S5      R                  S/5      R                  5       nU$ )NrR   r  r  r   )rT   devicer#   )r)  )cumsumr$  tor:   longrA   r_   rT   r)  rb   flipr`   )r   r&  rD   r  non_padded_lengthsoutput_lengthsrl   s          r@   "_get_feature_vector_attention_mask:Wav2Vec2PreTrainedModel._get_feature_vector_attention_maskM  s    
 ,22r2:1b5A>>?Q>k'**5::6#))!,
/~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr?   c                    U R                   R                  c  [        U R                   S35      e0 nU R	                  5        HI  u  p#[        U[        5      (       d  M  UR                  5        H  u  pEXQSR                  X$/5      '   M     MK     [        U [        5      (       a8  U R                  R                  5        H  u  p%XQSR                  SU/5      '   M     U$ )NzF has no adapter layers. Make sure to define `config.adapter_attn_dim`..lm_head)r   rn  rV   r   named_modulesrP  rp  named_parametersjoinWav2Vec2ForCTCr4  )r   adapter_weightsr   r  
param_namer   s         r@   _get_adapters%Wav2Vec2PreTrainedModel._get_adaptersa  s    ;;''///uvww ..0LD&":;;)/)@)@)B%JDICHHd-?$@A *C 1
 dN++#||<<>?D)T): ;<  ? r?   c                     U R                  5        H+  n[        U[        5      (       d  M  U R                  U5        M-     [        U [        5      (       a  U R                  U R
                  5        gg)zS
(Re-)initialize attention adapter layers and lm head for adapter-only fine-tuning
N)modulesrP  rp  r  r8  r4  )r   r  s     r@   init_adapter_layers+Wav2Vec2PreTrainedModel.init_adapter_layersq  sV    
 llnF&":;;""6* %
 dN++t||, ,r?   target_langc                    U R                   R                  c  [        SU S35      eXR                  :X  a!  U(       d  [        R                  SU S35        gUR                  SS5      nUR                  SS5      nUR                  S	S5      nUR                  S
S5      nUR                  SS5      nUR                  SS5      n	UR                  SS5      n
UR                  SS5      nUR                  S[        5       (       a  SOS5      nU
b+  [        R                  " S[        5        U	b  [        S5      eU
n	U R                   R                  nSnUSLa2  [        R                  U5      n [        UUUUUUU	UUS9	n[        U5      nUcG  [$        R                  U5      n [        UUUUUUU	UUS9	n['        5         [(        R*                  " USSS9nU R-                  5       n[/        UR1                  5       5      [/        UR1                  5       5      -
  n[/        UR1                  5       5      [/        UR1                  5       5      -
  n[3        U5      S:  a!  [        SW SSR5                  U5       S35      e[3        U5      S:  a!  [        SW SSR5                  U5       S35      eUS   R6                  S   nUU R                   R8                  :w  aU  [:        R<                  " U R                   R>                  UU R@                  U RB                  S9U l"        UU R                   l        URG                  5        VVs0 sH  u  nnUURI                  UU   5      _M     nnnU RK                  USS 9  Xl        g! [          a    U(       a  e  GN["         a     U(       a  [!        SU SU SU S35      e GN,f = f! [          a    e [         a    e ["         a    [!        SU SU SU S35      ef = fs  snnf )!a/  
Load a language adapter model from a pre-trained adapter model.

Parameters:
    target_lang (`str`):
        Has to be a language id of an existing adapter weight. Adapter weights are stored in the format
        adapter.<lang>.safetensors or adapter.<lang>.bin
    force_load (`bool`, defaults to `True`):
        Whether the weights shall be loaded even if `target_lang` matches `self.target_lang`.
    cache_dir (`Union[str, os.PathLike]`, *optional*):
        Path to a directory in which a downloaded pretrained model configuration should be cached if the
        standard cache should not be used.
    force_download (`bool`, *optional*, defaults to `False`):
        Whether or not to force the (re-)download of the model weights and configuration files, overriding the
        cached versions if they exist.
    resume_download:
        Deprecated and ignored. All downloads are now resumed by default when possible.
        Will be removed in v5 of Transformers.
    proxies (`dict[str, str]`, *optional*):
        A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128',
        'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request.
    local_files_only(`bool`, *optional*, defaults to `False`):
        Whether or not to only look at local files (i.e., do not try to download the model).
    token (`str` or `bool`, *optional*):
        The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use
        the token generated when running `hf auth login` (stored in `~/.huggingface`).
    revision (`str`, *optional*, defaults to `"main"`):
        The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
        git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
        identifier allowed by git.

        <Tip>

        To test a pull request you made on the Hub, you can pass `revision="refs/pr/<pr_number>"`.

        </Tip>

    mirror (`str`, *optional*):
        Mirror source to accelerate downloads in China. If you are from China and have an accessibility
        problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety.
        Please refer to the mirror site for more information.

<Tip>

Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to
use this method in a firewalled environment.

</Tip>

Examples:

```python
>>> from transformers import Wav2Vec2ForCTC, AutoProcessor

>>> ckpt = "facebook/mms-1b-all"
>>> processor = AutoProcessor.from_pretrained(ckpt)
>>> model = Wav2Vec2ForCTC.from_pretrained(ckpt, target_lang="eng")
>>> # set specific language
>>> processor.tokenizer.set_target_lang("spa")
>>> model.load_adapter("spa")
```
NzCannot load_adapter for - if `config.adapter_attn_dim` is not defined.z#Adapter weights are already set to r3  	cache_dirforce_downloadFresume_downloadproxieslocal_files_onlytokenuse_auth_tokenrevisionuse_safetensorszrThe `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.zV`token` and `use_auth_token` are both specified. Please set only the argument `token`.)filenamerE  rF  rG  rH  rI  rK  rD  zCan't load the model for 'z'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'z=' is the correct path to a directory containing a file named cpuT)map_locationweights_onlyr   zThe adapter weights z has unexpected keys: z, z has missing keys: zlm_head.weightr)  rT   )strict)&r   rn  rV   rA  loggerwarningpopr    r   r   r  _name_or_pathWAV2VEC2_ADAPTER_SAFE_FILEformatr   safe_load_fileOSError	ExceptionWAV2VEC2_ADAPTER_PT_FILEr   r:   loadr;  setkeysrc   r7  rA   
vocab_sizer   r  r  r)  rT   r4  itemsr+  load_state_dict)r   rA  
force_loadr#  rD  rE  rF  rG  rH  rI  rJ  rK  rL  model_path_or_id
state_dictfilepathweight_pathr9  unexpected_keysmissing_keystarget_vocab_sizer  r  s                          r@   load_adapter$Wav2Vec2PreTrainedModel.load_adapter~  sA   ~ ;;''/7}Dqrss***:NN@QOPJJ{D1	$4e< **%6=**Y-!::&8%@

7D)$4d;::j$/ **%6@X@Z@Z`ef%MM E   l  #E;;44
 %'188EH)$%#1$3#%5%'
 ,K8
& /66{CH#)$%#1$3#%5%'
 )*"ZZ!&!%
. ,,.joo/037K7K7M3NN?//12S9J5KK!#3K=@VW[W`W`apWqVrrstuu"3K=@STXT]T]^jTkSllmnoo ''78>>qA 6 6699..0A$++]a]g]gDL &7DKK" ?I>N>N>PQ>Pdaaoa011>P
QZ6 '_  "  #
  "!45E4F G==M<N O>>FZqJ  #D     01A0B C99I8J K::B1F 6 Rs*   +M6  1N7 ;!O.6N4%N43N474O+)r4  rA  r   )T)r5   r6   r7   r8   r$   r<   base_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_supports_sdpa_supports_flex_attnr  r   r:   
LongTensorrJ   r   r`   r$  r0  r;  r?  rR  rk  r>   r4   r?   r@   r  r    s    "$O&*#N%9P Z^"5#3#3S#89HPQU0 Y]%(:?:J:J( -@' @' @'r?   r  c                   J  ^  \ rS rSrS\4U 4S jjrS rS r  SS\R                  S\
\R                     S\
\R                     4S	 jjr\     SS
\
\R                     S\
\R                     S\
\R                     S\
\   S\
\   S\
\   S\\\4   4S jj5       rSrU =r$ )Wav2Vec2ModeliA  r   c                   > [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        UR                  S:  d  UR                  S:  aG  [        R                  " [        R                  " UR                  5      R                  5       5      U l        UR                   (       a  [#        U5      U l        O['        U5      U l        UR(                  (       a  [+        U5      OS U l        U R/                  5         g )Nr9  )r   r   r   r   feature_extractorr  feature_projectionmask_time_probmask_feature_probr   r  r:   rH  r   r  masked_spec_embeddo_stable_layer_normr  encoderrv  r  r  adapter	post_initr  s     r@   r   Wav2Vec2Model.__init__C  s     !7!?";F"C   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&9&ADL*62DL282D2Dv.$ 	r?   c                 Z    [         R                  " S[        5        U R                  5         gz
Calling this function will disable the gradient computation for the feature encoder so that its parameters will
not be updated during training.
The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.Nr   r   r  freeze_feature_encoderr   s    r@   freeze_feature_extractor&Wav2Vec2Model.freeze_feature_extractorW  '    
 	Q	

 	##%r?   c                 8    U R                   R                  5         g
Calling this function will disable the gradient computation for the feature encoder so that its parameter will
not be updated during training.
N)rw  r   r  s    r@   r  $Wav2Vec2Model.freeze_feature_encoderc  s    
 	113r?   r0   rx   rD   c                    [        U R                  SS5      (       d  U$ UR                  5       u  pEnUb(  U R                  R	                  UR
                  5      X'   OU R                  R                  S:  a  U R                  (       a  [        XE4U R                  R                  U R                  R                  UU R                  R                  S9n[        R                  " X!R                  [        R                  S9nU R                  R	                  UR
                  5      X'   U R                  R                  S:  a  U R                  (       a  [        XF4U R                  R                  U R                  R                   U R                  R"                  S9n[        R                  " XqR                  [        R                  S9nUSS2S4   R%                  SUS5      nSX'   U$ )	z
Masks extracted features along time axis and/or along feature axis according to
[SpecAugment](https://huggingface.co/papers/1904.08779).
apply_spec_augmentTNr   )rB   rC   rD   rE   rQ  )rB   rC   rE   rR   )ro  r   rz   r{  r+  rT   ry  r   ru   mask_time_lengthmask_time_min_masksr:   r  r)  r`   rz  mask_feature_lengthmask_feature_min_masksr  )r   r0   rx   rD   rl   rO   r   mask_feature_indicess           r@   _mask_hidden_states!Wav2Vec2Model._mask_hidden_statesj  s    t{{$8$??   4A3E3E3G0
[(/3/E/E/H/HI\I\/]M,[[''!+ 5-++44 KK88-++99! !&->G[G[chcmcm n/3/E/E/H/HI\I\/]M,;;((1,#8)++77 KK;;++<<	$  $)<<0DMaMainisis#t #74#@#G#GO]_#` 23M/r?   r   r6  r  r  rF   c                 >   Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  U5      nUR                  SS5      nUb  U R                  UR                  S   USS9nU R                  U5      u  pU R                  XUS9nU R                  UUUUUS9n	U	S   nU R                  b  U R                  U5      nU(       d	  X4U	SS -   $ [        UUU	R                  U	R                  S	9$ )
a  
mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
    masked extracted features in *config.proj_codevector_dim* space.
Nr#   r'   Fr(  )rx   rD   rD   r6  r  r  r   )r  extract_featuresr0   r1   )r   r6  r  use_return_dictrw  r   r0  rA   rx  r  r}  r~  r   r0   r1   )
r   r   rD   rx   r6  r  r  r  r0   encoder_outputss
             r@   r   Wav2Vec2Model.forward  sY    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DD &&q)>u E N +/*A*ABR*S'00~ 1 
 ,,)/!5# ' 
 (*<<# LL7M!4qr7JJJ&+-)77&11	
 	
r?   )r~  r   r}  rw  rx  r{  r  NNNNN)r5   r6   r7   r8   r$   r   r  r  r:   r;   r   rs  r  r   rH  r`   r   r=   r   r   r>   r   r   s   @r@   ru  ru  A  s    ~ (
&4 :>59	,((, $E$5$56, !!1!12	,\  269=,0/3&*7
u||,7
 !.7
 $E$5$56	7

 $D>7
 'tn7
 d^7
 
u--	.7
 7
r?   ru  z?
    Wav2Vec2 Model with a quantizer and `VQ` head on top.
    c                   x  ^  \ rS rSrS\4U 4S jjrS\4S jrS rS r	\
 SS\R                  S	\R                  S
\R                  S\4S jj5       r\      SS\\R                      S\\R                      S\\R"                     S\\R"                     S\\   S\\   S\\   S\\\4   4S jj5       rSrU =r$ )r  i  r   c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        [        U5      U l	        [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R!                  5         g r   )r   r   ru  r   r   r  feat_quantizer_dropoutdropout_featuresr  	quantizerr  r   proj_codevector_dimr  r  r  r  r  s     r@   r   Wav2Vec2ForPreTraining.__init__  s     %f- "

6+H+H I6v>99V%7%79S9ST6#8#8&:T:TU 	r?   r  c                 $    XR                   l        g)zR
Set the Gumbel softmax temperature to a given value. Only necessary for training
N)r  r  )r   r  s     r@   set_gumbel_temperature-Wav2Vec2ForPreTraining.set_gumbel_temperature  s     &1"r?   c                 Z    [         R                  " S[        5        U R                  5         gr  r  r  s    r@   r  /Wav2Vec2ForPreTraining.freeze_feature_extractor  r  r?   c                 L    U R                   R                  R                  5         gr  r   rw  r   r  s    r@   r  -Wav2Vec2ForPreTraining.freeze_feature_encoder      
 	''::<r?   target_featuresnegative_featurespredicted_featuresc                     [         R                  " X/SS9n [         R                  " UR                  5       U R                  5       SS9R	                  U 5      nXC-  nU$ )z
Compute logits for contrastive loss based using cosine similarity as the distance measure between
`[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
r   r  rR   )r:   catcosine_similarityrG  r  )r  r  r  r  logitss        r@   compute_contrastive_logits1Wav2Vec2ForPreTraining.compute_contrastive_logits  s\      ))_$HaP(();)A)A)C_EZEZE\bdemm

 %r?   r   rD   rx   r~   r6  r  r  rF   c                    Ub  UOU R                   R                  nUb  UR                  [        R                  5      nU R                  UUUUUUS9nU R                  US   5      n	U R                  US   5      n
Ub  U R                  U
R                  S   USS9nU R                  XS9u  pUR                  U R                  R                  R                  5      nU R                  U5      nS=n=pUGb  UR                  u  nnnUR                  SU5      UR                  5       R                  S5         nUR                  UUSU5      R!                  S	SSS
5      nU R#                  USSS24   UU	U R                   R$                  5      nUU:H  R'                  S5      nUR)                  5       (       a  [+        S5      USS U'   UR-                  SS	5      R/                  SUR1                  S5      5      nSUR                  5       -
  S-  R-                  SS5      R3                  5       n[4        R6                  R9                  UR+                  5       USS9nU R                   R:                  U R                   R<                  -  nUU-
  U-  UR?                  5       -  nXR                   R@                  U-  -   nU(       d  Ub
  XX4US	S -   $ XU4US	S -   $ [C        UU	UUURD                  URF                  UUS9$ )aE
  
mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
    masked extracted features in *config.proj_codevector_dim* space.
sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
    Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
    Required input for pre-training.

Example:

```python
>>> import torch
>>> from transformers import AutoFeatureExtractor, Wav2Vec2ForPreTraining
>>> from transformers.models.wav2vec2.modeling_wav2vec2 import _compute_mask_indices, _sample_negative_indices
>>> from datasets import load_dataset

>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
>>> model = Wav2Vec2ForPreTraining.from_pretrained("facebook/wav2vec2-base")

>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
>>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1

>>> # compute masked indices
>>> batch_size, raw_sequence_length = input_values.shape
>>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()
>>> mask_time_indices = _compute_mask_indices(
...     shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2
... )
>>> sampled_negative_indices = _sample_negative_indices(
...     features_shape=(batch_size, sequence_length),
...     num_negatives=model.config.num_negatives,
...     mask_time_indices=mask_time_indices,
... )
>>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long)
>>> sampled_negative_indices = torch.tensor(
...     data=sampled_negative_indices, device=input_values.device, dtype=torch.long
... )

>>> with torch.no_grad():
...     outputs = model(input_values, mask_time_indices=mask_time_indices)

>>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
>>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)

>>> # show that cosine similarity is much higher than random
>>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5
tensor(True)

>>> # for contrastive loss training model should be put into train mode
>>> model = model.train()
>>> loss = model(
...     input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices
... ).loss
```N)rD   r6  r  rx   r  r   r#   Fr(  )rx   rR   r'   r	   z-infir\   )	reduction)r,   r-   r.   r/   r0   r1   r2   r3   )$r   r  r+  r:   r`   r   r  r  r0  rA   r  r  r   rT   r!  r,  permuter  contrastive_logits_temperatureallanyrG  r   rj   rz   r  r   r  cross_entropyr  r  r\   diversity_loss_weightr*   r0   r1   )r   r   rD   rx   r~   r6  r  r  ri  transformer_featuresr  quantized_featuresr/   r,   r2   r3   rl   rO   r   negative_quantized_featuresr  
neg_is_postargetnum_codevectorss                           r@   r   Wav2Vec2ForPreTraining.forward  s9   D &1%<k$++B]B]( 1 4 4UZZ @--)/!5/#   
  $//
;  00<%!DD &&q)>u E N 59NN 5C 5
1 0224>>3H3H3N3NO!^^,>?3777#/7I7O7O4J +=*A*A"k*R(--/44R8+' +F*J*JOR+gaAq! ( 44"47++$::	F -0KKPPQSTJ~~).vqr
:& %%a+33BAGF,1133t;FFq!LTTVF!}}::6<<>6]b:c"kkCCdkkFgFggO.1FF/Y]n]r]r]ttN $kk&G&G.&XXD4F^ahijikalll(>STW^_`_aWbbb+1'9"7!//))-)	
 		
r?   )r  r  r  r  r   )g?)NNNNNN)r5   r6   r7   r8   r$   r   rJ   r  r  r  r  r:   r;   r  r   r   rH  
BoolTensorr`   r   r=   r*   r   r>   r   r   s   @r@   r  r    s3   ~ 1# 1
&= 
 	** ,, "-- 	 (  268<?C,0/3&*\
u||,\
 !.\
 $E$4$45	\

 #+5+;+;"<\
 $D>\
 'tn\
 d^\
 
u22	3\
 \
r?   r  c                      ^  \ rS rSrU 4S jr\     SS\R                  S\\R                     S\\
   S\\
   S\\
   S\\R                     S	\\\4   4S
 jj5       rSrU =r$ )Wav2Vec2ForMaskedLMi  c                 D  > [         TU ]  U5        [        R                  " S[        5        [        U5      U l        [        R                  " UR                  5      U l
        [        R                  " UR                  UR                  5      U l        U R                  5         g )NzSThe class `Wav2Vec2ForMaskedLM` is deprecated. Please use `Wav2Vec2ForCTC` instead.)r   r   r   r   r  ru  r   r   r  final_dropoutr  r  r   r`  r4  r  r  s     r@   r   Wav2Vec2ForMaskedLM.__init__  sp     acp	
 &f-zz&"6"67yy!3!3V5F5FG 	r?   r   rD   r6  r  r  labelsrF   c                 
   Ub  UOU R                   R                  nU R                  UUUUS9nUS   nU R                  U5      nU R	                  U5      n	U(       d  U	4USS  -   n
U
$ [        XR                  UR                  S9$ )N)r6  r  r  r   r'   )r  r0   r1   )r   r  r   r  r4  r   r0   r1   )r   r   rD   r6  r  r  r  ri  r0   r  outputs              r@   r   Wav2Vec2ForMaskedLM.forward  s     &1%<k$++B]B]--/!5#	   
  
]3m,Y,FMV;P;P]d]o]oppr?   )r  r4  r   r  )r5   r6   r7   r8   r   r   r:   r;   r   rs  r`   rH  r   r=   r   r   r>   r   r   s   @r@   r  r    s      6:,0/3&*)-q''q !!1!12q $D>	q
 'tnq d^q &q 
un$	%q qr?   r  zp
    Wav2Vec2 Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    c                      ^  \ rS rSrSS\\   4U 4S jjjrS rS rS r	S r
\     SS\\R                     S	\\R                     S
\\   S\\   S\\   S\\R                     S\\\4   4S jj5       rSrU =r$ )r8  i  rA  c                   > [         TU ]  U5        [        U5      U l        [        R
                  " UR                  5      U l        X l        UR                  c  [        SU R                   S35      e[        US5      (       a  UR                  (       a  UR                  OUR                  n[        R                   " X1R                  5      U l        U R%                  5         g)a
  
target_lang (`str`, *optional*):
    Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
    adapter.<lang>.bin. Only relevant when using an instance of [`Wav2Vec2ForCTC`] with adapters. Uses 'eng' by
    default.
NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `Wav2Vec2ForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.r  )r   r   ru  r   r   r  r  r  rA  r`  rV   r   r   r  r  r   r  r4  r  )r   r   rA  r  r   s       r@   r   Wav2Vec2ForCTC.__init__  s     	 %f-zz&"6"67&$00@ AH H  *1)G)GFL^L^F%%djdvdv 	 yy!35F5FG 	r?   c                     U R                   nUb'  [        U R                  SS5      c  [        SU S35      eUc.  [        U R                  SS5      b  [        R                  S5        gUb  U R                  USS9  gg)a  
This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
passing `target_lang=...` to `from_pretrained(...)`.

This method is **not** supposed to be called by the user and is prone to be changed in the future.
Nrn  zCannot pass `target_lang`: rC  z)By default `target_lang` is set to 'eng'.T)rc  )rA  ro  r   rV   rS  infork  )r   rA  s     r@   tie_weightsWav2Vec2ForCTC.tie_weights  s     &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %r?   c                 Z    [         R                  " S[        5        U R                  5         gr  r  Nr  r  s    r@   r  'Wav2Vec2ForCTC.freeze_feature_extractor  r  r?   c                 L    U R                   R                  R                  5         gr  r  r  s    r@   r  %Wav2Vec2ForCTC.freeze_feature_encoder&  r  r?   c                 T    U R                   R                  5        H
  nSUl        M     gz
Calling this function will disable the gradient computation for the base model so that its parameters will not
be updated during training. Only the classification head will be updated.
FNr   r   r   r   s     r@   freeze_base_model Wav2Vec2ForCTC.freeze_base_model-  #    
 ]]--/E"'E 0r?   r   rD   r6  r  r  r  rF   c                    Ub  UOU R                   R                  nUbJ  UR                  5       U R                   R                  :  a"  [	        SU R                   R                   35      eU R                  UUUUUS9nUS   nU R                  U5      nU R                  U5      n	Sn
UGbX  Ub  UO"[        R                  " U[        R                  S9nU R                  UR                  S5      5      R                  [        R                  5      nUS:  nUR                  S5      nUR                  U5      n[        R                   R#                  U	S[        R$                  S9R'                  SS5      n[        R(                  R*                  R-                  S	S
9   [        R                   R/                  UUUUU R                   R0                  U R                   R2                  U R                   R4                  S9n
SSS5        U(       d  U	4U[6        S -   nU
b  U
4U-   $ U$ [9        XUR:                  UR<                  S9$ ! , (       d  f       NL= f)a  
labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
    Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
    the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
    All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
    config.vocab_size - 1]`.
Nz$Label values must be <= vocab_size: r  r   rS   rR   )r   rT   r#   F)enabled)blankr  zero_infinityr,   r  r0   r1   )r   r  rK   r`  rV   r   r  r4  r:   	ones_liker,  r$  r\   r+  masked_selectr   r  log_softmaxfloat32r   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   r0   r1   )r   r   rD   r6  r  r  r  ri  r0   r  r,   rn   labels_masktarget_lengthsflattened_targets	log_probsr  s                    r@   r   Wav2Vec2ForCTC.forward5  s   " &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]--)/!5#   
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+:}}--%!"++22"kk<<"&++"?"? .  ; Y)F)G!HHF)-)9TGf$EvEG4I4IV]VhVh
 	
 ;:s   A H??
I)r  r4  rA  r   r   r  )r5   r6   r7   r8   r   rR  r   r  r  r  r  r   r:   rH  r`   r   r=   r   r   r>   r   r   s   @r@   r8  r8    s    HSM  :<*
&=(  26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
r?   r8  z
    Wav2Vec2 Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                      ^  \ rS rSrU 4S jrS rS rS r\     SS\	\
R                     S\	\
R                     S\	\   S	\	\   S
\	\   S\	\
R                     S\\\4   4S jj5       rSrU =r$ )!Wav2Vec2ForSequenceClassificationi}  c                 "  > [         TU ]  U5        [        US5      (       a  UR                  (       a  [	        S5      e[        U5      U l        UR                  S-   nUR                  (       a2  [        R                  " [        R                  " U5      U-  5      U l        [        R                  " UR                  UR                   5      U l        [        R                  " UR                   UR$                  5      U l        U R)                  5         g )Nr  z_Sequence classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)r#   )r   r   r   r  rV   ru  r   rz  use_weighted_layer_sumr   r  r:   re   layer_weightsr  r   classifier_proj_size	projector
num_labels
classifierr  r   r   
num_layersr   s      r@   r   *Wav2Vec2ForSequenceClassification.__init__  s     6=))f.@.@q  &f---1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	r?   c                 Z    [         R                  " S[        5        U R                  5         gr  r  r  s    r@   r  :Wav2Vec2ForSequenceClassification.freeze_feature_extractor  r  r?   c                 L    U R                   R                  R                  5         gr  r  r  s    r@   r  8Wav2Vec2ForSequenceClassification.freeze_feature_encoder  r  r?   c                 T    U R                   R                  5        H
  nSUl        M     gr  r  r   s     r@   r  3Wav2Vec2ForSequenceClassification.freeze_base_model  r  r?   r   rD   r6  r  r  r  rF   c                 0   Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      nUc  UR                  SS9n
OU R                  UR                   S   U5      nUR#                  S5      R%                  SSUR                   S   5      nS	X) '   UR                  SS9UR                  SS9R                  SS5      -  n
U R'                  U
5      nSnUbF  [)        5       nU" UR                  SU R                   R*                  5      UR                  S5      5      nU(       d  U4U[        S -   nUb  U4U-   $ U$ [-        UUUR.                  UR0                  S
9$ )  
input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
    Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
    into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
    (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
    To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
    into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
NTr  r#   r  rR   r   r'   r9  r  )r   r  r  r   r  r:   stackr   r  r   r  r!  r\   r  r  r0  rA   r  r  r   r   r  r   r0   r1   )r   r   rD   r6  r  r  r  ri  r0   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r,   loss_fctr  s                    r@   r   )Wav2Vec2ForSequenceClassification.forward  s   . &1%<k$++B]B]'+{{'I'ItOc--)/!5#   
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M./)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
r?   )r   r  r  r   r  )r5   r6   r7   r8   r   r  r  r  r   r   r:   rH  r`   r   r=   r   r   r>   r   r   s   @r@   r  r  }  s    "
&=(  26,0/3&*)-B
u||,B
 !.B
 $D>	B

 'tnB
 d^B
 &B
 
u..	/B
 B
r?   r  c                      ^  \ rS rSrU 4S jrS rS rS r\     SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\   S
\	\   S\	\   S\\\4   4S jj5       rSrU =r$ )#Wav2Vec2ForAudioFrameClassificationi  c                   > [         TU ]  U5        [        US5      (       a  UR                  (       a  [	        S5      e[        U5      U l        UR                  S-   nUR                  (       a2  [        R                  " [        R                  " U5      U-  5      U l        [        R                  " UR                  UR                   5      U l        UR                   U l        U R%                  5         g )Nr  zbAudio frame classification does not support the use of Wav2Vec2 adapters (config.add_adapter=True)r#   )r   r   r   r  rV   ru  r   rz  r  r   r  r:   re   r  r  r   r  r   init_weightsr  s      r@   r   ,Wav2Vec2ForAudioFrameClassification.__init__  s     6=))f.@.@t  &f---1
((!#ejj.Dz.Q!RD))F$6$68I8IJ ++r?   c                 Z    [         R                  " S[        5        U R                  5         gr  r  r  s    r@   r  <Wav2Vec2ForAudioFrameClassification.freeze_feature_extractor  r  r?   c                 L    U R                   R                  R                  5         gr  r  r  s    r@   r  :Wav2Vec2ForAudioFrameClassification.freeze_feature_encoder  r  r?   c                 T    U R                   R                  5        H
  nSUl        M     gr  r  r   s     r@   r  5Wav2Vec2ForAudioFrameClassification.freeze_base_model  r  r?   r   rD   r  r6  r  r  rF   c           	         Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      n
SnUbZ  [        5       nU" U
R                  SU R                  5      [
        R                   " UR                  SU R                  5      SS95      nU(       d  U
4U[        S -   nU$ [#        UU
UR$                  UR&                  S	9$ )
r  NTr  r#   r  rR   r   )axisr  )r   r  r  r   r  r:   r  r   r  r   r  r!  r\   r   r   r  r  r   r0   r1   )r   r   rD   r  r6  r  r  ri  r0   r  r  r,   r  r  s                 r@   r   +Wav2Vec2ForAudioFrameClassification.forward#  sf   . &1%<k$++B]B]'+{{'I'ItOc--)/!5#   
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM/')HFKKDOO<ell6;;WY[_[j[jKkrs>tuDY)F)G!HHFM$!//))	
 	
r?   )r   r  r  r   r  )r5   r6   r7   r8   r   r  r  r  r   r   r:   rH  r`   r   r=   r   r   r>   r   r   s   @r@   r  r    s     
&=(  26)-,0/3&*9
u||,9
 !.9
 &	9

 $D>9
 'tn9
 d^9
 
u++	,9
 9
r?   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )AMSoftmaxLossi`  c                    > [         TU ]  5         X0l        X@l        X l        [
        R                  " [        R                  " X5      SS9U l	        [
        R                  " 5       U l        g )NT)r   )r   r   scalemarginr  r   r  r:   randnr   r   r,   )r   r  r  r$  r%  r   s        r@   r   AMSoftmaxLoss.__init__a  sI    
$ll5;;y#EUYZ'')	r?   c                    UR                  5       n[        R                  R                  U R                  SS9n[        R                  R                  USS9n[
        R                  " X5      nX@R                  -
  n[        R                  R                  X R                  5      nU R                  [
        R                  " UR                  5       XT5      -  nU R                  Xr5      nU$ )Nr   r  r#   )r  r   r  	normalizer   r:   mmr%  one_hotr  r$  r  r`   r,   )	r   r0   r  r   	cos_thetapsionehotr  r,   s	            r@   r   AMSoftmaxLoss.forwardi  s    !((!(<//1/EHH]3	++%&&v?ekk&++-HHyy(r?   )r,   r%  r  r$  r   )g      >@g?r   r   s   @r@   r"  r"  `  s    * r?   r"  c                   f   ^  \ rS rSrSU 4S jjrS\R                  S\R                  4S jrSrU =r	$ )	TDNNLayeriw  c                   > [         TU ]  5         US:  a  UR                  US-
     OUR                  U   U l        UR                  U   U l        UR
                  U   U l        UR                  U   U l        [        R                  " U R                  U R                  -  U R                  5      U l        [        R                  " 5       U l        g )Nr   r#   )r   r   tdnn_dimr   r   tdnn_kernelr   tdnn_dilationdilationr   r  kernelr  r   r   s      r@   r   TDNNLayer.__init__x  s    <DqL6??8a<8foo^fNg"OOH5!--h7,,X6ii 0 043C3C CTEVEVW'')r?   r0   rF   c                 >   [        5       (       a  SSKJn  [        5       (       a1  [        U R                  W5      (       a  [
        R                  " S5        UR                  SS5      nU R                  R                  R                  U R                  U R                  U R                  5      R                  SS5      n[        R                  R                  XU R                  R                   U R"                  S9nUR                  SS5      nU R%                  U5      nU$ )Nr   )	LoraLayerzDetected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. You should exclude TDNNLayer from LoRA's target modules.r#   r'   )r6  )r   peft.tuners.lorar:  rP  r7  r   r   r   r   r!  r   r   r   r   r  conv1dr   r6  r   )r   r0   r:  r   s       r@   r   TDNNLayer.forward  s    2$++y11O &//15##(():):D<L<LdN^N^_iijkmno,,]DKKDTDT_c_l_l,m%//156r?   )r   r6  r   r7  r   r   r   )
r5   r6   r7   r8   r   r:   rH  r   r>   r   r   s   @r@   r1  r1  w  s(    $U\\ ell  r?   r1  zl
    Wav2Vec2 Model with an XVector feature extraction head on top for tasks like Speaker Verification.
    c                     ^  \ rS rSrU 4S jrS rS rS rS\\	R                  \4   4S jr\     SS\\	R                     S	\\	R                     S
\\   S\\   S\\   S\\	R                     S\\\4   4S jj5       rSrU =r$ )Wav2Vec2ForXVectori  c                 0  > [         TU ]  U5        [        U5      U l        UR                  S-   nUR
                  (       a2  [        R                  " [        R                  " U5      U-  5      U l
        [        R                  " UR                  UR                  S   5      U l        [        [!        UR                  5      5       Vs/ sH  n[#        X5      PM     nn[        R$                  " U5      U l        [        R                  " UR                  S   S-  UR(                  5      U l        [        R                  " UR(                  UR(                  5      U l        [/        UR(                  UR0                  5      U l        U R5                  5         g s  snf )Nr#   r   rR   r'   )r   r   ru  r   rz  r  r   r  r:   re   r  r  r   r3  r  r^   rc   r1  r   tdnnxvector_output_dimrw  r   r"  r  	objectiver  )r   r   r  r   tdnn_layersr   s        r@   r   Wav2Vec2ForXVector.__init__  s    %f---1
((!#ejj.Dz.Q!RD6#5#5vq7IJ5:3v;O5PQ5Py+5PQMM+.	!#6??2+>+BFD]D]!^))F$=$=v?X?XY&v'@'@&BSBST Rs   Fc                 Z    [         R                  " S[        5        U R                  5         gr  r  r  s    r@   r  +Wav2Vec2ForXVector.freeze_feature_extractor  r  r?   c                 L    U R                   R                  R                  5         gr  r  r  s    r@   r  )Wav2Vec2ForXVector.freeze_feature_encoder  r  r?   c                 T    U R                   R                  5        H
  nSUl        M     gr  r  r   s     r@   r  $Wav2Vec2ForXVector.freeze_base_model  r  r?   rn   c                 X    S nU R                   R                   H  nU" XS5      nM     U$ )z/
Computes the output length of the TDNN layers
c                     X-
  U-  S-   $ )Nr#   r4   r   s      r@   r!  EWav2Vec2ForXVector._get_tdnn_output_lengths.<locals>._conv_out_length  s     !.69A==r?   r#   )r   r4  )r   rn   r!  r   s       r@   _get_tdnn_output_lengths+Wav2Vec2ForXVector._get_tdnn_output_lengths  s1    
	>
  ;;22K,]KM 3 r?   r   rD   r6  r  r  r  rF   c                    Ub  UOU R                   R                  nU R                   R                  (       a  SOUnU R                  UUUUUS9nU R                   R                  (       ai  U[           n[
        R                  " USS9n[        R                  R                  U R                  SS9n	XR                  SSS5      -  R                  SS9nOUS   nU R                  U5      nU R                   H  n
U
" U5      nM     Uc  UR                  SS9nUR!                  SS9nOU R#                  UR                  SS95      nU R%                  U5      n/ n/ n['        U5       HN  u  nnUR)                  XSU24   R                  SS95        UR)                  XSU24   R!                  SS95        MP     [
        R                  " U5      n[
        R                  " U5      n[
        R*                  " X/SS9nU R-                  U5      nU R/                  U5      nSnUb  U R1                  UU5      nU(       d  UU4U[        S -   nUb  U4U-   $ U$ [3        UUUUR4                  UR6                  S9$ )	r  NTr  r#   r  rR   r   )r,   r  
embeddingsr0   r1   )r   r  r  r   r  r:   r  r   r  r   r  r!  r\   r  rA  r  r  r$  rO  	enumeraterg   r  rw  r   rC  r   r0   r1   )r   r   rD   r6  r  r  r  ri  r0   r  
tdnn_layermean_featuresstd_featuresfeat_extract_output_lengthstdnn_output_lengthsr   lengthstatistic_poolingoutput_embeddingsr  r,   r  s                         r@   r   Wav2Vec2ForXVector.forward  s   . &1%<k$++B]B]'+{{'I'ItOc--)/!5#   
 ;;--#$ABM!KK1=M==001C1C0LL*->->r1a-HHMMRSMTM#AJM}5))J&}5M $ !)..1.5M(,,,3L*.*O*OP^PbPbghPbPi*j'"&"?"?@["\ML&':;	6$$]gvg:%>%C%C%C%JK##MWfW*$=$A$Aa$A$HI < "KK6M ;;|4L!II}&CL 223DE!23>>&&1D/07;X;Y3ZZF)-)9TGf$EvE(!//))
 	
r?   )r   rw  r  rC  r  rA  r   r  )r5   r6   r7   r8   r   r  r  r  r   r:   rs  rJ   rO  r   r   rH  r`   r=   r   r   r>   r   r   s   @r@   r?  r?    s    &
&=(eE<L<Lc<Q6R   26,0/3&*)-O
u||,O
 !.O
 $D>	O

 'tnO
 d^O
 &O
 
um#	$O
 O
r?   r?  )r  r8  r  r  r  r?  ru  r  r   r   )Nr9  N)hr9   r  r   dataclassesr   typingr   r   r   numpyrW   r:   torch.utils.checkpointr   torch.nnr   activationsr
   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   r   r   modeling_utilsr   r   processing_utilsr   r   r   r   r   r   r   r    r!   r"   configuration_wav2vec2r$   r\  rW  safetensors.torchr%   rY  integrations.flex_attentionr&   
get_loggerr5   rS  r  r*   r=   rJ   rG  rs  ndarrayru   r   r   r   r   Moduler   r   r   r   r  rH  r&  r(  rJ  r[  rl  rv  r  r  r  r  rp  r  ru  r  r  r8  r  r  r"  r1  r?  __all__r4   r?   r@   <module>rr     s\      ! , ,     % ! @ 7 C 9   G &	 	 	 3 , 5 =  !!J 
		H	% !"  
7; 7 7B 26tc?tt t U--.	t
 t ZZtp Z^!$!$*-!$BJ2::BV!$H#= *!; 6!; 0*bii *Z299 %RYY %P
5 
1		 1,  $(,%II%<<% 
% <<	%
 U\\*% e_% % %%<U/		 U/p")) 0!5 !H+*D +\[bii [|_RYY _DI'BII I'Xbii >299 $ryy 2 x'o x' x'v	 N
+ N
 N
b 
Y
4 Y

Y
x *q1 *q *qZ 
S
, S

S
l p
(? p
p
f f
*A f
 f
RBII .		 @ 
N
0 N

N
b	r?   