
"""PyTorch CLAP model."""

import collections
import math
from dataclasses import dataclass
from typing import Any, Callable, Optional, Union

import torch
import torch.nn.functional as F
from torch import nn

from ...activations import ACT2FN
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPooling,
    BaseModelOutputWithPoolingAndCrossAttentions,
)
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, meshgrid, prune_linear_layer
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging, torch_int
from .configuration_clap import ClapAudioConfig, ClapConfig, ClapTextConfig


logger = logging.get_logger(__name__)
ClapConfigClapTextConfigc                     U R                   u  p#nU SS2SS2SSS24   R                  SSUS5      nUR                  X#U-  U5      nU$ )aI  
Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

Args:
    hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
        Input hidden states
    ratio (`int`):
        The ratio of the length of the output to the length of the input.
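
# A quick shape sanity check for `interpolate` (illustrative sketch only; the tensor
# sizes below are arbitrary examples, not values taken from a real checkpoint):
#
#     hidden_states = torch.randn(2, 32, 8)   # (batch_size, time_length, classes_num)
#     upsampled = interpolate(hidden_states, ratio=4)
#     assert upsampled.shape == (2, 128, 8)   # time_length is multiplied by `ratio`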

def window_partition(hidden_states, window_size):
    """
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    """
    batch_size, height, width, num_channels = hidden_states.shape

    hidden_states = hidden_states.view(
        batch_size, height // window_size, window_size, width // window_size, window_size, num_channels
    )
    windows = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels)
    return windows

def window_reverse(windows, window_size, height, width):
    """
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    """
    num_channels = windows.shape[-1]
    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
    return windows
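
# `window_partition` and `window_reverse` are exact inverses whenever height and width
# are multiples of the window size. A minimal round-trip sketch (the sizes are
# assumptions chosen for illustration):
#
#     hidden_states = torch.randn(1, 8, 8, 96)      # (batch_size, height, width, num_channels)
#     windows = window_partition(hidden_states, 4)  # -> (4, 4, 4, 96): four 4x4 windows
#     restored = window_reverse(windows, 4, 8, 8)   # -> (1, 8, 8, 96)
#     assert torch.equal(restored, hidden_states)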

def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx
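
# Position numbers start at padding_idx + 1 while padding positions keep padding_idx
# itself, mirroring fairseq's `make_positions`. A small illustration (the token ids are
# arbitrary; padding_idx=1 as in RoBERTa-style vocabularies):
#
#     input_ids = torch.tensor([[5, 7, 9, 1, 1]])
#     create_position_ids_from_input_ids(input_ids, padding_idx=1)
#     # -> tensor([[2, 3, 4, 1, 1]])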
r   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxpast_key_values_lengthmaskincremental_indicess        r(   "create_position_ids_from_input_idsrJ   e   sW     <<$((*D <<!4<<TBE[[_cc##%33r*   logitsreturnc                     [         R                  " [        U 5      U R                  S9n[        R
                  R                  X5      $ )Ndevice)rA   arangelenrO   r   
functionalcross_entropy)rK   labelss     r(   contrastive_lossrU   w   s/    \\#f+fmm<F==&&v66r*   ze
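
# CLAP's training objective is symmetric: the same cross-entropy is applied to the
# text->audio and audio->text similarity matrices and averaged. A sketch of how the
# loss is typically assembled from this helper (variable names are illustrative;
# `logit_scale` stands in for the model's learned temperature):
#
#     logits_per_text = text_embeds @ audio_embeds.t() * logit_scale
#     caption_loss = contrastive_loss(logits_per_text)
#     audio_loss = contrastive_loss(logits_per_text.t())
#     loss = (caption_loss + audio_loss) / 2.0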

@dataclass
@auto_docstring(
    custom_intro="""
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    """
)
class ClapTextModelOutput(ModelOutput):
    r"""
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The text embeddings obtained by applying the projection layer to the pooler_output.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None

@dataclass
@auto_docstring(
    custom_intro="""
    ClapAudio model output to mimic the output of the original implementation.
    """
)
class ClapAudioModelOutput(ModelOutput):
    r"""
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
        The Audio embeddings obtained by applying the projection layer to the pooler_output.
    """

    audio_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[tuple[torch.FloatTensor, ...]] = None

@dataclass
@auto_docstring
class ClapOutput(ModelOutput):
    r"""
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for audio-text similarity.
    logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
        The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
        The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
    audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
        The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapTextModel`].
    audio_model_output (`BaseModelOutputWithPooling`):
        The output of the [`ClapAudioModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_audio: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    audio_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    audio_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "audio_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )

class ClapDropPath(nn.Module):
    """
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    """

    def __init__(self, drop_prob=None):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states):
        if self.drop_prob == 0.0 or not self.training:
            return hidden_states

        keep_prob = 1 - self.drop_prob
        shape = (hidden_states.shape[0],) + (1,) * (hidden_states.ndim - 1)

        random_tensor = keep_prob + torch.rand(shape, dtype=hidden_states.dtype, device=hidden_states.device)
        random_tensor.floor_()
        output = hidden_states.div(keep_prob) * random_tensor
        return output

class ClapAudioAFFBlock(nn.Module):
    r"""
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        channels = config.patch_embeds_hidden_size
        downsize_ratio = config.aff_block_r
        inter_channels = int(channels // downsize_ratio)

        self.local_att = nn.Sequential(
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )
        self.global_att = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),
            nn.Conv2d(channels, inter_channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(inter_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(inter_channels, channels, kernel_size=1, stride=1, padding=0),
            nn.BatchNorm2d(channels),
        )

        self.sigmoid = nn.Sigmoid()

    def forward(self, hidden_states, residual):
        attention_input = hidden_states + residual

        fused_layer_output = self.local_att(attention_input) + self.global_att(attention_input)
        fused_layer_output = self.sigmoid(fused_layer_output)

        output = 2 * hidden_states * fused_layer_output + 2 * residual * (1 - fused_layer_output)
        return output

class ClapAudioPatchEmbed(nn.Module):
    """
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    """

    def __init__(self, config: ClapAudioConfig):
        super().__init__()
        img_size = (config.spec_size, config.spec_size) if isinstance(config.spec_size, int) else config.spec_size
        patch_size = (
            (config.patch_size, config.patch_size) if isinstance(config.patch_size, int) else config.patch_size
        )
        patch_stride = (
            (config.patch_stride, config.patch_stride) if isinstance(config.patch_stride, int) else config.patch_stride
        )

        self.img_size = img_size
        self.patch_stride = patch_stride

        self.grid_size = (img_size[0] // patch_stride[0], img_size[1] // patch_stride[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]

        self.flatten = config.flatten_patch_embeds
        self.enable_fusion = config.enable_fusion

        padding = ((patch_size[0] - patch_stride[0]) // 2, (patch_size[1] - patch_stride[1]) // 2)

        scale_factor = 4 if (self.enable_fusion) and (config.fusion_type == "channel_map") else 1

        self.proj = nn.Conv2d(
            config.patch_embed_input_channels * scale_factor,
            config.patch_embeds_hidden_size,
            kernel_size=patch_size,
            stride=patch_stride,
            padding=padding,
        )

        self.norm = nn.LayerNorm(config.patch_embeds_hidden_size) if config.enable_patch_layer_norm else nn.Identity()
        if self.enable_fusion:
            self.fusion_model = ClapAudioAFFBlock(config)
            self.mel_conv2d = nn.Conv2d(
                config.patch_embed_input_channels,
                config.patch_embeds_hidden_size,
                kernel_size=(patch_size[0], patch_size[1] * 3),
                stride=(patch_stride[0], patch_stride[1] * 3),
                padding=padding,
            )

    def forward(self, hidden_states, is_longer_idx=None):
        if self.enable_fusion:
            # retrieve the last mel as we have transposed the input
            global_hidden_states = hidden_states[:, 0:1, :, :]

            # global processing
            batch_size, num_channels, height, width = global_hidden_states.shape

            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )

            global_hidden_states = self.proj(global_hidden_states)
            output_width = global_hidden_states.size(-1)
            if len(is_longer_idx) > 0:
                # local processing
                local_hidden_states = hidden_states[is_longer_idx, 1:, :, :].contiguous()
                batch_size, num_channels, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size * num_channels, 1, height, width)

                local_hidden_states = self.mel_conv2d(local_hidden_states)

                _, features, height, width = local_hidden_states.shape
                local_hidden_states = local_hidden_states.view(batch_size, num_channels, features, height, width)
                local_hidden_states = local_hidden_states.permute((0, 2, 3, 1, 4)).contiguous().flatten(3)

                local_width = local_hidden_states.size(-1)
                local_hidden_states = torch.nn.functional.pad(
                    local_hidden_states, (0, output_width - local_width), "constant", 0
                )

                global_hidden_states[is_longer_idx] = self.fusion_model(
                    global_hidden_states[is_longer_idx], local_hidden_states
                )
            hidden_states = global_hidden_states
        else:
            _, _, height, width = hidden_states.shape
            if height != self.img_size[0] or width != self.img_size[1]:
                raise ValueError(
                    f"Input audio size ({height}*{width}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
                )
            hidden_states = self.proj(hidden_states)

        if self.flatten:
            hidden_states = hidden_states.flatten(2).transpose(1, 2)
        hidden_states = self.norm(hidden_states)
        return hidden_states


class ClapAudioSelfAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        if dim % num_heads != 0:
            raise ValueError(
                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
            )

        self.num_attention_heads = num_heads
        self.attention_head_size = int(dim / num_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size
        self.window_size = (
            window_size if isinstance(window_size, collections.abc.Iterable) else (window_size, window_size)
        )

        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), num_heads)
        )

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(meshgrid([coords_h, coords_w], indexing="ij"))
        coords_flatten = torch.flatten(coords, 1)
        relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]
        relative_coords = relative_coords.permute(1, 2, 0).contiguous()
        relative_coords[:, :, 0] += self.window_size[0] - 1
        relative_coords[:, :, 1] += self.window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)
        self.register_buffer("relative_position_index", relative_position_index)

        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        batch_size, dim, num_channels = hidden_states.shape
        hidden_shape = (batch_size, dim, -1, self.attention_head_size)

        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)
        key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
        value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)]
        relative_position_bias = relative_position_bias.view(
            self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1
        )

        relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()
        attention_scores = attention_scores + relative_position_bias.unsqueeze(0)

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the forward pass)
            mask_shape = attention_mask.shape[0]
            attention_scores = attention_scores.view(
                batch_size // mask_shape, mask_shape, self.num_attention_heads, dim, dim
            )
            attention_scores = attention_scores + attention_mask.unsqueeze(1).unsqueeze(0)
            attention_scores = attention_scores.view(-1, self.num_attention_heads, dim, dim)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)

        return outputs


class ClapAudioSelfOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, dim)
        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)

        return hidden_states


class ClapAudioAttention(nn.Module):
    def __init__(self, config, dim, num_heads, window_size):
        super().__init__()
        self.self = ClapAudioSelfAttention(config, dim, num_heads, window_size)
        self.output = ClapAudioSelfOutput(config, dim)
        self.pruned_heads = set()

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        self_outputs = self.self(hidden_states, attention_mask, head_mask, output_attentions)
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class ClapAudioIntermediate(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class ClapAudioOutput(nn.Module):
    def __init__(self, config, dim):
        super().__init__()
        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class ClapAudioLayer(nn.Module):
    def __init__(self, config, dim, input_resolution, num_heads, drop_path_rate=0.0, shift_size=0):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.shift_size = shift_size
        self.window_size = config.window_size
        self.input_resolution = input_resolution
        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.attention = ClapAudioAttention(config, dim, num_heads, window_size=self.window_size)
        self.drop_path = ClapDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
        self.intermediate = ClapAudioIntermediate(config, dim)
        self.output = ClapAudioOutput(config, dim)

    def set_shift_and_window_size(self, input_resolution):
        if min(input_resolution) <= self.window_size:
            # if window size is larger than input resolution, we don't partition windows
            self.shift_size = torch_int(0)
            self.window_size = (
                torch.min(torch.tensor(input_resolution)) if torch.jit.is_tracing() else min(input_resolution)
            )

    def get_attn_mask(self, height, width, dtype, device):
        if self.shift_size > 0:
            # calculate attention mask for shifted window multihead self-attention
            img_mask = torch.zeros((1, height, width, 1), dtype=dtype, device=device)
            height_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            width_slices = (
                slice(0, -self.window_size),
                slice(-self.window_size, -self.shift_size),
                slice(-self.shift_size, None),
            )
            count = 0
            for height_slice in height_slices:
                for width_slice in width_slices:
                    img_mask[:, height_slice, width_slice, :] = count
                    count += 1

            mask_windows = window_partition(img_mask, self.window_size)
            mask_windows = mask_windows.view(-1, self.window_size * self.window_size)
            attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2)
            attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0))
        else:
            attn_mask = None
        return attn_mask

    def maybe_pad(self, hidden_states, height, width):
        pad_right = (self.window_size - width % self.window_size) % self.window_size
        pad_bottom = (self.window_size - height % self.window_size) % self.window_size
        pad_values = (0, 0, 0, pad_right, 0, pad_bottom)
        hidden_states = nn.functional.pad(hidden_states, pad_values)
        return hidden_states, pad_values

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if not always_partition:
            self.set_shift_and_window_size(input_dimensions)
        else:
            pass
        height, width = input_dimensions
        batch_size, _, channels = hidden_states.size()
        shortcut = hidden_states

        hidden_states = self.layernorm_before(hidden_states)

        hidden_states = hidden_states.view(batch_size, height, width, channels)

        # pad hidden_states to multiples of window size
        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)

        _, height_pad, width_pad, _ = hidden_states.shape
        # cyclic shift
        if self.shift_size > 0:
            shifted_hidden_states = torch.roll(hidden_states, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2))
        else:
            shifted_hidden_states = hidden_states

        # partition windows
        hidden_states_windows = window_partition(shifted_hidden_states, self.window_size)
        hidden_states_windows = hidden_states_windows.view(-1, self.window_size * self.window_size, channels)
        attn_mask = self.get_attn_mask(
            height_pad, width_pad, dtype=hidden_states.dtype, device=hidden_states_windows.device
        )

        attention_outputs = self.attention(
            hidden_states_windows, attn_mask, head_mask, output_attentions=output_attentions
        )

        attention_output = attention_outputs[0]

        attention_windows = attention_output.view(-1, self.window_size, self.window_size, channels)
        shifted_windows = window_reverse(attention_windows, self.window_size, height_pad, width_pad)

        # reverse cyclic shift
        if self.shift_size > 0:
            attention_windows = torch.roll(shifted_windows, shifts=(self.shift_size, self.shift_size), dims=(1, 2))
        else:
            attention_windows = shifted_windows

        was_padded = pad_values[3] > 0 or pad_values[5] > 0
        if was_padded:
            attention_windows = attention_windows[:, :height, :width, :].contiguous()

        attention_windows = attention_windows.view(batch_size, height * width, channels)

        hidden_states = shortcut + self.drop_path(attention_windows)

        layer_output = self.layernorm_after(hidden_states)
        layer_output = self.intermediate(layer_output)
        layer_output = hidden_states + self.output(layer_output)

        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
        return layer_outputs


class ClapAudioStage(GradientCheckpointingLayer):
    def __init__(self, config, dim, input_resolution, depth, num_heads, drop_path, downsample):
        super().__init__()
        self.config = config
        self.dim = dim
        self.blocks = nn.ModuleList(
            [
                ClapAudioLayer(
                    config=config,
                    dim=dim,
                    input_resolution=input_resolution,
                    num_heads=num_heads,
                    drop_path_rate=drop_path[i],
                    shift_size=0 if (i % 2 == 0) else config.window_size // 2,
                )
                for i in range(depth)
            ]
        )

        # patch merging layer
        if downsample is not None:
            self.downsample = downsample(input_resolution, dim=dim, norm_layer=nn.LayerNorm)
        else:
            self.downsample = None

        self.pointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        input_dimensions: tuple[int, int],
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        always_partition: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        height, width = input_dimensions
        for i, layer_module in enumerate(self.blocks):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(hidden_states, input_dimensions, layer_head_mask, output_attentions)

            hidden_states = layer_outputs[0]

        hidden_states_before_downsampling = hidden_states
        if self.downsample is not None:
            height_downsampled, width_downsampled = (height + 1) // 2, (width + 1) // 2
            output_dimensions = (height, width, height_downsampled, width_downsampled)
            hidden_states = self.downsample(hidden_states_before_downsampling, input_dimensions)
        else:
            output_dimensions = (height, width, height, width)

        stage_outputs = (hidden_states, hidden_states_before_downsampling, output_dimensions)

        if output_attentions:
            stage_outputs += layer_outputs[1:]
        return stage_outputs
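
# Each stage halves the spatial resolution through the patch-merging layer defined
# below, producing the usual Swin-style pyramid. With the default audio config
# (spec_size 256, patch stride 4, depths [2, 2, 6, 2]) the per-stage grids are
# 64x64 -> 32x32 -> 16x16 -> 8x8 while the channel width doubles at every merge.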

class ClapAudioPatchMerging(nn.Module):
    """
    Patch Merging Layer.

    Args:
        input_resolution (`tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    """

    def __init__(self, input_resolution: tuple[int], dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
        super().__init__()
        self.input_resolution = input_resolution
        self.dim = dim
        self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False)
        self.norm = norm_layer(4 * dim)

    def maybe_pad(self, input_feature, height, width):
        should_pad = (height % 2 == 1) or (width % 2 == 1)
        if should_pad:
            pad_values = (0, 0, 0, width % 2, 0, height % 2)
            input_feature = nn.functional.pad(input_feature, pad_values)

        return input_feature

    def forward(self, input_feature: torch.Tensor, input_dimensions: tuple[int, int]) -> torch.Tensor:
        height, width = input_dimensions
        # `dim` is height * width
        batch_size, dim, num_channels = input_feature.shape

        input_feature = input_feature.view(batch_size, height, width, num_channels)
        # pad input to be divisible by width and height, if needed
        input_feature = self.maybe_pad(input_feature, height, width)
        # [batch_size, height/2, width/2, num_channels]
        input_feature_0 = input_feature[:, 0::2, 0::2, :]
        # [batch_size, height/2, width/2, num_channels]
        input_feature_1 = input_feature[:, 1::2, 0::2, :]
        input_feature_2 = input_feature[:, 0::2, 1::2, :]
        input_feature_3 = input_feature[:, 1::2, 1::2, :]
        # [batch_size, height/2, width/2, 4*num_channels]
        input_feature = torch.cat([input_feature_0, input_feature_1, input_feature_2, input_feature_3], -1)
        input_feature = input_feature.view(batch_size, -1, 4 * num_channels)  # [batch_size, height/2 * width/2, 4*C]

        input_feature = self.norm(input_feature)
        input_feature = self.reduction(input_feature)

        return input_feature


class ClapAudioEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_layers = len(config.depths)

        self.config = config
        self.patch_embed = ClapAudioPatchEmbed(config)
        self.enable_fusion = config.enable_fusion
        self.patch_stride = self.patch_embed.patch_stride
        self.spec_size = config.spec_size
        self.freq_ratio = config.spec_size // config.num_mel_bins

        self.num_features = int(config.patch_embeds_hidden_size * 2 ** (self.num_layers - 1))

        drop_path_rate = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths))]

        grid_size = self.patch_embed.grid_size
        self.input_resolutions = [(grid_size[0] // (2**i), grid_size[1] // (2**i)) for i in range(self.num_layers)]

        self.layers = nn.ModuleList(
            [
                ClapAudioStage(
                    config=config,
                    dim=int(config.patch_embeds_hidden_size * 2**i_layer),
                    input_resolution=self.input_resolutions[i_layer],
                    depth=config.depths[i_layer],
                    num_heads=config.num_attention_heads[i_layer],
                    drop_path=drop_path_rate[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
                    downsample=ClapAudioPatchMerging if (i_layer < self.num_layers - 1) else None,
                )
                for i_layer in range(self.num_layers)
            ]
        )

        self.gradient_checkpointing = False

        self.batch_norm = nn.BatchNorm2d(config.num_mel_bins)
        self.norm = nn.LayerNorm(self.num_features)
        self.depths = config.depths
        self.avgpool = nn.AdaptiveAvgPool1d(1)
    def reshape_mel2img(self, normalized_input_features):
        """
        The input is 4 normalized log mel spectrograms. It is reshaped to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        """
        _, _, time_length, freq_length = normalized_input_features.shape

        spec_width = int(self.spec_size * self.freq_ratio)
        spec_height = self.spec_size // self.freq_ratio

        if time_length > spec_width or freq_length > spec_height:
            raise ValueError("the wav size should be less than or equal to the swin input size")

        # to avoid bicubic zero error
        if time_length < spec_width:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (spec_width, freq_length), mode="bicubic", align_corners=True
            )
        if freq_length < spec_height:
            normalized_input_features = nn.functional.interpolate(
                normalized_input_features, (time_length, spec_height), mode="bicubic", align_corners=True
            )

        batch, channels, time, freq = normalized_input_features.shape

        # batch_size, channels, spec_width, spec_height -> batch_size, channels, spec_height * freq_ratio, spec_width // freq_ratio
        normalized_input_features = normalized_input_features.reshape(
            batch, channels * self.freq_ratio, time // self.freq_ratio, freq
        )
        normalized_input_features = normalized_input_features.permute(0, 1, 3, 2).contiguous()
        normalized_input_features = normalized_input_features.reshape(
            batch, channels, freq * self.freq_ratio, time // self.freq_ratio
        )

        return normalized_input_features

    def forward(
        self,
        input_features,
        is_longer: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        output_hidden_states_before_downsampling: Optional[bool] = False,
        always_partition: Optional[bool] = False,
        return_dict: Optional[bool] = True,
    ) -> Union[tuple, ClapAudioModelOutput]:
        input_features = input_features.transpose(1, 3)
        normalized_input_features = self.batch_norm(input_features)
        normalized_input_features = normalized_input_features.transpose(1, 3)

        is_longer_list_idx = None
        if self.enable_fusion:
            is_longer_list = is_longer.to(input_features.device)
            is_longer_list_idx = torch.where(is_longer_list == 1)[0]

        hidden_states = self.reshape_mel2img(normalized_input_features)

        frames_num = hidden_states.shape[2]

        hidden_states = self.patch_embed(hidden_states, is_longer_list_idx)

        all_hidden_states = () if output_hidden_states else None
        all_reshaped_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        input_dimensions = self.input_resolutions[0]

        if output_hidden_states:
            batch_size, _, hidden_size = hidden_states.shape
            # rearrange batch_size (height width) channels -> batch_size channel height width
            reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
            reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
            all_hidden_states += (hidden_states,)
            all_reshaped_hidden_states += (reshaped_hidden_state,)

        for i, layer_module in enumerate(self.layers):
            layer_head_mask = head_mask[i] if head_mask is not None else None

            input_dimensions = self.input_resolutions[i]

            layer_outputs = layer_module(
                hidden_states, input_dimensions, layer_head_mask, output_attentions, always_partition
            )

            hidden_states = layer_outputs[0]

            hidden_states_before_downsampling = layer_outputs[1]
            output_dimensions = layer_outputs[2]

            input_dimensions = (output_dimensions[-2], output_dimensions[-1])

            if output_hidden_states and output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states_before_downsampling.shape
                # rearrange batch_size (height width) channels -> batch_size channel height width
                # here we use the original (not downsampled) height and width
                reshaped_hidden_state = hidden_states_before_downsampling.view(
                    batch_size, *(output_dimensions[0], output_dimensions[1]), hidden_size
                )
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states_before_downsampling,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)
            elif output_hidden_states and not output_hidden_states_before_downsampling:
                batch_size, _, hidden_size = hidden_states.shape
                # rearrange batch_size (height width) channels -> batch_size channel height width
                reshaped_hidden_state = hidden_states.view(batch_size, *input_dimensions, hidden_size)
                reshaped_hidden_state = reshaped_hidden_state.permute(0, 3, 1, 2)
                all_hidden_states += (hidden_states,)
                all_reshaped_hidden_states += (reshaped_hidden_state,)

            if output_attentions:
                all_self_attentions += layer_outputs[3:]

        last_hidden_state = self.norm(hidden_states)

        batch_size, _, n_channels = last_hidden_state.shape

        freq_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[0]
        temporal_shape = frames_num // (2 ** (len(self.depths) - 1)) // self.patch_stride[1]

        last_hidden_state = (
            last_hidden_state.permute(0, 2, 1).contiguous().reshape(batch_size, n_channels, freq_shape, temporal_shape)
        )

        batch_size, n_channels, n_frequencies, n_temp = last_hidden_state.shape
        # group 2D CNN
        c_freq_bin = n_frequencies // self.freq_ratio
        last_hidden_state = last_hidden_state.reshape(
            batch_size, n_channels, n_frequencies // c_freq_bin, c_freq_bin, n_temp
        )
        last_hidden_state = (
            last_hidden_state.permute(0, 1, 3, 2, 4).contiguous().reshape(batch_size, n_channels, c_freq_bin, -1)
        )
        latent_output = self.avgpool(torch.flatten(last_hidden_state, 2))
        latent_output = torch.flatten(latent_output, 1)

        if not return_dict:
            return tuple(
                v
                for v in [last_hidden_state, latent_output, all_reshaped_hidden_states, all_self_attentions]
                if v is not None
            )

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=latent_output,
            hidden_states=all_reshaped_hidden_states,
            attentions=all_self_attentions,
        )


class ClapProjectionLayer(nn.Module):
    def __init__(self, config: Union[ClapAudioConfig, ClapTextConfig]):
        super().__init__()
        self.config = config
        hidden_size = config.hidden_size
        projection_dim = config.projection_dim

        self.linear1 = nn.Linear(hidden_size, projection_dim)
        self.activation = ACT2FN[config.projection_hidden_act]
        self.linear2 = nn.Linear(projection_dim, projection_dim)

    def forward(self, hidden_states):
        hidden_states = self.linear1(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.linear2(hidden_states)
        return hidden_states

class ClapTextEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
        # any TensorFlow checkpoint file
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )
        self.register_buffer(
            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
        )

        self.padding_idx = config.pad_token_id
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
        )

    def forward(
        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if input_ids is not None:
            input_shape = input_ids.size()
        else:
            input_shape = inputs_embeds.size()[:-1]

        seq_length = input_shape[1]

        # Setting the token_type_ids to the registered buffer in the constructor where it is all zeros, which usually
        # occurs when it's auto-generated; the registered buffer helps users when tracing the model without passing
        # token_type_ids.
        if token_type_ids is None:
            if hasattr(self, "token_type_ids"):
                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        embeddings = inputs_embeds + token_type_embeddings
        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings += position_embeddings
        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)
        return embeddings
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
Nr/   r   r   r   )r   rA   rP   rF   rD   rO   r  r  )ry   r  r  sequence_lengthr  s        r(   r  9ClapTextEmbeddings.create_position_ids_from_inputs_embeds.  s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r*   )r   r   rF   r
  r  r  r  )NNNNr   )
r^   r_   r`   ra   rb   r   r   r  rf   r   r   s   @r(   r  r    s$    

4 rs&P= =r*   r  moduler   r   r   r  scalingr   r  c                    [         R                  " XR                  SS5      5      U-  n	Ub"  US S 2S S 2S S 2S UR                  S   24   n
X-   n	[        R
                  R                  U	S[         R                  S9R                  UR                  5      n	[        R
                  R                  XU R                  S9n	Ub  XR                  SSSS5      -  n	[         R                  " X5      nUR                  SS5      R                  5       nX4$ )Nr,   r	   r
  r/   )r>   r   )pr   r   )rA   r  r   r   r   rR   r  float32r  r   r   r   r1   r3   )r'  r   r   r   r  r(  r   r  kwargsattn_weightscausal_maskattn_outputs               r(   eager_attention_forwardr0  A  s     <<}}Q':;gEL!$Q1o		"o%=>#1==((2U]](SVVW\WbWbcL==((6??([L#nnQAq&AA,,|3K''1-88:K$$r*   c                      ^  \ rS rSrU 4S jr   S
S\R                  S\\R                     S\\R                     S\\	   S\
\R                     4
S jjrS	rU =r$ )ClapTextSelfAttentioni]  c                 6  > [         TU ]  5         UR                  UR                  -  S:w  a7  [	        US5      (       d&  [        SUR                   SUR                   S35      eXl        UR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        UR                   U l        U R                  S-  U l        g )Nr   embedding_sizer   r   r         )r   r   r  r   r  r   r   r@   r   r   r   r   r   r   r   r   r   r   attention_dropoutr(  r  s     r(   r   ClapTextSelfAttention.__init__^  sD    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 #)#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF!'!D!D//5r*   r"   r  r  r  rL   c                    UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
[        nU R                  R                  S:w  a  [        U R                  R                     nU" U UU	U
U4U R                  (       d  SOU R                  U R                  US.UD6u  pUR                  " / UQSP76 R                  5       nU(       a  X4nU$ U4nU$ )Nr/   r   r,   eagerr   )r   r(  r  )r   r   r   r1   r   r   r   r0  r   _attn_implementationr   r   r6  r(  r!   r3   )ry   r"   r  r  r  r,  r  r  query_states
key_statesvalue_statesattention_interfacer/  r-  r  s                  r(   r   ClapTextSelfAttention.forwards  s[    $))#2.CCbC$*B*BCzz-055lCMMaQRSXXm,11,?II!QO
zz-055lCMMaQRS(?;;++w6"9$++:Z:Z"[$7
%
  $}}C$2H2HLL
%
 
%
! "));;;;FFH1B;- JUr*   )
r   r6  r   r   r   r   r   r   r(  r   r  r  r   s   @r(   r2  r2  ]  st    60 7;15,1!||! !!2!23! E--.	!
 $D>! 
u||	! !r*   r2  c                   z   ^  \ rS rSrU 4S jrS\R                  S\R                  S\R                  4S jrSrU =r	$ )ClapTextSelfOutputi  c                 (  > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  UR                  S9U l        [        R                  " UR                  5      U l
        g Nr[  )r   r   r   r   r  r#  r   r`  r   rT  r   r  s     r(   r   ClapTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r*   r"   r&  rL   c                 p    U R                  U5      nU R                  U5      nU R                  X-   5      nU$ r   r#  r   r   r)  s      r(   r   ClapTextSelfOutput.forward  5    

=1]3}'CDr*   r   r#  r   r+  r   s   @r(   rA  rA    6    >U\\  RWR^R^  r*   rA  c                      ^  \ rS rSrU 4S jrS r   SS\R                  S\\R                     S\\R                     S\\
   S\\R                     4
S	 jjrS
rU =r$ )ClapTextAttentioni  c                    > [         TU ]  5         [        U5      U l        [	        U5      U l        [        5       U l        g r   )r   r   r2  ry   rA  r   r/  r0  r  s     r(   r   ClapTextAttention.__init__  s0    )&1	(0Er*   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g r3  r4  r6  s      r(   r9  ClapTextAttention.prune_heads  r;  r*   r"   r  r  r  rL   c                 p    U R                   " U4UUUS.UD6nU R                  US   U5      nU4USS  -   nU$ N)r  r  r  r   r   r=  )	ry   r"   r  r  r  r,  r>  r?  r  s	            r(   r   ClapTextAttention.forward  s]     yy
)/	

 
  ;;|AF#%QR(88r*   rA  r  rB  r   s   @r(   rL  rL    sy    ";* 7;15,1|| !!2!23 E--.	
 $D> 
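# `prune_heads` above removes whole attention heads from the query/key/value and
# output projections via `prune_linear_layer`. A minimal direct sketch (the config
# instance and the head indices are arbitrary illustrative values):
#
#     attention = ClapTextAttention(config)
#     attention.prune_heads([0, 1])   # drop two heads from this block
#     sorted(attention.pruned_heads)  # [0, 1]
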
class ClapTextIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        return hidden_states


class ClapTextOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)
        return hidden_states

rU =r$ )ClapTextLayeri  c                    > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        [        U5      U l        [        U5      U l	        g )Nr   )
r   r   r]  seq_len_dimrL  rb  rU  re  r\  r   r  s     r(   r   ClapTextLayer.__init__  sI    '-'E'E$*6208$V,r*   r"   r  r  r  rL   c                     U R                   " U4UUUS.UD6nUS   nUSS  n[        U R                  U R                  U R                  U5      n	U	4U-   nU$ rR  )rb  r   feed_forward_chunkr]  rd  )
ry   r"   r  r  r  r,  self_attention_outputsr?  r  r  s
             r(   r   ClapTextLayer.forward  s     "&"
)/	"

 "
 2!4(,0##T%A%A4CSCSUe
  /G+r*   c                 J    U R                  U5      nU R                  X!5      nU$ r   )re  r   )ry   r?  intermediate_outputr  s       r(   rg   ClapTextLayer.feed_forward_chunk  s)    "//0@A{{#6Ir*   )rb  r]  re  r   rd  r  )r^   r_   r`   ra   r   rA   r  r   rc   r  re   r   rg  rf   r   r   s   @r(   rb  rb    sy    - 7;15,1|| !!2!23 E--.	
 $D> 
class ClapTextEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([ClapTextLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
        output_hidden_states: Optional[bool] = False,
        return_dict: Optional[bool] = True,
        **kwargs,
    ) -> Union[tuple[torch.Tensor], BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None

            layer_outputs = layer_module(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=layer_head_mask,
                output_attentions=output_attentions,
                **kwargs,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class ClapTextPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output

Srg)	ClapPreTrainedModeli^  r   clapFr'  c                    U R                   R                  n[        U[        5      (       ac  UR                  R
                  R                  R                  SUS-  S9  UR                  R
                  R                  R                  SUS-  S9  g[        U[        5      (       a  UR                  R                  R                  [        R                  " U R                   R                  5      5        UR                  R                  R                  [        R                  " U R                   R                  5      5        g[        U[         R"                  5      (       a(  UR
                  R                  R                  SUS-  S9  g[        U[         R$                  [         R&                  45      (       aJ  UR(                  R                  R+                  5         UR
                  R                  R                  S5        g[        U[         R,                  [         R.                  45      (       a  U R                   R0                  S-  SU R                   R2                  -  S-  -  U-  n[         R4                  R                  UR
                  US9  UR(                  b%  UR(                  R                  R+                  5         gg[        U[6        5      (       a%  UR8                  R                  R+                  5         gg)	zInitialize the weightsr   g{Gz?)meanstdg      ?r5  r,   )r  N)r   initializer_factorr   r  r  weightdatanormal_r  	ClapModellogit_scale_afill_r  loglogit_scale_init_valuelogit_scale_tr   r  r   r   r   zero_r   r   r  rp  initr   r   )ry   r'  factorin_proj_stds       r(   _init_weights!ClapPreTrainedModel._init_weightsd  s
   //f011&&--22::RV:W((//44<<#6TX=<Y	**  %%++DHHT[[5W5W,XY  %%++DHHT[[5W5W,XY--MM&&CVd]&Cr~~ >??KK""$MM$$S)BII 677;;22D8a$++B_B_>_dh=hilrrKGGOOFMM{O;{{&  &&( ' 677//44::< 8r*   r]   N)r^   r_   r`   ra   r   rd   base_model_prefixsupports_gradient_checkpointingr   r  r  rf   r]   r*   r(   r  r  ^  s$    &+#=BII =r*   r  c                      ^  \ rS rSr% \\S'   SrS\4U 4S jjrS\R                  4S jr
\     SS\\R                     S\\R                     S\\   S	\\   S
\\   S\\\4   4S jj5       rSrU =r$ )ClapAudioModeli|  r   r  c                 d   > [         TU ]  U5        [        U5      U l        U R	                  5         g r   )r   r   r  audio_encoder	post_initr  s     r(   r   ClapAudioModel.__init__  s'     -f5r*   rL   c                 B    U R                   R                  R                  $ r   )r  r  r   r}   s    r(   get_input_embeddings#ClapAudioModel.get_input_embeddings  s    !!--222r*   r  r  r  r  c                     Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nU R	                  UUUUUS9$ )a  
is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
    Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
    the features.

Examples:

```python
>>> from datasets import load_dataset
>>> from transformers import AutoProcessor, ClapAudioModel

>>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
>>> audio_sample = dataset["train"]["audio"][0]["array"]

>>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
>>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

>>> inputs = processor(audios=audio_sample, return_tensors="pt")

>>> outputs = model(**inputs)
>>> last_hidden_state = outputs.last_hidden_state
```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        return self.audio_encoder(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


@auto_docstring(
    custom_intro="""
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://huggingface.co/papers/1706.03762
    """
)
class ClapTextModel(ClapPreTrainedModel):
    config: ClapTextConfig

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = ClapTextEmbeddings(config)
        self.encoder = ClapTextEncoder(config)

        self.pooler = ClapTextPooler(config) if add_pooling_layer else None

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        if token_type_ids is None:
            if hasattr(self.embeddings, "token_type_ids"):
                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
                token_type_ids = buffered_token_type_ids_expanded
            else:
                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)

        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            token_type_ids=token_type_ids,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class ClapModel(ClapPreTrainedModel):
    config: ClapConfig

    def __init__(self, config: ClapConfig):
        super().__init__(config)

        if not isinstance(config.text_config, ClapTextConfig):
            raise TypeError(
                "config.text_config is expected to be of type ClapTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.audio_config, ClapAudioConfig):
            raise TypeError(
                "config.audio_config is expected to be of type ClapAudioConfig but is of type"
                f" {type(config.audio_config)}."
            )

        text_config = config.text_config
        audio_config = config.audio_config

        self.logit_scale_a = nn.Parameter(torch.tensor(math.log(config.logit_scale_init_value)))
        self.logit_scale_t = nn.Parameter(torch.tensor(math.log(config.logit_scale_init_value)))

        self.projection_dim = config.projection_dim

        self.text_model = ClapTextModel(text_config)
        self.text_projection = ClapProjectionLayer(text_config)
        self.audio_model = ClapAudioModel(audio_config)
        self.audio_projection = ClapProjectionLayer(audio_config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        text_features = self.text_projection(pooled_output)
        text_features = F.normalize(text_features, dim=-1)

        return text_features

    def get_audio_features(
        self,
        input_features: Optional[torch.Tensor] = None,
        is_longer: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```python
        >>> from transformers import AutoFeatureExtractor, ClapModel
        >>> import torch

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))
        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> audio_features = model.get_audio_features(**inputs)
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            return_dict=return_dict,
        )
        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output

        audio_features = self.audio_projection(pooled_output)
        audio_features = F.normalize(audio_features, dim=-1)

        return audio_features

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ClapOutput]:
        r"""
        is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vacuum cleaner"]

        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        audio_embeds = audio_outputs[1] if not return_dict else audio_outputs.pooler_output
        audio_embeds = self.audio_projection(audio_embeds)

        text_embeds = text_outputs[1] if not return_dict else text_outputs.pooler_output
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        audio_embeds = audio_embeds / audio_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logit_scale_text = self.logit_scale_t.exp()
        logit_scale_audio = self.logit_scale_a.exp()
        logits_per_text = torch.matmul(text_embeds, audio_embeds.t()) * logit_scale_text
        logits_per_audio = torch.matmul(audio_embeds, text_embeds.t()) * logit_scale_audio

        loss = None
        if return_loss:
            caption_loss = contrastive_loss(logits_per_text)
            audio_loss = contrastive_loss(logits_per_audio.t())
            loss = (caption_loss + audio_loss) / 2.0

        return ClapOutput(
            loss=loss,
            logits_per_audio=logits_per_audio,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            audio_embeds=audio_embeds,
            text_model_output=text_outputs,
            audio_model_output=audio_outputs,
        )


@auto_docstring
class ClapTextModelWithProjection(ClapPreTrainedModel):
    config: ClapTextConfig

    def __init__(self, config: ClapTextConfig):
        super().__init__(config)
        self.text_model = ClapTextModel(config)
        self.text_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.text_model.embeddings.word_embeddings = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ClapTextModelOutput]:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = text_outputs[1] if not return_dict else text_outputs.pooler_output

        text_embeds = self.text_projection(pooled_output)

        return ClapTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )


@auto_docstring
class ClapAudioModelWithProjection(ClapPreTrainedModel):
    config: ClapAudioConfig
    main_input_name = "input_features"

    def __init__(self, config: ClapAudioConfig):
        super().__init__(config)
        self.audio_model = ClapAudioModel(config)
        self.audio_projection = ClapProjectionLayer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.audio_model.audio_encoder.patch_embed.proj

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_features: Optional[torch.FloatTensor] = None,
        is_longer: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ClapAudioModelOutput]:
        r"""
        is_longer (`torch.FloatTensor` of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```"""
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        audio_outputs = self.audio_model(
            input_features=input_features,
            is_longer=is_longer,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        pooled_output = audio_outputs[1] if not return_dict else audio_outputs.pooler_output

        audio_embeds = self.audio_projection(pooled_output)

        return ClapAudioModelOutput(
            audio_embeds=audio_embeds,
            last_hidden_state=audio_outputs.last_hidden_state,
            attentions=audio_outputs.attentions,
            hidden_states=audio_outputs.hidden_states,
        )

r*   r  )r  r  r  r  r  r  )r   )r   N)Urb   r   r  dataclassesr   typingr   r   r   r   rA   torch.nn.functionalr   rR   r  activationsr