ó
    <±hDÿ  ã                   óœ  • S SK r S SKrS SKJr  S SKJrJr  S SKrS SKJ	s  J
r  S SKJ	r	  S SKJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJrJrJrJrJrJr  SSKJrJr  SSK J!r!  SSK"J#r#J$r$J%r%  SSK&J'r'  SSK(J)r)  \$" 5       (       a  S SK*J+r+  S SK,J-r-  S SK.J/r/  O\0r-\%Rb                  " \25      r3 " S S\Rh                  Rj                  5      r6  STS\\Rn                     S\\8   4S jjr9 " S S\-5      r: " S S\	Rv                  5      r< " S S\	Rv                  5      r= " S S \	Rv                  5      r>S! r?SUS" jr@ SVS#S$S%\Rn                  S&\Rn                  S'\Rn                  S(\\R‚                     S)\B\8\84   S*\8S+\8S,\\C   S-\\B\Rn                  \Rn                  4   \B\Rn                     4   4S. jjrD\RŠ                  4S#S$S%\Rn                  S/\:S\Rn                  S\8S)\B\8\84   S*\8S+\8S0\RŒ                  S-\B\Rn                     4S1 jjrGS#S$S%\Rn                  S&\Rn                  S'\Rn                  S(\\R‚                     S)\B\8\84   S*\8S+\8S-\B\Rn                     4S2 jrH\G\D\HS3.rI " S4 S$\	Rv                  5      rJ " S5 S6\5      rK\# " S7 S8\!5      5       rL  STS9\Rn                  S&\Rn                  S(\\Rn                     S:\\Rn                     S-\B\Rn                  \Rn                  \Rn                  \8\\Rn                     \\Rn                     4   4
S; jjrMS9\Rn                  S<\Rn                  S=\8S>\8S-\Rn                  4
S? jrN\# " S@ SA\L5      5       rO " SB SC\	Rv                  5      rP\#" SDSE9 " SF SG\L5      5       rQ\#" SHSE9 " SI SJ\L5      5       rR\#" SKSE9 " SL SM\L5      5       rS\# " SN SO\L5      5       rT\#" SPSE9 " SQ SR\L5      5       rU/ SSQrVg)Wé    N)Únullcontext)ÚOptionalÚUnion)Únn)ÚBCEWithLogitsLossÚCrossEntropyLossÚMSELossé   )ÚACT2FN)Ú_prepare_4d_attention_mask)ÚGradientCheckpointingLayer)ÚBaseModelOutputÚMaskedLMOutputÚMultipleChoiceModelOutputÚQuestionAnsweringModelOutputÚSequenceClassifierOutputÚTokenClassifierOutput)ÚROPE_INIT_FUNCTIONSÚdynamic_rope_update)ÚPreTrainedModel)Úauto_docstringÚis_flash_attn_2_availableÚlogging)Úis_triton_availableé   )ÚModernBertConfig)Ú flash_attn_varlen_qkvpacked_func)ÚRotaryEmbedding)Úapply_rotaryc                   óh   • \ rS rSr\  SS\\R                     S\\   4S jj5       r	\S 5       r
Srg)	ÚApplyRotaryEmbUnpadé=   NÚ
cu_seqlensÚ
max_seqlenc                 óÌ   • UR                  5       nUR                  u  pgp‰US S 2S S24   R                  USU	5      n
[        U
UUSUUSSS9  U R	                  X#U5        XPl        U$ )Né   éÿÿÿÿr   FT)Úseqlen_offsetsr#   r$   ÚinterleavedÚinplace)Ú
contiguousÚshapeÚviewr   Úsave_for_backwardr$   )ÚctxÚqkvÚcosÚsinr#   r$   Ú	total_nnzÚ_threeÚ_nheadsÚheaddimÚqks              Új/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/modernbert/modeling_modernbert.pyÚforwardÚApplyRotaryEmbUnpad.forward>   sz   € ð n‰nÓˆØ.1¯i©iÑ+ˆ	˜7ð ’BQB‰Z_‰_˜Y¨¨GÓ4ˆÜØØØØØ!Ø!ØØò		
ð 	×Ñ˜c¨
Ô3Ø#ŒØˆ
ó    c                 óÞ   • U R                   u  p#nUR                  5       nUR                  u  pVpxUS S 2S S24   R                  USU5      n	[	        U	UUSUU R
                  SSSS9	  US S S S S S 4$ )Nr&   r'   r   FT)r(   r#   r$   r)   r*   Ú	conjugate)Úsaved_tensorsr+   r,   r-   r   r$   )
r/   Údor1   r2   r#   r3   r4   r5   r6   Údqks
             r8   ÚbackwardÚApplyRotaryEmbUnpad.backward]   s‰   € à"×0Ñ0Ñˆ*Ø]‰]‹_ˆØ.0¯h©hÑ+ˆ	˜7ð ’BQB‰in‰n˜Y¨¨GÓ4ˆÜØØØØØ!Ø—~‘~ØØØò
	
ð 4˜˜t T¨4°Ð5Ð5r;   © ©NN)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ústaticmethodr   ÚtorchÚTensorÚintr9   rA   Ú__static_attributes__rC   r;   r8   r!   r!   =   sQ   † Øð .2Ø$(ñð
 ˜UŸ\™\Ñ*ðð ˜S‘Môó ðð< ñ6ó ó6r;   r!   r#   r$   c                 ó0   • [         R                  XX#U5      $ )a‰  
Arguments:
    qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
    cos, sin: (seqlen_rotary, rotary_dim / 2)
    interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
        of 1st half and 2nd half (GPT-NeoX style).
    inplace: if True, apply rotary embedding in-place.
    seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
        Most commonly used in inference when we have KV cache.
    cu_seqlens: (batch + 1,) or None
    max_seqlen: int
Return:
    out: (total_nnz, dim)
rotary_dim must be <= headdim
Apply rotary embedding to the first rotary_dim of x.
)r!   Úapply)r0   r1   r2   r#   r$   s        r8   Úapply_rotary_unpaddedrP   t   s   € ô. ×$Ñ$ S¨sÀ
ÓKÐKr;   c                   ó6  ^ • \ rS rSrSr    SS\S\S\\   S\\R                     S\\R                     4
U 4S jjjr SS	\R                  S
\R                  S\\   S\\R                  \\R                  \R                  4   4   4S jjrS\4S jrSrU =r$ )Ú!ModernBertUnpaddedRotaryEmbeddingéŽ   zH
The rotary position embeddings applied directly to unpadded sequences.
ÚdimÚbaser$   ÚdeviceÚdtypec                 óh   >• [         TU ]  XUSS9  X0l        Ub  Ub  Ub  U R                  X4US9  gggg)zú
max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
    up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
    the cos_sin_cache will be recomputed during the forward pass.
F)rT   rU   rV   r)   N©rV   rW   )ÚsuperÚ__init__r$   Ú_update_cos_sin_cache)ÚselfrT   rU   r$   rV   rW   Ú	__class__s         €r8   r[   Ú*ModernBertUnpaddedRotaryEmbedding.__init__“   sM   ø€ ô 	‰Ñ˜S°FÈÐÑNØ$ŒàÑ! fÑ&8¸UÑ=NØ×&Ñ& zÈÐ&ÒNð >OÐ&8Ð!r;   r0   r#   Úreturnc                 ó–   • Ub$  U R                  X1R                  UR                  S9  [        UU R                  U R
                  UUS9nU$ )z°
Apply rotary embedding *inplace* to qkv.
qkv: (total_nnz, 3, nheads, headdim)
cu_seqlens: (batch + 1,) cumulative sequence lengths
max_seqlen: int max seq length in the batch
rY   ©r#   r$   )r\   rV   rW   rP   Ú_cos_cachedÚ_sin_cached)r]   r0   r#   r$   s       r8   r9   Ú)ModernBertUnpaddedRotaryEmbedding.forward¦   sQ   € ð Ñ!Ø×&Ñ& z¿*¹*ÈCÏIÉIÐ&ÑVä#ØØ×ÑØ×ÑØ!Ø!ñ
ˆð ˆ
r;   c                 óT   • SU R                    SU R                   SU R                   3$ )Nzdim=z, base=z, scale_base=)rT   rU   Ú
scale_base©r]   s    r8   Ú
extra_reprÚ,ModernBertUnpaddedRotaryEmbedding.extra_repr¿   s(   € Ød—h‘hZ˜w t§y¡y k°¸t¿¹Ð>OÐPÐPr;   )r$   )g     ˆÃ@NNN©N)rE   rF   rG   rH   Ú__doc__rL   Úfloatr   rJ   rV   rW   r[   rK   r   Útupler9   Ústrri   rM   Ú__classcell__©r^   s   @r8   rR   rR   Ž   sÝ   ø† ñð Ø$(Ø)-Ø'+ñOàðOð ðOð ˜S‘Mð	Oð
 ˜Ÿ™Ñ&ðOð ˜Ÿ™Ñ$÷Oð Oð. %)ñ	à\‰\ðð —L‘Lðð ˜S‘Mð	ð
 
ˆu|‰|˜U 5§<¡<°·±Ð#=Ñ>Ð>Ñ	?õð2Q˜C÷ Qò Qr;   rR   c                   óø   ^ • \ rS rSrSrS\4U 4S jjr\R                  " SS9S\R                  S\R                  4S	 j5       r SS\\R                     S
\\R                     S\R                  4S jjrSrU =r$ )ÚModernBertEmbeddingséÃ   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
Úconfigc                 ó\  >• [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  S9U l        [        R                  " UR                  UR                  UR                  S9U l        [        R                  " UR                  5      U l        g )N)Úpadding_idx©ÚepsÚbias)rZ   r[   ru   r   Ú	EmbeddingÚ
vocab_sizeÚhidden_sizeÚpad_token_idÚtok_embeddingsÚ	LayerNormÚnorm_epsÚ	norm_biasÚnormÚDropoutÚembedding_dropoutÚdrop©r]   ru   r^   s     €r8   r[   ÚModernBertEmbeddings.__init__È   su   ø€ Ü‰ÑÔØŒÜ Ÿlšl¨6×+<Ñ+<¸f×>PÑ>PÐ^d×^qÑ^qÑrˆÔÜ—L’L ×!3Ñ!3¸¿¹Èv×O_ÑO_Ñ`ˆŒ	Ü—J’J˜v×7Ñ7Ó8ˆ	r;   T©ÚdynamicÚ	input_idsr`   c                 ó`   • U R                  U R                  U R                  U5      5      5      $ rk   )r†   rƒ   r   )r]   r‹   s     r8   Úcompiled_embeddingsÚ(ModernBertEmbeddings.compiled_embeddingsÏ   s%   € ày‰y˜Ÿ™ 4×#6Ñ#6°yÓ#AÓBÓCÐCr;   Úinputs_embedsc                 ó  • Ub"  U R                  U R                  U5      5      nU$ U R                  R                  (       a  U R	                  U5      O.U R                  U R                  U R                  U5      5      5      nU$ rk   )r†   rƒ   ru   Úreference_compiler   r   )r]   r‹   r   Úhidden_statess       r8   r9   ÚModernBertEmbeddings.forwardÓ   su   € ð Ñ$Ø ŸI™I d§i¡i°Ó&>Ó?ˆMð Ðð —;‘;×0×0ð ×(Ñ(¨Ô3à—Y‘Y˜tŸy™y¨×)<Ñ)<¸YÓ)GÓHÓIð ð
 Ðr;   )ru   r†   rƒ   r   rD   )rE   rF   rG   rH   rl   r   r[   rJ   ÚcompileÚ
LongTensorrK   r   r   r9   rM   rp   rq   s   @r8   rs   rs   Ã   s’   ø† ñð9Ð/÷ 9ð ‡]‚]˜4Ñ ðD¨U×-=Ñ-=ð DÀ%Ç,Á,ó Dó !ðDð eiñØ! %×"2Ñ"2Ñ3ðØKSÐTY×T`ÑT`ÑKaðà	‰÷ó r;   rs   c                   ón   ^ • \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	ÚModernBertMLPéá   a*  Applies the GLU at the end of each ModernBERT layer.

Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
ru   c                 ó¤  >• [         TU ]  5         Xl        [        R                  " UR
                  [        UR                  5      S-  UR                  S9U l	        [        UR                     U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR
                  UR                  S9U l        g )Nr&   ©rz   )rZ   r[   ru   r   ÚLinearr}   rL   Úintermediate_sizeÚmlp_biasÚWir   Úhidden_activationÚactr„   Úmlp_dropoutr†   ÚWor‡   s     €r8   r[   ÚModernBertMLP.__init__è   s‘   ø€ Ü‰ÑÔØŒÜ—)’)˜F×.Ñ.´°F×4LÑ4LÓ0MÐPQÑ0QÐX^×XgÑXgÑhˆŒÜ˜&×2Ñ2Ñ3ˆŒÜ—J’J˜v×1Ñ1Ó2ˆŒ	Ü—)’)˜F×4Ñ4°f×6HÑ6HÈvÏÉÑ_ˆr;   r’   r`   c                 ó¨   • U R                  U5      R                  SSS9u  p#U R                  U R                  U R	                  U5      U-  5      5      $ )Nr&   r'   ©rT   )rž   Úchunkr¢   r†   r    )r]   r’   ÚinputÚgates       r8   r9   ÚModernBertMLP.forwardð   sG   € Ø—g‘g˜mÓ,×2Ñ2°1¸"Ð2Ð=‰ˆØw‰wt—y‘y §¡¨%£°4Ñ!7Ó8Ó9Ð9r;   )rž   r¢   r    ru   r†   )rE   rF   rG   rH   rl   r   r[   rJ   rK   r9   rM   rp   rq   s   @r8   r—   r—   á   s7   ø† ñð`Ð/÷ `ð: U§\¡\ð :°e·l±l÷ :ò :r;   r—   c                   ól   ^ • \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )ÚModernBertRotaryEmbeddingéõ   ru   c                 ó  >• [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )NÚrope_scalingÚ	rope_typeÚtypeÚdefaultÚinv_freqF)Ú
persistent)rZ   r[   ÚhasattrÚ
isinstancer®   ÚdictÚgetr¯   Úmax_position_embeddingsÚmax_seq_len_cachedÚoriginal_max_seq_lenru   r   Úrope_init_fnÚattention_scalingÚregister_bufferr²   Úoriginal_inv_freq)r]   ru   rV   r²   r^   s       €r8   r[   Ú"ModernBertRotaryEmbedding.__init__ö   sÏ   ø€ Ü‰ÑÔä6˜>×*Ñ*¬z¸&×:MÑ:MÌt×/TÑ/TØ#×0Ñ0×4Ñ4°[À&×BUÑBU×BYÑBYÐZ`ÓBaÓbˆDNà&ˆDŒNØ"(×"@Ñ"@ˆÔØ$*×$BÑ$BˆÔ!àŒÜ/°·±Ñ?ˆÔà+/×+<Ñ+<¸T¿[¹[È&Ó+QÑ(ˆÔ(Ø×Ñ˜Z¨¸eÐÑDØ!%§¡ˆÕr;   c                 ób  • U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   r'   r   ÚmpsÚcpuF)Údevice_typeÚenabledr&   r¥   )rW   )r²   rm   Úexpandr,   ÚtorV   rµ   r°   ro   rJ   ÚautocastÚ	transposeÚcatr1   r¼   r2   rW   )
r]   ÚxÚposition_idsÚinv_freq_expandedÚposition_ids_expandedrÃ   ÚfreqsÚembr1   r2   s
             r8   r9   Ú!ModernBertRotaryEmbedding.forward  sR  € ð !ŸM™M¨$²°4¨-Ñ8×>Ñ>Ó@×GÑGÈ×HZÑHZÐ[\ÑH]Ð_aÐcdÓe×hÑhÐij×iqÑiqÓrÐØ ,ªQ°²a¨ZÑ 8× >Ñ >Ó @Ðä'1°!·(±(·-±-Ä×'EÑ'EÈ!Ï(É(Ï-É-Ð[`ÓJ`a—h‘h—m’mÐfkˆÜ^Š^¨¸UÓCØ&×,Ñ,Ó.Ð1F×1LÑ1LÓ1NÑN×YÑYÐZ[Ð]^Ó_ˆEÜ—)’)˜U˜N°Ñ3ˆCØ—'‘'“)˜d×4Ñ4Ñ4ˆCØ—'‘'“)˜d×4Ñ4Ñ4ˆC÷	 Dð v‰v˜AŸG™GˆvÐ$ c§f¡f°1·7±7 fÐ&;Ð;Ð;÷ DÕCús   Ã$BF Æ 
F.)r¼   ru   r¹   r¾   rº   r»   r¯   rk   )rE   rF   rG   rH   r   r[   rJ   Úno_gradr   r9   rM   rp   rq   s   @r8   r«   r«   õ   s7   ø† ñ/Ð/÷ /ð /ð" ‡]‚]ƒ_Øñ<ó ó ö<r;   r«   c                 ó–   • U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..Nr'   r&   r¥   )r,   rJ   rÉ   )rÊ   Úx1Úx2s      r8   Úrotate_halfrÕ     sZ   € à	
ˆ3Ð"!—'‘'˜"‘+ Ñ"Ð"Ð"Ñ	#€BØ	
ˆ3—‘˜‘˜qÑ Ñ"Ð"Ñ	#€BÜ9Š9rc˜2Y BÑ'Ð'r;   c                 ó˜   • UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a—  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)Ú	unsqueezerÕ   )ÚqÚkr1   r2   rË   Úunsqueeze_dimÚq_embedÚk_embeds           r8   Úapply_rotary_pos_embrÝ     sS   € ð( -‰-˜Ó
&€CØ
-‰-˜Ó
&€CØ‰wœ; q›>¨CÑ/Ñ0€GØ‰wœ; q›>¨CÑ/Ñ0€GØÐÐr;   ÚmoduleÚModernBertAttentionr0   Úattention_maskÚsliding_window_maskrË   Úlocal_attentionÚbsrT   Úoutput_attentionsr`   c	                 ó°  • U R                  XS9u  p«UR                  SS5      R                  SS9u  pÍn[        XÍX«5      u  pÍU R                  S-  n[
        R                  " XÍR                  SS5      5      U-  nUS:w  a  UnUU-   n[        R                  R                  US[
        R                  S	9R                  UR                  5      n[        R                  R                  UU R                  U R                  S
9n[
        R                  " UU5      nUR                  SS5      R!                  5       nUR#                  USU5      nU(       a  UU4$ U4$ )N©rË   r
   r   r&   r¥   ç      à¿©r'   r'   r'   ©rT   rW   )ÚpÚtraining)Ú
rotary_embrÈ   ÚunbindrÝ   Úhead_dimrJ   Úmatmulr   Ú
functionalÚsoftmaxÚfloat32rÆ   rW   ÚdropoutÚattention_dropoutrë   r+   r-   )rÞ   r0   rà   rá   rË   râ   rã   rT   rä   Ú_kwargsr1   r2   ÚqueryÚkeyÚvalueÚscaleÚattn_weightsÚattn_outputs                     r8   Úeager_attention_forwardrü   9  s=  € ð × Ñ  Ð Ð@H€CØŸ™ a¨Ó+×2Ñ2°qÐ2Ð9Ñ€Eä% e°#Ó;J€EàO‰O˜TÑ!€EÜ—<’< §}¡}°Q¸Ó':Ó;¸eÑC€Là˜(Ó"Ø,ˆà .Ñ0€Lô —=‘=×(Ñ(¨¸2ÄUÇ]Á]Ð(ÐS×VÑVÐW\×WbÑWbÓc€LÜ—=‘=×(Ñ(¨¸×9QÑ9QÐ\b×\kÑ\kÐ(Ðl€LÜ—,’,˜|¨UÓ3€KØ×'Ñ'¨¨1Ó-×8Ñ8Ó:€KØ×"Ñ" 2 r¨3Ó/€KÞØ˜\Ð*Ð*Øˆ>Ðr;   rì   Útarget_dtypec	           	      óÎ  • U" XUS9nUR                   [        R                  [        R                  4;  n
U
(       ad  UR                   nUR	                  U5      n[        UUUU R                  (       a  U R                  OSU R                  US9nUR	                  U5      nO5[        UUUU R                  (       a  U R                  OSU R                  US9nUR                  Xg5      4$ )Nrb   ç        )r#   r$   Ú	dropout_pÚdeterministicÚwindow_size)
rW   rJ   Úfloat16Úbfloat16rÆ   r   rë   rô   Údeterministic_flash_attnr-   )rÞ   r0   rì   r#   r$   râ   rã   rT   rý   rõ   Úconvert_dtypeÚ
orig_dtypeÚattns                r8   Úflash_attention_forwardr	  ^  sË   € ñ S¸JÑ
G€Cà—I‘I¤e§m¡m´U·^±^Ð%DÑD€MÞð —Y‘Yˆ
Øf‰f\Ó"ˆä/ØØ!Ø!Ø28·/·/f×.Ò.ÀsØ ×9Ñ9Ø'ñ
ˆð w‰wzÓ"‰ä/ØØ!Ø!Ø28·/·/f×.Ò.ÀsØ ×9Ñ9Ø'ñ
ˆð I‰IbÓÐ Ð r;   c                 óf  • U R                  XS9u  pšUR                  SS5      R                  SS9u  p¼n[        X¼Xš5      u  p¼US:w  a  Un[        R
                  " UUUU R                  (       a  U R                  OSUS9R                  SS5      R                  5       nUR                  US	U5      nU4$ )
Nræ   r
   r   r&   r¥   rè   rÿ   )r   Ú	attn_maskr'   )
rì   rÈ   rí   rÝ   ÚFÚscaled_dot_product_attentionrë   rô   r+   r-   )rÞ   r0   rà   rá   rË   râ   rã   rT   rõ   r1   r2   rö   r÷   rø   rû   s                  r8   Úsdpa_attention_forwardr  ‰  s»   € ð × Ñ  Ð Ð@H€CØŸ™ a¨Ó+×2Ñ2°qÐ2Ð9Ñ€Eä% e°#Ó;J€Eà˜(Ó"Ø,ˆô 	
×&Ò&ØØØØ28·/·/f×.Ò.ÀsØ$ñ	
÷ 
‰1a‹ß	‰‹ð ð ×"Ñ" 2 r¨3Ó/€KØˆ>Ðr;   )Úflash_attention_2ÚeagerÚsdpac                   óŒ   ^ • \ rS rSrSrSS\S\\   4U 4S jjjr SS\	R                  S\\   S\	R                  4S	 jjrS
rU =r$ )rß   i³  an  Performs multi-headed self attention on a batch of unpadded sequences.

If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.

See `forward` method for additional details.
ru   Úlayer_idc                 óî  >• [         TU ]  5         Xl        X l        UR                  UR
                  -  S:w  a&  [        SUR                   SUR
                   S35      eUR                  U l        UR                  U l        UR
                  U l	        UR                  UR
                  -  U l
        U R                  U R                  -  U l        [        R                  " UR                  SU R                  -  UR                  S9U l        X!R                   -  S:w  aU  UR"                  S-  UR"                  S-  4U l        UR$                  b  UR$                  OUR&                  nUR"                  nOSU l        UR(                  nUR&                  nUR*                  S	:X  a  [-        U R                  XCS
9U l        O*[0        R2                  " U5      nX5l        [7        US9U l        [        R                  " UR                  UR                  UR                  S9U l        UR                  S:”  a   [        R:                  " UR                  5      O[        R<                  " 5       U l        [A        5       U l!        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads (Ú)r
   rš   r&   rè   r  )rT   r$   rU   )ru   rÿ   )"rZ   r[   ru   r  r}   Únum_attention_headsÚ
ValueErrorrô   r  Ú	num_headsrî   Úall_head_sizer   r›   Úattention_biasÚWqkvÚglobal_attn_every_n_layersrâ   Úlocal_rope_thetaÚglobal_rope_thetar¸   Ú_attn_implementationrR   rì   ÚcopyÚdeepcopyÚ
rope_thetar«   r¢   r„   ÚIdentityÚout_dropÚsetÚpruned_heads)r]   ru   r  r"  r¸   Úconfig_copyr^   s         €r8   r[   ÚModernBertAttention.__init__½  s  ø€ Ü‰ÑÔØŒØ Œà×Ñ × :Ñ :Ñ:¸aÓ?ÜØ# F×$6Ñ$6Ð#7Ð7mÐnt÷  oIñ  oIð  nJð  JKð  Lóð ð "(×!9Ñ!9ˆÔØ(.×(GÑ(GˆÔ%Ø×3Ñ3ˆŒØ×*Ñ*¨f×.HÑ.HÑHˆŒØ!Ÿ]™]¨T¯^©^Ñ;ˆÔÜ—I’I˜f×0Ñ0°!°d×6HÑ6HÑ2HÈv×OdÑOdÑeˆŒ	à×7Ñ7Ñ7¸1Ó<Ø$*×$:Ñ$:¸aÑ$?À×AWÑAWÐ[\ÑA\Ð#]ˆDÔ Ø4:×4KÑ4KÑ4W˜×0Ò0Ð]c×]uÑ]uˆJØ&,×&<Ñ&<Ñ#à#+ˆDÔ Ø&,×&DÑ&DÐ#Ø×1Ñ1ˆJà×&Ñ&Ð*=Ó=Ü?Ø—M‘MÐ.EñˆDOô Ÿ-š-¨Ó/ˆKØ%/Ô"Ü7¸{ÑKˆDŒOä—)’)˜F×.Ñ.°×0BÑ0BÈ×I^ÑI^Ñ_ˆŒØ@F×@XÑ@XÐ[^Ó@^œŸ
š
 6×#;Ñ#;Ô<Ôdf×doÒdoÓdqˆŒÜ›EˆÕr;   r’   rä   r`   c           
      ó  • U R                  U5      nUR                  S   nU R                  R                  S:X  a)  UR	                  SSU R
                  U R                  5      nO)UR	                  USSU R
                  U R                  5      n[        U R                  R                     " U 4UU R                  U R                  UU R                  US.UD6nUS   nU R                  U R                  U5      5      nU4USS  -   $ )Nr   r  r'   r
   )r0   rì   râ   rã   rT   rä   r   )r  r,   ru   r  r-   r  rî   ÚMODERNBERT_ATTENTION_FUNCTIONrì   râ   r  r$  r¢   )r]   r’   rä   Úkwargsr0   rã   Úattn_outputss          r8   r9   ÚModernBertAttention.forwardä  só   € ð i‰i˜Ó&ˆà× Ñ  Ñ#ˆØ;‰;×+Ñ+Ð/BÓBØ—(‘(˜2˜q $§.¡.°$·-±-Ó@‰Cà—(‘(˜2˜r 1 d§n¡n°d·m±mÓDˆCä4°T·[±[×5UÑ5UÒVØð	
àØ—‘Ø ×0Ñ0ØØ×"Ñ"Ø/ñ	
ð ñ	
ˆð % Q™ˆØŸ™ d§g¡g¨mÓ&<Ó=ˆàÐ ,¨q¨rÐ"2Ñ2Ð2r;   )r¢   r  r  rô   ru   r  rî   r  râ   r  r$  r&  rì   rk   ©F)rE   rF   rG   rH   rl   r   r   rL   r[   rJ   rK   Úboolr9   rM   rp   rq   s   @r8   rß   rß   ³  s]   ø† ññ%"Ð/ð %"¸8ÀC¹=÷ %"ð %"ðT -2ñ3à—|‘|ð3ð $ D™>ð3ð
 
‰÷3ó 3r;   c                   ót  ^ • \ rS rSrSS\S\\   4U 4S jjjr\R                  " SS9S\R                  S\R                  4S	 j5       r      SS\R                  S
\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\R                  4S jjrSrU =r$ )ÚModernBertEncoderLayeri  ru   r  c                 ó  >• [         TU ]  5         Xl        US:X  a  [        R                  " 5       U l        O9[        R                  " UR                  UR                  UR                  S9U l        [        XS9U l        [        R                  " UR                  UR                  UR                  S9U l        [        U5      U l        g )Nr   rx   )ru   r  )rZ   r[   ru   r   r#  Ú	attn_normr€   r}   r   r‚   rß   r  Úmlp_normr—   Úmlp©r]   ru   r  r^   s      €r8   r[   ÚModernBertEncoderLayer.__init__  s‰   ø€ Ü‰ÑÔØŒØq‹=ÜŸ[š[›]ˆDNäŸ\š\¨&×*<Ñ*<À&Ç/Á/ÐX^×XhÑXhÑiˆDŒNÜ'¨vÑIˆŒ	ÜŸš V×%7Ñ%7¸V¿_¹_ÐSY×ScÑScÑdˆŒÜ  Ó(ˆr;   Tr‰   r’   r`   c                 óB   • U R                  U R                  U5      5      $ rk   )r5  r4  ©r]   r’   s     r8   Úcompiled_mlpÚ#ModernBertEncoderLayer.compiled_mlp  s   € àx‰x˜Ÿ™ mÓ4Ó5Ð5r;   rà   rá   rË   r#   r$   rä   c           
      ó
  • U R                  U R                  U5      UUUUUUS9nXS   -   nU R                  R                  (       a  U R	                  U5      OU R                  U R                  U5      5      n	X-   nU4USS  -   $ )N©rà   rá   rË   r#   r$   rä   r   r   )r  r3  ru   r‘   r:  r5  r4  )
r]   r’   rà   rá   rË   r#   r$   rä   r,  Ú
mlp_outputs
             r8   r9   ÚModernBertEncoderLayer.forward  s›   € ð —y‘yØN‰N˜=Ó)Ø)Ø 3Ø%Ø!Ø!Ø/ð !ð 
ˆð &°Q©Ñ7ˆð {‰{×,×,ð ×Ñ˜mÔ,à—‘˜$Ÿ-™-¨Ó6Ó7ð 	ð
 &Ñ2ˆàÐ ,¨q¨rÐ"2Ñ2Ð2r;   )r  r3  ru   r5  r4  rk   )NNNNNF)rE   rF   rG   rH   r   r   rL   r[   rJ   r”   rK   r:  r•   r/  r9   rM   rp   rq   s   @r8   r1  r1    sõ   ø† ñ	)Ð/ð 	)¸8ÀC¹=÷ 	)ð 	)ð ‡]‚]˜4Ñ ð6¨%¯,©,ð 6¸5¿<¹<ó 6ó !ð6ð 26Ø6:Ø37Ø-1Ø$(Ø,1ñ3à—|‘|ð3ð ! §¡Ñ.ð3ð & e§l¡lÑ3ð	3ð
 ˜u×/Ñ/Ñ0ð3ð ˜UŸ\™\Ñ*ð3ð ˜S‘Mð3ð $ D™>ð3ð 
‰÷3ó 3r;   r1  c                   ó    ^ • \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrS\R                  4S	 jr SS
\\   S\S\4U 4S jjjrS rU 4S jrSrU =r$ )ÚModernBertPreTrainedModeli0  ru   ÚmodelTrs   r1  FrÞ   c                 óþ  ^• U R                   R                  mTc  SmS[        R                  S[        4U4S jjnU R                   R
                  U R                   R
                  [        R                  " SU R                   R                  -  5      -  U R                   R
                  U R                   R                  S-  S.n[        U[        5      (       a  U" UR                  US   5        g [        U[        5      (       a-  U" UR                  US	   5        U" UR                  US
   5        g [        U[         5      (       a-  U" UR"                  US	   5        U" UR                  US
   5        g [        U[$        5      (       a  U" UR&                  US
   5        g [        U[(        5      (       a  U" UR*                  US
   5        g [        U[,        [.        [0        [2        45      (       a  U" UR4                  US   5        g [        U[        R6                  5      (       aX  UR8                  R:                  R=                  S5        UR>                  b%  UR>                  R:                  RA                  5         g g g )Nr
   rÞ   Ústdc                 ó  >• [         R                  R                  U R                  SUT* U-  TU-  S9  [	        U [         R
                  5      (       a8  U R                  b*  [         R                  R                  U R                  5        g g g )Nrÿ   )ÚmeanrD  ÚaÚb)r   ÚinitÚtrunc_normal_Úweightrµ   r›   rz   Úzeros_)rÞ   rD  Úcutoff_factors     €r8   Úinit_weightÚ<ModernBertPreTrainedModel._init_weights.<locals>.init_weight?  st   ø€ ÜG‰G×!Ñ!Ø—‘ØØØ . 3Ñ&Ø #Ñ%ð "ñ ô ˜&¤"§)¡)×,Ñ,Ø—;‘;Ñ*Ü—G‘G—N‘N 6§;¡;Õ/ð +ð -r;   g       @rç   )ÚinÚoutÚ	embeddingÚ	final_outrR  rP  rQ  rS  g      ð?)!ru   Úinitializer_cutoff_factorr   ÚModulerm   Úinitializer_rangeÚmathÚsqrtÚnum_hidden_layersr}   rµ   rs   r   r—   rž   r¢   rß   r  ÚModernBertPredictionHeadÚdenseÚModernBertForMaskedLMÚdecoderÚ#ModernBertForSequenceClassificationÚModernBertForMultipleChoiceÚ ModernBertForTokenClassificationÚModernBertForQuestionAnsweringÚ
classifierr€   rK  ÚdataÚfill_rz   Úzero_)r]   rÞ   rN  ÚstdsrM  s       @r8   Ú_init_weightsÚ'ModernBertPreTrainedModel._init_weights:  sæ  ø€ ØŸ™×=Ñ=ˆØÑ ØˆMð	0¤§	¡	ð 	0´÷ 	0ð —+‘+×/Ñ/Ø—;‘;×0Ñ0´4·9²9¸SÀ4Ç;Á;×C`ÑC`Ñ=`Ó3aÑaØŸ™×6Ñ6ØŸ™×0Ñ0°$Ñ6ñ	
ˆô fÔ2×3Ñ3Ù˜×-Ñ-¨t°KÑ/@ÕAÜ˜¤×.Ñ.Ù˜Ÿ	™	 4¨¡:Ô.Ù˜Ÿ	™	 4¨¡;Õ/Ü˜Ô 3×4Ñ4Ù˜Ÿ™ T¨$¡ZÔ0Ù˜Ÿ	™	 4¨¡;Õ/Ü˜Ô 8×9Ñ9Ù˜Ÿ™ d¨5¡kÕ2Ü˜Ô 5×6Ñ6Ù˜Ÿ™¨¨U©Õ4ÜØä3Ü+Ü0Ü.ð	÷
ñ 
ñ ˜×)Ñ)¨4°Ñ+<Õ=Ü˜¤§¡×-Ñ-ØM‰M×Ñ×$Ñ$ SÔ)Ø{‰{Ñ&Ø—‘× Ñ ×&Ñ&Õ(ð 'ð .r;   Úattn_implementationÚis_init_checkr`   c                 ó„   >•  Uc  U R                  5       (       a  SOUn[        TU ]  XS9$ ! [        [        4 a     Nf = f)zB
Checks and dispatches to hhe requested attention implementation.
r  )ri  rj  )Ú_flash_attn_2_can_dispatchr  ÚImportErrorrZ   Ú%_check_and_adjust_attn_implementation)r]   ri  rj  r^   s      €r8   rn  Ú?ModernBertPreTrainedModel._check_and_adjust_attn_implementationn  s`   ø€ ð	ð 'Ñ.°4×3RÑ3R×3TÑ3Tñ $à(ð  ô ‰wÑ<Ø 3ð =ð 
ð 	
øô œKÐ(ó 	Ùð	ús   ƒ, ¬?¾?c                 óÜ  • U R                   R                  SL a  g [        U S5      (       aZ  [        U R                  5      S:”  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                  R                  S:X  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                  R                  S:X  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                   R                  c  [        5       U R                   l        g g )	NFÚhf_device_mapr   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.rÁ   z|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.rÂ   z|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
ru   r‘   r´   Úlenrq  ÚloggerÚwarning_oncerV   r°   r   rh   s    r8   Ú_maybe_set_compileÚ,ModernBertPreTrainedModel._maybe_set_compile…  s  € Ø;‰;×(Ñ(¨EÒ1Øä4˜×)Ñ)¬c°$×2DÑ2DÓ.EÈÓ.IØ{‰{×,×,Ü×#Ñ#ð9ôð -2ˆDK‰KÔ)à;‰;×Ñ˜uÓ$Ø{‰{×,×,Ü×#Ñ#ð9ôð -2ˆDK‰KÔ)à;‰;×Ñ˜uÓ$Ø{‰{×,×,Ü×#Ñ#ð9ôð -2ˆDK‰KÔ)à;‰;×(Ñ(Ñ0Ü,?Ó,AˆDK‰KÕ)ð 1r;   c                 óÞ   >• [         TU ]  " U0 UD6nU R                  R                  S;   aA  U R                  R                  (       a  [        R                  S5        SU R                  l        U$ )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)rZ   Úresize_token_embeddingsru   r‘   rs  rt  )r]   Úargsr+  Úmodel_embedsr^   s       €r8   rx  Ú1ModernBertPreTrainedModel.resize_token_embeddings¤  s[   ø€ Ü‘wÒ6¸ÐGÀÑGˆà;‰;×(Ñ(¨LÓ8Ø{‰{×,×,Ü×#Ñ#Øyôð -2ˆDK‰KÔ)àÐr;   rC   r.  )rE   rF   rG   rH   r   Ú__annotations__Úbase_model_prefixÚsupports_gradient_checkpointingÚ_no_split_modulesÚ_supports_flash_attnÚ_supports_sdpaÚ_supports_flex_attnr   rU  rg  r   ro   r/  rn  ru  rx  rM   rp   rq   s   @r8   rA  rA  0  s€   ø‡ àÓØÐØ&*Ð#Ø/Ð1IÐJÐØÐØ€NØÐð2) B§I¡Iô 2)ðj INñ
Ø#+¨C¡=ð
ØAEð
à	÷
ð 
ò.B÷>
ó 
r;   rA  ÚinputsÚlabelsc                 ó  • UR                  S[        R                  S9n[        R                  " UR	                  5       SS9R	                  5       n[        UR                  5       R                  5       5      n[        R                  R                  R                  [        R                  " US[        R                  S9S5      nU R                  5       S:X  a  U R	                  5       U   nO(U R                  tpšnXš-  nU R                  " U/UQ76 U   nUb  UR	                  5       U   OSnUb  UR	                  5       U   OSnX…XvXÞ4$ )	aP  
Remove padding from input sequences.

Args:
    inputs: (batch, seqlen, ...) or (batch, seqlen)
    attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    position_ids: (batch, seqlen), int, position ids
    labels: (batch, seqlen), int, labels

Returns:
    unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
    indices: (total_nnz)
    cu_seqlens: (batch + 1), the cumulative sequence lengths
    max_seqlen_in_batch: int
    unpadded_position_ids: (total_nnz) or None
    unpadded_labels: (total_nnz) or None
r'   ré   F)Úas_tupler   )r   r   r&   N)ÚsumrJ   Úint32ÚnonzeroÚflattenrL   ÚmaxÚitemr   rð   ÚpadÚcumsumrT   r,   r-   )rƒ  rà   rË   r„  Úseqlens_in_batchÚindicesÚmax_seqlen_in_batchr#   Úunpadded_inputsÚbatchÚseqlenÚrestr,   Úunpadded_position_idsÚunpadded_labelss                  r8   Ú_unpad_modernbert_inputr˜  ±  s  € ð. &×)Ñ)¨b¼¿¹Ð)ÐDÐÜmŠm˜N×2Ñ2Ó4¸uÑE×MÑMÓO€GÜÐ.×2Ñ2Ó4×9Ñ9Ó;Ó<ÐÜ—‘×$Ñ$×(Ñ(¬¯ªÐ6FÈAÔUZ×U`ÑU`Ñ)aÐciÓj€Jà‡zzƒ|qÓØ Ÿ.™.Ó*¨7Ñ3‰à%Ÿ|™|Ðˆ˜Ø‘ˆØ Ÿ+š+ eÐ3¨dÒ3°GÑ<ˆà?KÑ?W˜L×0Ñ0Ó2°7Ò;Ð]aÐØ39Ñ3Ef—n‘nÓ& wÒ/È4€Oà ZÐF[ÐlÐlr;   r  r“  r”  c                 ó^  • U R                  5       S:X  aC  [        R                  " X#-  U R                  U R                  S9nXU'   UR                  X#5      nU$ U R                  tpg[        R                  " X#-  /UQ7U R                  U R                  S.6nXU'   UR
                  " X#/UQ76 nU$ )a-  
Add padding to sequences.

Args:
    inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
    indices: (total_nnz)
    batch: int, batch size
    seqlen: int, max sequence length

Returns:
    padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
r   )rW   rV   )rT   rJ   ÚzerosrW   rV   r-   r,   )rƒ  r  r“  r”  ÚoutputÚpadded_inputsÚ_r•  s           r8   Ú_pad_modernbert_outputrž  Ú  sœ   € ð$ ‡zzƒ|qÓÜ—’˜U™^°6·<±<ÈÏÉÑVˆØ ˆw‰ØŸ™ EÓ2ˆð Ðð —<‘<ˆˆÜ—’˜U™^Ð]¨dÑ]¸&¿,¹,ÈvÏ}É}Ò]ˆØ ˆw‰ØŸš EÐ9°DÒ9ˆàÐr;   c            !       óæ  ^ • \ rS rSrS\4U 4S jjrS rS r\             SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\   S\	\   S\\\
R                  S4   \4   4S jj5       rS\
R                  S\S\
R                  4S jrSrU =r$ )ÚModernBertModeliù  ru   c           	      óˆ  >• [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l
        [
        R                  " UR                  UR                  UR                  S9U l        SU l        U R#                  5         g s  snf )Nrx   F)rZ   r[   ru   rs   Ú
embeddingsr   Ú
ModuleListÚrangerY  r1  Úlayersr€   r}   r   r‚   Ú
final_normÚgradient_checkpointingÚ	post_initr6  s      €r8   r[   ÚModernBertModel.__init__û  s•   ø€ Ü‰Ñ˜Ô ØŒÜ.¨vÓ6ˆŒÜ—m’mÜFKÈF×LdÑLdÔFeÓfÑFe¸(Ô# FÖ5ÑFeÑfó
ˆŒô Ÿ,š, v×'9Ñ'9¸v¿¹ÐU[×UeÑUeÑfˆŒØ&+ˆÔ#Ø‰Õùò	 gs   ÁB?c                 ó.   • U R                   R                  $ rk   ©r¢  r   rh   s    r8   Úget_input_embeddingsÚ$ModernBertModel.get_input_embeddings  s   € Ø‰×-Ñ-Ð-r;   c                 ó$   • XR                   l        g rk   r«  )r]   rø   s     r8   Úset_input_embeddingsÚ$ModernBertModel.set_input_embeddings	  s   € Ø).‰Õ&r;   r‹   rà   rá   rË   r   r  r#   r$   Ú
batch_sizeÚseq_lenrä   Úoutput_hidden_statesÚreturn_dictr`   .c                 ó6  ^^	^
• Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [	        S5      eU(       a  SOSnU(       a  SOSnU R                  5         Ub  U R                  X5        T	c+  T
c(  Ub  UR                  SS u  m	m
OUR                  SS u  m	m
Ub  UR                  OUR                  nUc&  [        R                  " T	T
4U[        R                  S9nSnU R                   R                  S:X  aH  TcD  UcA  Uc>  SnUc,  [        R                  " 5          [        XS	9tnmpxnSSS5        OF[        XRS	9tnmpxnO8Uc$  [        R                  " T
US
9R!                  S5      nU R#                  X+S9u  p#U R%                  XS9nU R&                   HD  nU(       a  UU4-   nU" UUUUUUUS9nUS   nU(       d  M*  [)        U5      S:”  d  M;  UUS   4-   nMF     U(       a  UU4-   nU R+                  U5      nU(       a&  [-        UTT	T
S9nUb  [/        U	UU
4S jU 5       5      nU(       d  [/        S UXï4 5       5      $ [1        UUUS9$ ! , (       d  f       Nâ= f)áF  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nz:You must specify exactly one of input_ids or inputs_embedsrC   r&   rY   Fr  T)rƒ  rà   )rV   r   )rä   )r‹   r   r=  r   ©rƒ  r  r“  r”  c              3   ó:   >#   • U H  n[        UTTTS 9v •  M     g7f)r·  N)rž  )Ú.0Úhsr±  r  r²  s     €€€r8   Ú	<genexpr>Ú*ModernBertModel.forward.<locals>.<genexpr>z  s$   øé € ð *á/˜ô +°"¸gÈZÐ`gÖhÚ/ùs   ƒc              3   ó,   #   • U H  oc  M  Uv •  M     g 7frk   rC   )r¹  Úvs     r8   r»  r¼  €  s   é € ÐmÑ$[˜qŸ™Ò$[ùs   ‚‹	)Úlast_hidden_stater’   Ú
attentions)ru   rä   r³  Úuse_return_dictr  ru  Ú%warn_if_padding_and_no_attention_maskr,   rV   rJ   Úonesr/  r  rÑ   r˜  Úaranger×   Ú_update_attention_maskr¢  r¥  rr  r¦  rž  rn   r   )r]   r‹   rà   rá   rË   r   r  r#   r$   r±  r²  rä   r³  r´  Úall_hidden_statesÚall_self_attentionsrV   Úrepadr  r’   Úencoder_layerÚlayer_outputss         `  ``           r8   r9   ÚModernBertModel.forward  sæ  ú€ ðB 2CÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆà˜Ð -°tÐ";×<ÜÐYÓZÐZæ"6™B¸DÐÞ$5™b¸4Ðà×ÑÔ!àÑ Ø×6Ñ6°yÔQàÑ '¡/ØÑ(Ø&3×&9Ñ&9¸"¸1Ð&=Ñ#
™Gà&/§o¡o°b°qÐ&9Ñ#
˜GØ%.Ñ%:×!Ò!À×@TÑ@TˆàÑ!Ü"ŸZšZ¨°WÐ(=ÀfÔTY×T^ÑT^Ñ_ˆNàˆØ;‰;×+Ñ+Ð/BÓBØ‰ :Ñ#5¸*Ñ:LØØ Ñ(ÜŸšÜI`Ø#,ñJÐF˜	 7¨JÀQ÷ )˜ô
 JaØ,ñJÐFM 7¨JÀQøð Ñ#Ü$Ÿ|š|¨G¸FÑC×MÑMÈaÓPà26×2MÑ2MØð 3Nð 3Ñ/ˆNð Ÿ™°)˜ÐYˆà!Ÿ[œ[ˆMÞ#Ø$5¸Ð8HÑ$HÐ!á)ØØ-Ø$7Ø)Ø%Ø%Ø"3ñˆMð *¨!Ñ,ˆMß Ð ¤S¨Ó%7¸!Õ%;Ø&9¸]È1Ñ=MÐ<OÑ&OÒ#ñ )ö"  Ø 1°]Ð4DÑ DÐàŸ™¨Ó6ˆæÜ2Ø$¨g¸ZÐPWñˆMð !Ñ,Ü$)ö *á/ó*ó %Ð!ö
 ÜÑm ]Ð4EÑ$[ÓmÓmÐmÜØ+Ø+Ø*ñ
ð 	
÷i )ús   ÅJ
Ê

Jc                 ó,  • U(       aˆ  U R                   R                  S:X  a'  [        R                  S5        SU R                   l        OGU R                   R                  S:w  a-  [        R                  SU R                   R                   S35        [	        XR
                  5      n[        R                  " UR                  S   5      R                  S5      n[        R                  " XDR                  -
  5      nXPR                   R                  S-  :*  R                  S5      R                  S5      R                  UR                  5      nUR                  UR!                  5       [        R"                  " U R
                  5      R$                  5      nX74$ )Nr  z’Outputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r  zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r&   r   )ru   r  rs  rt  r   rW   rJ   rÄ  r,   r×   ÚabsÚTrâ   rÆ   rV   Úmasked_fillÚlogical_notÚfinfoÚmin)r]   rà   rä   Úglobal_attention_maskÚrowsÚdistanceÚwindow_maskrá   s           r8   rÅ  Ú&ModernBertModel._update_attention_mask‡  sJ  € ÞØ{‰{×/Ñ/°6Ó9Ü×#Ñ#ðVôð 4;—‘Õ0Ø—‘×1Ñ1°WÓ<Ü×#Ñ#ð Ø $§¡× @Ñ @ÐAð B:ð:ôô !;¸>Ï:É:Ó VÐô |Š|Ð1×7Ñ7¸Ñ:Ó;×EÑEÀaÓHˆä—9’9˜T§F¡F™]Ó+ˆð Ÿ™×4Ñ4¸Ñ9Ñ9×DÑDÀQÓG×QÑQÐRSÓT×WÑWÐXf×XmÑXmÓnð 	ð 4×?Ñ?À×@WÑ@WÓ@YÔ[`×[fÒ[fÐgk×gqÑgqÓ[r×[vÑ[vÓwÐà$Ð9Ð9r;   )ru   r¢  r¦  r§  r¥  ©NNNNNNNNNNNNN)rE   rF   rG   rH   r   r[   r¬  r¯  r   r   rJ   r•   rK   rL   r/  r   rn   r   r9   rÅ  rM   rp   rq   s   @r8   r   r   ù  s  ø† ð	Ð/÷ 	ò.ò/ð ð 15Ø15Ø6:Ø37Ø04Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñx
à˜E×,Ñ,Ñ-ðx
ð ! §¡Ñ.ðx
ð & e§l¡lÑ3ð	x
ð
 ˜u×/Ñ/Ñ0ðx
ð   §¡Ñ-ðx
ð ˜%Ÿ,™,Ñ'ðx
ð ˜UŸ\™\Ñ*ðx
ð ˜S‘Mðx
ð ˜S‘Mðx
ð ˜#‘ðx
ð $ D™>ðx
ð ' t™nðx
ð ˜d‘^ðx
ð 
ˆuU—\‘\ 3Ð&Ñ'¨Ð8Ñ	9ôx
ó ðx
ðt:°U·\±\ð :ÐVZð :Ð_d×_kÑ_k÷ :ò :r;   r   c                   ój   ^ • \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )rZ  i§  ru   c                 óF  >• [         TU ]  5         Xl        [        R                  " UR
                  UR
                  UR                  5      U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        g )Nrx   )rZ   r[   ru   r   r›   r}   Úclassifier_biasr[  r   Úclassifier_activationr    r€   r   r‚   rƒ   r‡   s     €r8   r[   Ú!ModernBertPredictionHead.__init__¨  so   ø€ Ü‰ÑÔØŒÜ—Y’Y˜v×1Ñ1°6×3EÑ3EÀv×G]ÑG]Ó^ˆŒ
Ü˜&×6Ñ6Ñ7ˆŒÜ—L’L ×!3Ñ!3¸¿¹Èv×O_ÑO_Ñ`ˆ	r;   r’   r`   c                 ó`   • U R                  U R                  U R                  U5      5      5      $ rk   )rƒ   r    r[  r9  s     r8   r9   Ú ModernBertPredictionHead.forward¯  s#   € Øy‰y˜Ÿ™ $§*¡*¨]Ó";Ó<Ó=Ð=r;   )r    ru   r[  rƒ   )rE   rF   rG   rH   r   r[   rJ   rK   r9   rM   rp   rq   s   @r8   rZ  rZ  §  s2   ø† ðaÐ/÷ að> U§\¡\ð >°e·l±l÷ >ò >r;   rZ  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )Úcustom_introc            "       óF  ^ • \ rS rSrS/rS\4U 4S jjrS rS\R                  4S jr
\R                  " SS	9S
\R                  S\R                  4S j5       r\              SS\\R"                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )r\  i³  zdecoder.weightru   c                 ón  >• [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  UR                  S9U l        U R                  R                  U l        U R                  R                  U l        U R                  5         g )Nrš   )rZ   r[   ru   r   rB  rZ  Úheadr   r›   r}   r|   Údecoder_biasr]  Úsparse_predictionÚsparse_pred_ignore_indexr¨  r‡   s     €r8   r[   ÚModernBertForMaskedLM.__init__»  sƒ   ø€ Ü‰Ñ˜Ô ØŒÜ$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—y’y ×!3Ñ!3°V×5FÑ5FÈV×M`ÑM`ÑaˆŒà!%§¡×!>Ñ!>ˆÔØ(,¯©×(LÑ(LˆÔ%ð 	‰Õr;   c                 ó   • U R                   $ rk   ©r]  rh   s    r8   Úget_output_embeddingsÚ+ModernBertForMaskedLM.get_output_embeddingsÈ  s   € Ø|‰|Ðr;   Únew_embeddingsc                 ó   • Xl         g rk   ré  )r]   rì  s     r8   Úset_output_embeddingsÚ+ModernBertForMaskedLM.set_output_embeddingsË  s   € Ø%r;   Tr‰   r›  r`   c                 óB   • U R                  U R                  U5      5      $ rk   )r]  rã  )r]   r›  s     r8   Úcompiled_headÚ#ModernBertForMaskedLM.compiled_headÎ  s   € à|‰|˜DŸI™I fÓ-Ó.Ð.r;   r‹   rà   rá   rË   r   r„  r  r#   r$   r±  r²  rä   r³  r´  c                 ól  • Ub  UOU R                   R                  nU R                  5         U R                   R                  S:X  a¶  Uc³  Uc°  U	c­  U
c)  Uc&  Ub  UR                  SS u  p«OUR                  SS u  p«Ub  UR
                  OUR
                  nUc%  [        R                  " X«4U[        R                  S9nUc-  [        R                  " 5          [        XXFS9u  pp‰pFSSS5        O[        XRXFS9u  pWp‰pFU R                  UUUUUUUU	U
UUUUS9nUS   nU R                  (       aK  UbH  UR                  S5      nUR                  UR                  S   S5      nX`R                  :g  nUU   nUU   nU R                   R                  (       a  U R!                  U5      OU R#                  U R%                  U5      5      nSnUb*  U R&                  " UU4S	U R                   R(                  0UD6nU R                   R                  S:X  aQ  U R                   R*                  (       d  Uc
  [-        5       O[        R                  " 5          [/        UXzUS
9nSSS5        U(       d  U4nUb  U4U-   $ U$ [1        UUUR2                  UR4                  S9$ ! , (       d  f       GN¦= f! , (       d  f       NU= f)r¶  Nr  r&   rY   )rƒ  rà   rË   r„  ©r‹   rà   rá   rË   r   r  r#   r$   r±  r²  rä   r³  r´  r   r'   r|   r·  ©ÚlossÚlogitsr’   rÀ  )ru   rÁ  ru  r  r,   rV   rJ   rÃ  r/  rÑ   r˜  rB  rå  r-   ræ  r‘   rñ  r]  rã  Úloss_functionr|   Úrepad_logits_with_gradr   rž  r   r’   rÀ  )r]   r‹   rà   rá   rË   r   r„  r  r#   r$   r±  r²  rä   r³  r´  r+  rV   Úoutputsr¿  Úmask_tokensr÷  rö  r›  s                          r8   r9   ÚModernBertForMaskedLM.forwardÒ  sÁ  € ðF &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ×ÑÔ!à;‰;×+Ñ+Ð/BÓBØ‰ :Ñ#5¸*Ñ:LØÑ%¨'©/Ø$Ñ0Ø.;×.AÑ.AÀ"À1Ð.EÑ+˜
 Gà.7¯o©o¸b¸qÐ.AÑ+˜
Ø-6Ñ-B˜×)Ò)È×H\ÑH\à!Ñ)Ü%*§Z¢Z°Ð0EÈfÔ\a×\fÑ\fÑ%gNà Ñ(ÜŸšÜ[rØ#,ÐZfñ\ÑX˜	¨JÀL÷ )˜ô
 \sØ,ÐZfñ\ÑXM¨JÀLð —*‘*ØØ)Ø 3Ø%Ø'ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà×!×! fÑ&8à—[‘[ “_ˆFØ 1× 6Ñ 6°v·|±|ÀA±ÈÓ KÐð !×$AÑ$AÑAˆKØ 1°+Ñ >ÐØ˜KÑ(ˆFð {‰{×,×,ð ×ÑÐ0Ô1à—‘˜dŸi™iÐ(9Ó:Ó;ð 	ð ˆØÑØ×%Ò% f¨fÑbÀÇÁ×AWÑAWÐbÐ[aÑbˆDà;‰;×+Ñ+Ð/BÓBØ"&§+¡+×"D×"DÈÉ””Ô\a×\iÒ\iÓ\kÑkÜ/°vÀwÐipÑq÷ lö ØYˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEäØØØ!×/Ñ/Ø×)Ñ)ñ	
ð 	
÷m )žú÷^ lÕkús   ÃJÉ
J%Ê
J"Ê%
J3)ru   r]  rã  rB  ræ  rå  ©NNNNNNNNNNNNNN)rE   rF   rG   rH   Ú_tied_weights_keysr   r[   rê  r   r›   rî  rJ   r”   rK   rñ  r   r   r•   rL   r/  r   rn   r   r9   rM   rp   rq   s   @r8   r\  r\  ³  s½  ø† ð +Ð+ÐðÐ/÷ òð&°B·I±Iô &ð ‡]‚]˜4Ñ ð/ E§L¡Lð /°U·\±\ó /ó !ð/ð ð 15Ø15Ø6:Ø/3Ø04Ø)-Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñm
à˜E×,Ñ,Ñ-ðm
ð ! §¡Ñ.ðm
ð & e§l¡lÑ3ð	m
ð
 ˜uŸ|™|Ñ,ðm
ð   §¡Ñ-ðm
ð ˜Ÿ™Ñ&ðm
ð ˜%Ÿ,™,Ñ'ðm
ð ˜UŸ\™\Ñ*ðm
ð ˜S‘Mðm
ð ˜S‘Mðm
ð ˜#‘ðm
ð $ D™>ðm
ð ' t™nðm
ð ˜d‘^ðm
ð" 
ˆuU—\‘\Ñ" NÐ2Ñ	3ô#m
ó öm
r;   r\  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c            "       ó¸  ^ • \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )r^  iC  ru   c                 ón  >• [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R!                  5         g rk   )rZ   r[   Ú
num_labelsru   r   rB  rZ  rã  rJ   r   r„   Úclassifier_dropoutr†   r›   r}   rb  r¨  r‡   s     €r8   r[   Ú,ModernBertForSequenceClassification.__init__I  s€   ø€ Ü‰Ñ˜Ô Ø ×+Ñ+ˆŒØŒä$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—H‘H×$Ñ$ V×%>Ñ%>Ó?ˆŒ	ÜŸ)š) F×$6Ñ$6¸×8IÑ8IÓJˆŒð 	‰Õr;   r‹   rà   rá   rË   r   r„  r  r#   r$   r±  r²  rä   r³  r´  r`   c                 ót  • Ub  UOU R                   R                  nU R                  5         U R                  UUUUUUUU	U
UUUUS9nUS   nU R                   R                  S:X  a
  USS2S4   nOLU R                   R                  S:X  a2  UUR                  S5      -  R                  SS9UR                  SS	S
9-  nU R                  U5      nU R                  U5      nU R                  U5      nSnUGb  U R                   R                  c‘  U R                  S:X  a  SU R                   l
        OoU R                  S:”  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l
        OSU R                   l
        U R                   R                  S:X  aJ  [!        5       nU R                  S:X  a&  U" UR#                  5       UR#                  5       5      nOŽU" UU5      nO„U R                   R                  S:X  a=  [%        5       nU" UR'                  SU R                  5      UR'                  S5      5      nO-U R                   R                  S:X  a  [)        5       nU" UU5      nU(       d  U4nUb  U4U-   $ U$ [+        UUUR,                  UR.                  S9$ )aª  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nrô  r   ÚclsrF  r'   r   r¥   T©rT   ÚkeepdimÚ
regressionÚsingle_label_classificationÚmulti_label_classificationrõ  )ru   rÁ  ru  rB  Úclassifier_poolingr×   r‡  rã  r†   rb  Úproblem_typer  rW   rJ   ÚlongrL   r	   Úsqueezer   r-   r   r   r’   rÀ  )r]   r‹   rà   rá   rË   r   r„  r  r#   r$   r±  r²  rä   r³  r´  r+  rú  r¿  Úpooled_outputr÷  rö  Úloss_fctr›  s                          r8   r9   Ú+ModernBertForSequenceClassification.forwardV  s€  € ðN &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ×ÑÔ!à—*‘*ØØ)Ø 3Ø%Ø'ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà;‰;×)Ñ)¨UÓ2Ø 1²!°Q°$Ñ 7ÑØ[‰[×+Ñ+¨vÓ5Ø!2°^×5MÑ5MÈbÓ5QÑ!Q× VÑ VÐ[\Ð VÐ ]Ð`n×`rÑ`rØ˜tð asð añ !Ðð Ÿ	™	Ð"3Ó4ˆØŸ	™	 -Ó0ˆØ—‘ Ó/ˆàˆØÒØ{‰{×'Ñ'Ñ/Ø—?‘? aÓ'Ø/;D—K‘KÕ,Ø—_‘_ qÓ(¨f¯l©l¼e¿j¹jÓ.HÈFÏLÉLÔ\a×\eÑ\eÓLeØ/LD—K‘KÕ,à/KD—K‘KÔ,à{‰{×'Ñ'¨<Ó7Ü"›9Ø—?‘? aÓ'Ù# F§N¡NÓ$4°f·n±nÓ6FÓG‘Dá# F¨FÓ3‘DØ—‘×)Ñ)Ð-JÓJÜ+Ó-Ù §¡¨B°·±Ó @À&Ç+Á+ÈbÃ/ÓR‘Ø—‘×)Ñ)Ð-IÓIÜ,Ó.Ù ¨Ó/æØYˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEä'ØØØ!×/Ñ/Ø×)Ñ)ñ	
ð 	
r;   )rb  ru   r†   rã  rB  r  rý  )rE   rF   rG   rH   r   r[   r   r   rJ   r•   rK   rL   r/  r   rn   r   r9   rM   rp   rq   s   @r8   r^  r^  C  sk  ø† ðÐ/÷ ð ð 15Ø15Ø6:Ø/3Ø04Ø)-Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñe
à˜E×,Ñ,Ñ-ðe
ð ! §¡Ñ.ðe
ð & e§l¡lÑ3ð	e
ð
 ˜uŸ|™|Ñ,ðe
ð   §¡Ñ-ðe
ð ˜Ÿ™Ñ&ðe
ð ˜%Ÿ,™,Ñ'ðe
ð ˜UŸ\™\Ñ*ðe
ð ˜S‘Mðe
ð ˜S‘Mðe
ð ˜#‘ðe
ð $ D™>ðe
ð ' t™nðe
ð ˜d‘^ðe
ð" 
ˆuU—\‘\Ñ"Ð$<Ð<Ñ	=ô#e
ó öe
r;   r^  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c            "       ó¸  ^ • \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )r`  i¿  ru   c                 ób  >• [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g rk   ©rZ   r[   r  r   rB  rZ  rã  rJ   r   r„   r  r†   r›   r}   rb  r¨  r‡   s     €r8   r[   Ú)ModernBertForTokenClassification.__init__Å  s{   ø€ Ü‰Ñ˜Ô Ø ×+Ñ+ˆŒä$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—H‘H×$Ñ$ V×%>Ñ%>Ó?ˆŒ	ÜŸ)š) F×$6Ñ$6¸×8IÑ8IÓJˆŒð 	‰Õr;   r‹   rà   rá   rË   r   r„  r  r#   r$   r±  r²  rä   r³  r´  r`   c                 óö  • Ub  UOU R                   R                  nU R                  5         U R                  UUUUUUUU	U
UUUUS9nUS   nU R	                  U5      nU R                  U5      nU R                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nrô  r   r'   r   rõ  )ru   rÁ  ru  rB  rã  r†   rb  r   r-   r  r   r’   rÀ  )r]   r‹   rà   rá   rË   r   r„  r  r#   r$   r±  r²  rä   r³  r´  rú  r¿  r÷  rö  r  r›  s                        r8   r9   Ú(ModernBertForTokenClassification.forwardÑ  s#  € ðH &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ×ÑÔ!à—*‘*ØØ)Ø 3Ø%Ø'ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà ŸI™IÐ&7Ó8ÐØ ŸI™IÐ&7Ó8ÐØ—‘Ð!2Ó3ˆàˆØÑÜ'Ó)ˆHÙ˜FŸK™K¨¨D¯O©OÓ<¸f¿k¹kÈ"»oÓNˆDæØY ¨¨ Ñ,ˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEä$ØØØ!×/Ñ/Ø×)Ñ)ñ	
ð 	
r;   ©rb  r†   rã  rB  r  rý  )rE   rF   rG   rH   r   r[   r   r   rJ   r•   rK   rL   r/  r   rn   r   r9   rM   rp   rq   s   @r8   r`  r`  ¿  sk  ø† ð
Ð/÷ 
ð ð 15Ø15Ø6:Ø/3Ø04Ø)-Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñI
à˜E×,Ñ,Ñ-ðI
ð ! §¡Ñ.ðI
ð & e§l¡lÑ3ð	I
ð
 ˜uŸ|™|Ñ,ðI
ð   §¡Ñ-ðI
ð ˜Ÿ™Ñ&ðI
ð ˜%Ÿ,™,Ñ'ðI
ð ˜UŸ\™\Ñ*ðI
ð ˜S‘MðI
ð ˜S‘MðI
ð ˜#‘ðI
ð $ D™>ðI
ð ' t™nðI
ð ˜d‘^ðI
ð  
ˆuU—\‘\Ñ"Ð$9Ð9Ñ	:ô!I
ó öI
r;   r`  c            "       ó¶  ^ • \ rS rSrS\4U 4S jjr\             SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\
   S\\
   S\\
   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )ra  i  ru   c                 ób  >• [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g rk   r  r‡   s     €r8   r[   Ú'ModernBertForQuestionAnswering.__init__   sy   ø€ Ü‰Ñ˜Ô Ø ×+Ñ+ˆŒä$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—H‘H×$Ñ$ V×%>Ñ%>Ó?ˆŒ	ÜŸ)š) F×$6Ñ$6¸×8IÑ8IÓJˆŒà‰Õr;   r‹   rà   rá   rË   Ústart_positionsÚend_positionsr  r#   r$   r±  r²  rä   r³  r´  r`   c                 óR  • Ub  UOU R                   R                  nU R                  5         U R                  UUUUUUU	U
UUUUS9nUS   nU R	                  U5      nU R                  U5      nU R                  U5      nUR                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  U R                  " UUXV40 UD6nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )r¶  N)rà   rá   rË   r  r#   r$   r±  r²  rä   r³  r´  r   r   r'   r¥   )rö  Ústart_logitsÚ
end_logitsr’   rÀ  )ru   rÁ  ru  rB  rã  r†   rb  Úsplitr  r+   rø  r   r’   rÀ  )r]   r‹   rà   rá   rË   r  r  r  r#   r$   r±  r²  rä   r³  r´  r+  rú  r¿  r÷  r  r   rö  r›  s                          r8   r9   Ú&ModernBertForQuestionAnswering.forward+  sc  € ðF &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ×ÑÔ!à—*‘*ØØ)Ø 3Ø%ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà ŸI™IÐ&7Ó8ÐØ ŸI™IÐ&7Ó8ÐØ—‘Ð!2Ó3ˆà#)§<¡<°°r <Ð#:Ñ ˆjØ#×+Ñ+¨BÓ/×:Ñ:Ó<ˆØ×'Ñ'¨Ó+×6Ñ6Ó8ˆ
àˆØÑ&¨=Ñ+DØ×%Ò% l°JÀÑiÐbhÑiˆDæØ" JÐ/°'¸!¸"°+Ñ=ˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEä+ØØ%Ø!Ø!×/Ñ/Ø×)Ñ)ñ
ð 	
r;   r  rØ  )rE   rF   rG   rH   r   r[   r   r   rJ   rK   rL   r/  r   rn   r   r9   rM   rp   rq   s   @r8   ra  ra    sf  ø† ð	Ð/÷ 	ð ð 26Ø6:Ø/3Ø26Ø04Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñK
à˜EŸL™LÑ)ðK
ð ! §¡Ñ.ðK
ð & e§l¡lÑ3ð	K
ð
 ˜uŸ|™|Ñ,ðK
ð " %§,¡,Ñ/ðK
ð   §¡Ñ-ðK
ð ˜%Ÿ,™,Ñ'ðK
ð ˜UŸ\™\Ñ*ðK
ð ˜S‘MðK
ð ˜S‘MðK
ð ˜#‘ðK
ð $ D™>ðK
ð ' t™nðK
ð ˜d‘^ðK
ð" 
ˆuU—\‘\Ñ"Ð$@Ð@Ñ	Aô#K
ó öK
r;   ra  z«
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c            "       ó¸  ^ • \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )r_  iz  ru   c                 ó8  >• [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  S5      U l        U R                  5         g ©Nr   )rZ   r[   ru   r   rB  rZ  rã  rJ   r   r„   r  r†   r›   r}   rb  r¨  r‡   s     €r8   r[   Ú$ModernBertForMultipleChoice.__init__€  sm   ø€ Ü‰Ñ˜Ô ØŒä$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—H‘H×$Ñ$ V×%>Ñ%>Ó?ˆŒ	ÜŸ)š) F×$6Ñ$6¸Ó:ˆŒð 	‰Õr;   r‹   rà   rá   rË   r   r„  r  r#   r$   r±  r²  rä   r³  r´  r`   c                 óP  • Ub  UOU R                   R                  nUb  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  5         U R                  UUUUUUUU	U
UUUUS9nUS   nU R                   R                  S:X  a
  USS2S4   nOLU R                   R                  S:X  a2  UUR                  S5      -  R                  SS	9UR                  SS
S9-  nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        R                  " 5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S9$ )aK  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nr   r'   éþÿÿÿrô  r   r  rF  r¥   Tr  rõ  )ru   rÁ  r,   r-   Úsizeru  rB  r  r×   r‡  rã  r†   rb  r   r   r   r’   rÀ  )r]   r‹   rà   rá   rË   r   r„  r  r#   r$   r±  r²  rä   r³  r´  r+  Únum_choicesrú  r¿  r  r÷  Úreshaped_logitsrö  r  r›  s                            r8   r9   Ú#ModernBertForMultipleChoice.forwardŒ  sa  € ðL &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ,5Ñ,Ai—o‘o aÒ(À}×GZÑGZÐ[\ÑG]ˆà>GÑ>SI—N‘N 2 y§~¡~°bÓ'9Ô:ÐY]ˆ	ØM[ÑMg˜×,Ñ,¨R°×1DÑ1DÀRÓ1HÔIÐmqˆØGSÑG_|×(Ñ(¨¨\×->Ñ->¸rÓ-BÔCÐeiˆð Ñ(ð ×Ñ˜r =×#5Ñ#5°bÓ#9¸=×;MÑ;MÈbÓ;QÔRàð 	ð 	×ÑÔ!à—*‘*ØØ)Ø 3Ø%Ø'ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà;‰;×)Ñ)¨UÓ2Ø 1²!°Q°$Ñ 7ÑØ[‰[×+Ñ+¨vÓ5Ø!2°^×5MÑ5MÈbÓ5QÑ!Q× VÑ VÐ[\Ð VÐ ]Ð`n×`rÑ`rØ˜tð asð añ !Ðð Ÿ	™	Ð"3Ó4ˆØŸ	™	 -Ó0ˆØ—‘ Ó/ˆà Ÿ+™+ b¨+Ó6ˆàˆØÑÜ×*Ò*Ó,ˆHÙ˜O¨VÓ4ˆDæØ%Ð'¨'°!°"¨+Ñ5ˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEä(ØØ"Ø!×/Ñ/Ø×)Ñ)ñ	
ð 	
r;   )rb  ru   r†   rã  rB  rý  )rE   rF   rG   rH   r   r[   r   r   rJ   r•   rK   rL   r/  r   rn   r   r9   rM   rp   rq   s   @r8   r_  r_  z  sk  ø† ð
Ð/÷ 
ð ð 15Ø15Ø6:Ø/3Ø04Ø)-Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñ_
à˜E×,Ñ,Ñ-ð_
ð ! §¡Ñ.ð_
ð & e§l¡lÑ3ð	_
ð
 ˜uŸ|™|Ñ,ð_
ð   §¡Ñ-ð_
ð ˜Ÿ™Ñ&ð_
ð ˜%Ÿ,™,Ñ'ð_
ð ˜UŸ\™\Ñ*ð_
ð ˜S‘Mð_
ð ˜S‘Mð_
ð ˜#‘ð_
ð $ D™>ð_
ð ' t™nð_
ð ˜d‘^ð_
ð" 
ˆuU—\‘\Ñ"Ð$=Ð=Ñ	>ô#_
ó ö_
r;   r_  )r   rA  r\  r^  r`  ra  r_  rD   r%  r.  )Wr   rW  Ú
contextlibr   Útypingr   r   rJ   Útorch.nn.functionalr   rð   r  Útorch.nnr   r   r	   Úactivationsr   Úmodeling_attn_mask_utilsr   Úmodeling_layersr   Úmodeling_outputsr   r   r   r   r   r   Úmodeling_rope_utilsr   r   Úmodeling_utilsr   Úutilsr   r   r   Úutils.import_utilsr   Úconfiguration_modernbertr   Úflash_attn.flash_attn_interfacer   Úflash_attn.layers.rotaryr   Úflash_attn.ops.triton.rotaryr   ÚobjectÚ
get_loggerrE   rs  ÚautogradÚFunctionr!   rK   rL   rP   rR   rU  rs   r—   r«   rÕ   rÝ   r•   rn   r/  rü   r  rW   r	  r  r*  rß   r1  rA  r˜  rž  r   rZ  r\  r^  r`  ra  r_  Ú__all__rC   r;   r8   Ú<module>rB     sï  ðó, Û Ý "ß "ã ß Ð Ý ß AÑ Aå !Ý BÝ 9÷÷ ÷ LÝ -ß GÑ GÝ 5Ý 6ñ ×ÑÝPÝ8Þ9à€Oð 
×	Ò	˜HÓ	%€ô46˜%Ÿ.™.×1Ñ1ô 46ðv *.Ø $ñLð ˜Ÿ™Ñ&ð	Lð
 ˜‘õLô42Q¨ô 2Qôj˜2Ÿ9™9ô ô<:B—I‘Iô :ô(< §	¡	ô <òD(ôðH ).ñ"Ø!ð"à	‰ð"ð —L‘Lð"ð Ÿ™ð	"ð
 ˜5×+Ñ+Ñ,ð"ð ˜3 ˜8‘_ð"ð 	ð"ð 
ð"ð   ‘~ð"ð ˆ5—‘˜uŸ|™|Ð+Ñ,¨e°E·L±LÑ.AÐAÑBõ"ð\ !&§¡ñ(!Ø!ð(!à	‰ð(!ð 2ð(!ð —‘ð	(!ð
 ð(!ð ˜3 ˜8‘_ð(!ð 	ð(!ð 
ð(!ð —+‘+ð(!ð ˆ5<‰<Ñõ(!ðV Ø!ð à	‰ð ð —L‘Lð ð Ÿ™ð	 ð
 ˜5×+Ñ+Ñ,ð ð ˜3 ˜8‘_ð ð 	ð ð 
ð ð ˆ5<‰<Ñô ðH 1Ø$Ø"ñ!Ð ôL3˜"Ÿ)™)ô L3ô^+3Ð7ô +3ð\ ô} ó }ó ð}ðF ,0Ø%)ñ	&mØL‰Lð&mà—L‘Lð&mð ˜5Ÿ<™<Ñ(ð&mð U—\‘\Ñ"ð	&mð
 ˆ5<‰<˜Ÿ™ u§|¡|°S¸(À5Ç<Á<Ñ:PÐRZÐ[`×[gÑ[gÑRhÐhÑiõ&mðRØL‰Lðà\‰\ðð ðð ð	ð
 ‡\\ôð> ôj:Ð/ó j:ó ðj:ôZ	>˜rŸy™yô 	>ñ ðñô
H
Ð5ó H
óð
H
ñV ðñô
t
Ð*Có t
óð
t
ñn ðñô
W
Ð'@ó W
óð
W
ðt ôX
Ð%>ó X
ó ðX
ñv ðñô
m
Ð";ó m
óð
m
ò`r;   