ó
    <±hØ ã                   óš  • S SK r S SKrS SKJr  S SKJrJrJr  S SKrS SK	J
s  Jr  S SKrS SKJ
r
  S SKJrJrJr  SSKJr  SSKJr  SS	KJr  SS
KJr  SSKJrJrJrJrJrJ r   SSK!J"r"  SSK#J$r$J%r%J&r&  SSK'J(r(  SSK)J*r*J+r+  \%" 5       (       a  S SK,J-r-  S SK.J/r/  S SK0J1r1  O\2r/\&Rf                  " \45      r5 " S S\5      r6  STS\Rn                  S\Rn                  S\\Rn                     S\\Rn                     S\8\Rn                  \Rn                  \Rn                  \9\\Rn                     \\Rn                     4   4
S jjr:S\Rn                  S\Rn                  S\9S\9S\Rn                  4
S jr; " S  S!\Rx                  Rz                  5      r>  STS"\\Rn                     S#\\9   4S$ jjr? " S% S&\/5      r@ " S' S(\
R‚                  5      rB " S) S*\
R‚                  5      rC " S+ S,\*5      rD SUS-S.S/\Rn                  S\Rn                  S0\Rn                  S\\RŠ                     S1\8\9\94   S2\9S3\9S4\\F   S\\8\Rn                  \Rn                  4   \8\Rn                     4   4S5 jjrG\R                  4S-S.S/\Rn                  S6\@S"\Rn                  S#\9S1\8\9\94   S2\9S3\9S7\R’                  S\8\Rn                     4S8 jjrJS-S.S/\Rn                  S\Rn                  S0\Rn                  S\\RŠ                     S1\8\9\94   S2\9S3\9S\8\Rn                     4S9 jrK\J\G\KS:.rL " S; S.\
R‚                  5      rM " S< S=\5      rN\$ " S> S?\"5      5       rO\$ " S@ SA\O5      5       rP " SB SC\
R‚                  5      rQ\$" SDSE9 " SF SG\O5      5       rR\$" SHSE9 " SI SJ\O5      5       rS\$" SKSE9 " SL SM\O5      5       rT\$ " SN SO\O5      5       rU\$" SPSE9 " SQ SR\O5      5       rV/ SSQrWg)Vé    N)Únullcontext)ÚLiteralÚOptionalÚUnion)Únn)ÚBCEWithLogitsLossÚCrossEntropyLossÚMSELossé   )ÚACT2FN)ÚPretrainedConfig)Ú_prepare_4d_attention_mask)ÚGradientCheckpointingLayer)ÚBaseModelOutputÚMaskedLMOutputÚMultipleChoiceModelOutputÚQuestionAnsweringModelOutputÚSequenceClassifierOutputÚTokenClassifierOutput)ÚPreTrainedModel)Úauto_docstringÚis_flash_attn_2_availableÚlogging)Úis_triton_availableé   )ÚGemmaRotaryEmbeddingÚapply_rotary_pos_emb)Ú flash_attn_varlen_qkvpacked_func)ÚRotaryEmbedding)Úapply_rotaryc                   ó¢   ^ • \ rS rSrSrSrSS0rS/r                                   SS\S   4U 4S	 jjjr	U 4S
 jr
SrU =r$ )ÚModernBertConfigé8   a  
This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
defaults will yield a similar configuration to that of the ModernBERT-base.
e.g. [answerdotai/ModernBERT-base](https://huggingface.co/answerdotai/ModernBERT-base)

Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.

Args:
    vocab_size (`int`, *optional*, defaults to 50368):
        Vocabulary size of the ModernBert model. Defines the number of different tokens that can be represented by the
        `inputs_ids` passed when calling [`ModernBertModel`]
    hidden_size (`int`, *optional*, defaults to 768):
        Dimension of the hidden representations.
    intermediate_size (`int`, *optional*, defaults to 1152):
        Dimension of the MLP representations.
    num_hidden_layers (`int`, *optional*, defaults to 22):
        Number of hidden layers in the Transformer decoder.
    num_attention_heads (`int`, *optional*, defaults to 12):
        Number of attention heads for each attention layer in the Transformer decoder.
    hidden_activation (`str` or `function`, *optional*, defaults to `"gelu"`):
        The non-linear activation function (function or string) in the decoder. Will default to `"gelu"`
        if not specified.
    max_position_embeddings (`int`, *optional*, defaults to 8192):
        The maximum sequence length that this model might ever be used with.
    initializer_range (`float`, *optional*, defaults to 0.02):
        The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
    initializer_cutoff_factor (`float`, *optional*, defaults to 2.0):
        The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
    norm_eps (`float`, *optional*, defaults to 1e-05):
        The epsilon used by the rms normalization layers.
    norm_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the normalization layers.
    pad_token_id (`int`, *optional*, defaults to 50283):
        Padding token id.
    eos_token_id (`int`, *optional*, defaults to 50282):
        End of stream token id.
    bos_token_id (`int`, *optional*, defaults to 50281):
        Beginning of stream token id.
    cls_token_id (`int`, *optional*, defaults to 50281):
        Classification token id.
    sep_token_id (`int`, *optional*, defaults to 50282):
        Separation token id.
    global_rope_theta (`float`, *optional*, defaults to 160000.0):
        The base period of the global RoPE embeddings.
    attention_bias (`bool`, *optional*, defaults to `False`):
        Whether to use a bias in the query, key, value and output projection layers during self-attention.
    attention_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the attention probabilities.
    global_attn_every_n_layers (`int`, *optional*, defaults to 3):
        The number of layers between global attention layers.
    local_attention (`int`, *optional*, defaults to 128):
        The window size for local attention.
    local_rope_theta (`float`, *optional*, defaults to 10000.0):
        The base period of the local RoPE embeddings.
    embedding_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the embeddings.
    mlp_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the MLP layers.
    mlp_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the MLP layers.
    decoder_bias (`bool`, *optional*, defaults to `True`):
        Whether to use bias in the decoder layers.
    classifier_pooling (`str`, *optional*, defaults to `"cls"`):
        The pooling method for the classifier. Should be either `"cls"` or `"mean"`. In local attention layers, the
        CLS token doesn't attend to all tokens on long sequences.
    classifier_dropout (`float`, *optional*, defaults to 0.0):
        The dropout ratio for the classifier.
    classifier_bias (`bool`, *optional*, defaults to `False`):
        Whether to use bias in the classifier.
    classifier_activation (`str`, *optional*, defaults to `"gelu"`):
        The activation function for the classifier.
    deterministic_flash_attn (`bool`, *optional*, defaults to `False`):
        Whether to use deterministic flash attention. If `False`, inference will be faster but not deterministic.
    sparse_prediction (`bool`, *optional*, defaults to `False`):
        Whether to use sparse prediction for the masked language model instead of returning the full dense logits.
    sparse_pred_ignore_index (`int`, *optional*, defaults to -100):
        The index to ignore for the sparse prediction.
    reference_compile (`bool`, *optional*):
        Whether to compile the layers of the model which were compiled during pretraining. If `None`, then parts of
        the model will be compiled if 1) `triton` is installed, 2) the model is not on MPS, 3) the model is not
        shared between devices, and 4) the model is not resized after initialization. If `True`, then the model may
        be faster in some scenarios.
    repad_logits_with_grad (`bool`, *optional*, defaults to `False`):
        When True, ModernBertForMaskedLM keeps track of the logits' gradient when repadding for output. This only
        applies when using Flash Attention 2 with passed labels. Otherwise output logits always have a gradient.

Examples:

```python
>>> from transformers import ModernBertModel, ModernBertConfig

>>> # Initializing a ModernBert style configuration
>>> configuration = ModernBertConfig()

>>> # Initializing a model from the modernbert-base style configuration
>>> model = ModernBertModel(configuration)

>>> # Accessing the model configuration
>>> configuration = model.config
```Ú
modernbertÚ
rope_thetaÚglobal_rope_thetaÚpast_key_valuesÚclassifier_pooling©ÚclsÚmeanc$           	      ó  >• [         T%U ]  " SUUUUUS.U$D6  Xl        Xpl        X l        X0l        X@l        XPl        X€l        Xl	        X l
        X°l        UU l        UU l        UU l        X`l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        UU l        U U l        U!U l        U"U l        U#U l        U R.                  S;  a  [A        SU R.                   S35      eg )N)Úpad_token_idÚbos_token_idÚeos_token_idÚcls_token_idÚsep_token_idr)   zQInvalid value for `classifier_pooling`, should be either "cls" or "mean", but is Ú.© )!ÚsuperÚ__init__Ú
vocab_sizeÚmax_position_embeddingsÚhidden_sizeÚintermediate_sizeÚnum_hidden_layersÚnum_attention_headsÚinitializer_rangeÚinitializer_cutoff_factorÚnorm_epsÚ	norm_biasr&   Úattention_biasÚattention_dropoutÚhidden_activationÚglobal_attn_every_n_layersÚlocal_attentionÚlocal_rope_thetaÚembedding_dropoutÚmlp_biasÚmlp_dropoutÚdecoder_biasr(   Úclassifier_dropoutÚclassifier_biasÚclassifier_activationÚdeterministic_flash_attnÚsparse_predictionÚsparse_pred_ignore_indexÚreference_compileÚrepad_logits_with_gradÚ
ValueError)&Úselfr6   r8   r9   r:   r;   rB   r7   r<   r=   r>   r?   r-   r/   r.   r0   r1   r&   r@   rA   rC   rD   rE   rF   rG   rH   rI   r(   rJ   rK   rL   rM   rN   rO   rP   rQ   ÚkwargsÚ	__class__s&                                        €Úi/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/modernbert/modular_modernbert.pyr5   ÚModernBertConfig.__init__¤   s>  ø€ ôN 	‰Òð 	
Ø%Ø%Ø%Ø%Ø%ñ	
ð ò	
ð %ŒØ'>Ô$Ø&ÔØ!2ÔØ!2ÔØ#6Ô Ø!2ÔØ)BÔ&Ø ŒØ"ŒØ!2ˆÔØ,ˆÔØ!2ˆÔØ!2ÔØ*DˆÔ'Ø.ˆÔØ 0ˆÔØ!2ˆÔØ ˆŒØ&ˆÔØ(ˆÔØ"4ˆÔØ"4ˆÔØ.ˆÔØ%:ˆÔ"Ø(@ˆÔ%Ø!2ˆÔØ(@ˆÔ%Ø!2ˆÔØ&<ˆÔ#à×"Ñ"¨/Ó9ÜØcÐdh×d{Ñd{Ðc|Ð|}Ð~óð ð :ó    c                 óH   >• [         TU ]  5       nUR                  SS 5        U$ )NrP   )r4   Úto_dictÚpop)rS   ÚoutputrU   s     €rV   rZ   ÚModernBertConfig.to_dict÷   s#   ø€ Ü‘‘Ó"ˆØ
‰
Ð&¨Ô-ØˆrX   )r@   rA   rL   rK   rJ   r(   rI   rM   rF   rC   r&   rB   r8   r=   r<   r9   rD   rE   r7   rG   rH   r?   r>   r;   r:   rP   rQ   rO   rN   r6   )#iÀÄ  i   i€  é   é   Úgelui    g{®Gáz”?ç       @gñhãˆµøä>FikÄ  éjÄ  éiÄ  rc   rb   g     ˆAFç        r   é€   ç     ˆÃ@rd   Frd   Tr*   rd   Fr`   FFiœÿÿÿNF)Ú__name__Ú
__module__Ú__qualname__Ú__firstlineno__Ú__doc__Ú
model_typeÚattribute_mapÚkeys_to_ignore_at_inferencer   r5   rZ   Ú__static_attributes__Ú__classcell__©rU   s   @rV   r"   r"   8   s¸   ø† ñeðN €JØ!Ð#6Ð7€MØ#4Ð"5Ðð ØØØØØ Ø $ØØ"%ØØØØØØØØ"ØØØ#$ØØ ØØØØØ5:ØØØ$Ø!&ØØ!%ØØ$ñIQð8 $ MÑ2÷9Qð Q÷fó rX   r"   ÚinputsÚattention_maskÚposition_idsÚlabelsÚreturnc                 ó  • UR                  S[        R                  S9n[        R                  " UR	                  5       SS9R	                  5       n[        UR                  5       R                  5       5      n[        R                  R                  R                  [        R                  " US[        R                  S9S5      nU R                  5       S:X  a  U R	                  5       U   nO(U R                  tpšnXš-  nU R                  " U/UQ76 U   nUb  UR	                  5       U   OSnUb  UR	                  5       U   OSnX…XvXÞ4$ )	aP  
Remove padding from input sequences.

Args:
    inputs: (batch, seqlen, ...) or (batch, seqlen)
    attention_mask: (batch, seqlen), bool / int, 1 means valid and 0 means not valid.
    position_ids: (batch, seqlen), int, position ids
    labels: (batch, seqlen), int, labels

Returns:
    unpadded_inputs: (total_nnz, ...), where total_nnz = number of tokens selected in attention_mask.
    indices: (total_nnz)
    cu_seqlens: (batch + 1), the cumulative sequence lengths
    max_seqlen_in_batch: int
    unpadded_position_ids: (total_nnz) or None
    unpadded_labels: (total_nnz) or None
éÿÿÿÿ©ÚdimÚdtypeF)Úas_tupler   )é   r   r   N)ÚsumÚtorchÚint32ÚnonzeroÚflattenÚintÚmaxÚitemr   Ú
functionalÚpadÚcumsumrz   ÚshapeÚview)rr   rs   rt   ru   Úseqlens_in_batchÚindicesÚmax_seqlen_in_batchÚ
cu_seqlensÚunpadded_inputsÚbatchÚseqlenÚrestr‰   Úunpadded_position_idsÚunpadded_labelss                  rV   Ú_unpad_modernbert_inputr•   ý   s  € ð. &×)Ñ)¨b¼¿¹Ð)ÐDÐÜmŠm˜N×2Ñ2Ó4¸uÑE×MÑMÓO€GÜÐ.×2Ñ2Ó4×9Ñ9Ó;Ó<ÐÜ—‘×$Ñ$×(Ñ(¬¯ªÐ6FÈAÔUZ×U`ÑU`Ñ)aÐciÓj€Jà‡zzƒ|qÓØ Ÿ.™.Ó*¨7Ñ3‰à%Ÿ|™|Ðˆ˜Ø‘ˆØ Ÿ+š+ eÐ3¨dÒ3°GÑ<ˆà?KÑ?W˜L×0Ñ0Ó2°7Ò;Ð]aÐØ39Ñ3Ef—n‘nÓ& wÒ/È4€Oà ZÐF[ÐlÐlrX   rŒ   r   r‘   c                 ó^  • U R                  5       S:X  aC  [        R                  " X#-  U R                  U R                  S9nXU'   UR                  X#5      nU$ U R                  tpg[        R                  " X#-  /UQ7U R                  U R                  S.6nXU'   UR
                  " X#/UQ76 nU$ )a-  
Add padding to sequences.

Args:
    inputs: (total_nnz, ...) or (total_nnz,), where total_nnz = number of tokens selected in attention_mask.
    indices: (total_nnz)
    batch: int, batch size
    seqlen: int, max sequence length

Returns:
    padded_inputs: (batch, seqlen, ...) or (batch, seqlen)
r}   )r{   Údevice)rz   r   Úzerosr{   r—   rŠ   r‰   )rr   rŒ   r   r‘   r\   Úpadded_inputsÚ_r’   s           rV   Ú_pad_modernbert_outputr›   &  sœ   € ð$ ‡zzƒ|qÓÜ—’˜U™^°6·<±<ÈÏÉÑVˆØ ˆw‰ØŸ™ EÓ2ˆð Ðð —<‘<ˆˆÜ—’˜U™^Ð]¨dÑ]¸&¿,¹,ÈvÏ}É}Ò]ˆØ ˆw‰ØŸš EÐ9°DÒ9ˆàÐrX   c                   óh   • \ rS rSr\  SS\\R                     S\\   4S jj5       r	\S 5       r
Srg)	ÚApplyRotaryEmbUnpadiE  NrŽ   Ú
max_seqlenc                 óÌ   • UR                  5       nUR                  u  pgp‰US S 2S S24   R                  USU	5      n
[        U
UUSUUSSS9  U R	                  X#U5        XPl        U$ )Nr   rx   r   FT)Úseqlen_offsetsrŽ   rž   ÚinterleavedÚinplace)Ú
contiguousr‰   rŠ   r    Úsave_for_backwardrž   )ÚctxÚqkvÚcosÚsinrŽ   rž   Ú	total_nnzÚ_threeÚ_nheadsÚheaddimÚqks              rV   ÚforwardÚApplyRotaryEmbUnpad.forwardF  sz   € ð n‰nÓˆØ.1¯i©iÑ+ˆ	˜7ð ’BQB‰Z_‰_˜Y¨¨GÓ4ˆÜØØØØØ!Ø!ØØò		
ð 	×Ñ˜c¨
Ô3Ø#ŒØˆ
rX   c                 óÞ   • U R                   u  p#nUR                  5       nUR                  u  pVpxUS S 2S S24   R                  USU5      n	[	        U	UUSUU R
                  SSSS9	  US S S S S S 4$ )Nr   rx   r   FT)r    rŽ   rž   r¡   r¢   Ú	conjugate)Úsaved_tensorsr£   r‰   rŠ   r    rž   )
r¥   Údor§   r¨   rŽ   r©   rª   r«   r¬   Údqks
             rV   ÚbackwardÚApplyRotaryEmbUnpad.backwarde  s‰   € à"×0Ñ0Ñˆ*Ø]‰]‹_ˆØ.0¯h©hÑ+ˆ	˜7ð ’BQB‰in‰n˜Y¨¨GÓ4ˆÜØØØØØ!Ø—~‘~ØØØò
	
ð 4˜˜t T¨4°Ð5Ð5rX   r3   ©NN)rg   rh   ri   rj   Ústaticmethodr   r   ÚTensorrƒ   r®   rµ   ro   r3   rX   rV   r   r   E  sQ   † Øð .2Ø$(ñð
 ˜UŸ\™\Ñ*ðð ˜S‘Môó ðð< ñ6ó ó6rX   r   rŽ   rž   c                 ó0   • [         R                  XX#U5      $ )a‰  
Arguments:
    qkv: (total_nnz, 3, nheads, headdim) - input tensor for packed QKV.
    cos, sin: (seqlen_rotary, rotary_dim / 2)
    interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead
        of 1st half and 2nd half (GPT-NeoX style).
    inplace: if True, apply rotary embedding in-place.
    seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount.
        Most commonly used in inference when we have KV cache.
    cu_seqlens: (batch + 1,) or None
    max_seqlen: int
Return:
    out: (total_nnz, dim)
rotary_dim must be <= headdim
Apply rotary embedding to the first rotary_dim of x.
)r   Úapply)r¦   r§   r¨   rŽ   rž   s        rV   Úapply_rotary_unpaddedr¼   |  s   € ô. ×$Ñ$ S¨sÀ
ÓKÐKrX   c                   ó6  ^ • \ rS rSrSr    SS\S\S\\   S\\R                     S\\R                     4
U 4S jjjr SS	\R                  S
\R                  S\\   S\\R                  \\R                  \R                  4   4   4S jjrS\4S jrSrU =r$ )Ú!ModernBertUnpaddedRotaryEmbeddingi–  zH
The rotary position embeddings applied directly to unpadded sequences.
rz   Úbaserž   r—   r{   c                 óh   >• [         TU ]  XUSS9  X0l        Ub  Ub  Ub  U R                  X4US9  gggg)zú
max_seqlen: if max_seqlen, device, and dtype are provided, we precompute the cos_sin_cache
    up to max_seqlen. If the max_seqlen, device, or dtype during training/inference differ,
    the cos_sin_cache will be recomputed during the forward pass.
F)rz   r¿   r—   r¡   N©r—   r{   )r4   r5   rž   Ú_update_cos_sin_cache)rS   rz   r¿   rž   r—   r{   rU   s         €rV   r5   Ú*ModernBertUnpaddedRotaryEmbedding.__init__›  sM   ø€ ô 	‰Ñ˜S°FÈÐÑNØ$ŒàÑ! fÑ&8¸UÑ=NØ×&Ñ& zÈÐ&ÒNð >OÐ&8Ð!rX   r¦   rŽ   rv   c                 ó–   • Ub$  U R                  X1R                  UR                  S9  [        UU R                  U R
                  UUS9nU$ )z°
Apply rotary embedding *inplace* to qkv.
qkv: (total_nnz, 3, nheads, headdim)
cu_seqlens: (batch + 1,) cumulative sequence lengths
max_seqlen: int max seq length in the batch
rÁ   ©rŽ   rž   )rÂ   r—   r{   r¼   Ú_cos_cachedÚ_sin_cached)rS   r¦   rŽ   rž   s       rV   r®   Ú)ModernBertUnpaddedRotaryEmbedding.forward®  sQ   € ð Ñ!Ø×&Ñ& z¿*¹*ÈCÏIÉIÐ&ÑVä#ØØ×ÑØ×ÑØ!Ø!ñ
ˆð ˆ
rX   c                 óT   • SU R                    SU R                   SU R                   3$ )Nzdim=z, base=z, scale_base=)rz   r¿   Ú
scale_base©rS   s    rV   Ú
extra_reprÚ,ModernBertUnpaddedRotaryEmbedding.extra_reprÇ  s(   € Ød—h‘hZ˜w t§y¡y k°¸t¿¹Ð>OÐPÐPrX   )rž   )rf   NNN©N)rg   rh   ri   rj   rk   rƒ   Úfloatr   r   r—   r{   r5   r¹   r   Útupler®   ÚstrrÌ   ro   rp   rq   s   @rV   r¾   r¾   –  sÝ   ø† ñð Ø$(Ø)-Ø'+ñOàðOð ðOð ˜S‘Mð	Oð
 ˜Ÿ™Ñ&ðOð ˜Ÿ™Ñ$÷Oð Oð. %)ñ	à\‰\ðð —L‘Lðð ˜S‘Mð	ð
 
ˆu|‰|˜U 5§<¡<°·±Ð#=Ñ>Ð>Ñ	?õð2Q˜C÷ Qò QrX   r¾   c                   óø   ^ • \ rS rSrSrS\4U 4S jjr\R                  " SS9S\R                  S\R                  4S	 j5       r SS\\R                     S
\\R                     S\R                  4S jjrSrU =r$ )ÚModernBertEmbeddingsiË  zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
Úconfigc                 ó\  >• [         TU ]  5         Xl        [        R                  " UR
                  UR                  UR                  S9U l        [        R                  " UR                  UR                  UR                  S9U l        [        R                  " UR                  5      U l        g )N)Úpadding_idx©ÚepsÚbias)r4   r5   rÔ   r   Ú	Embeddingr6   r8   r-   Útok_embeddingsÚ	LayerNormr>   r?   ÚnormÚDropoutrF   Údrop©rS   rÔ   rU   s     €rV   r5   ÚModernBertEmbeddings.__init__Ð  su   ø€ Ü‰ÑÔØŒÜ Ÿlšl¨6×+<Ñ+<¸f×>PÑ>PÐ^d×^qÑ^qÑrˆÔÜ—L’L ×!3Ñ!3¸¿¹Èv×O_ÑO_Ñ`ˆŒ	Ü—J’J˜v×7Ñ7Ó8ˆ	rX   T©ÚdynamicÚ	input_idsrv   c                 ó`   • U R                  U R                  U R                  U5      5      5      $ rÎ   )rß   rÝ   rÛ   )rS   rä   s     rV   Úcompiled_embeddingsÚ(ModernBertEmbeddings.compiled_embeddings×  s%   € ày‰y˜Ÿ™ 4×#6Ñ#6°yÓ#AÓBÓCÐCrX   Úinputs_embedsc                 ó  • Ub"  U R                  U R                  U5      5      nU$ U R                  R                  (       a  U R	                  U5      O.U R                  U R                  U R                  U5      5      5      nU$ rÎ   )rß   rÝ   rÔ   rP   ræ   rÛ   )rS   rä   rè   Úhidden_statess       rV   r®   ÚModernBertEmbeddings.forwardÛ  su   € ð Ñ$Ø ŸI™I d§i¡i°Ó&>Ó?ˆMð Ðð —;‘;×0×0ð ×(Ñ(¨Ô3à—Y‘Y˜tŸy™y¨×)<Ñ)<¸YÓ)GÓHÓIð ð
 ÐrX   )rÔ   rß   rÝ   rÛ   r·   )rg   rh   ri   rj   rk   r"   r5   r   ÚcompileÚ
LongTensorr¹   ræ   r   r®   ro   rp   rq   s   @rV   rÓ   rÓ   Ë  s’   ø† ñð9Ð/÷ 9ð ‡]‚]˜4Ñ ðD¨U×-=Ñ-=ð DÀ%Ç,Á,ó Dó !ðDð eiñØ! %×"2Ñ"2Ñ3ðØKSÐTY×T`ÑT`ÑKaðà	‰÷ó rX   rÓ   c                   ón   ^ • \ rS rSrSrS\4U 4S jjrS\R                  S\R                  4S jr	Sr
U =r$ )	ÚModernBertMLPié  a*  Applies the GLU at the end of each ModernBERT layer.

Compared to the default BERT architecture, this block replaces :class:`~transformers.model.bert.modeling_bert.BertIntermediate`
and :class:`~transformers.model.bert.modeling_bert.SelfOutput` with a single module that has similar functionality.
rÔ   c                 ó¤  >• [         TU ]  5         Xl        [        R                  " UR
                  [        UR                  5      S-  UR                  S9U l	        [        UR                     U l        [        R                  " UR                  5      U l        [        R                  " UR                  UR
                  UR                  S9U l        g )Nr   ©rÙ   )r4   r5   rÔ   r   ÚLinearr8   rƒ   r9   rG   ÚWir   rB   ÚactrÞ   rH   rß   ÚWorà   s     €rV   r5   ÚModernBertMLP.__init__ð  s‘   ø€ Ü‰ÑÔØŒÜ—)’)˜F×.Ñ.´°F×4LÑ4LÓ0MÐPQÑ0QÐX^×XgÑXgÑhˆŒÜ˜&×2Ñ2Ñ3ˆŒÜ—J’J˜v×1Ñ1Ó2ˆŒ	Ü—)’)˜F×4Ñ4°f×6HÑ6HÈvÏÉÑ_ˆrX   rê   rv   c                 ó¨   • U R                  U5      R                  SSS9u  p#U R                  U R                  U R	                  U5      U-  5      5      $ )Nr   rx   ©rz   )ró   Úchunkrõ   rß   rô   )rS   rê   ÚinputÚgates       rV   r®   ÚModernBertMLP.forwardø  sG   € Ø—g‘g˜mÓ,×2Ñ2°1¸"Ð2Ð=‰ˆØw‰wt—y‘y §¡¨%£°4Ñ!7Ó8Ó9Ð9rX   )ró   rõ   rô   rÔ   rß   )rg   rh   ri   rj   rk   r"   r5   r   r¹   r®   ro   rp   rq   s   @rV   rï   rï   é  s7   ø† ñð`Ð/÷ `ð: U§\¡\ð :°e·l±l÷ :ò :rX   rï   c                   ó   • \ rS rSrSrg)ÚModernBertRotaryEmbeddingiý  r3   N)rg   rh   ri   rj   ro   r3   rX   rV   rþ   rþ   ý  s   † ÚrX   rþ   ÚmoduleÚModernBertAttentionr¦   Úsliding_window_maskrD   Úbsrz   Úoutput_attentionsc	                 ó°  • U R                  XS9u  p«UR                  SS5      R                  SS9u  pÍn[        XÍX«5      u  pÍU R                  S-  n[
        R                  " XÍR                  SS5      5      U-  nUS:w  a  UnUU-   n[        R                  R                  US[
        R                  S	9R                  UR                  5      n[        R                  R                  UU R                  U R                  S
9n[
        R                  " UU5      nUR                  SS5      R!                  5       nUR#                  USU5      nU(       a  UU4$ U4$ )N©rt   r   r}   r   rø   ç      à¿©rx   rx   rx   ry   )ÚpÚtraining)Ú
rotary_embÚ	transposeÚunbindr   Úhead_dimr   Úmatmulr   r†   ÚsoftmaxÚfloat32Útor{   ÚdropoutrA   r	  r£   rŠ   )rÿ   r¦   rs   r  rt   rD   r  rz   r  Ú_kwargsr§   r¨   ÚqueryÚkeyÚvalueÚscaleÚattn_weightsÚattn_outputs                     rV   Úeager_attention_forwardr    s=  € ð × Ñ  Ð Ð@H€CØŸ™ a¨Ó+×2Ñ2°qÐ2Ð9Ñ€Eä% e°#Ó;J€EàO‰O˜TÑ!€EÜ—<’< §}¡}°Q¸Ó':Ó;¸eÑC€Là˜(Ó"Ø,ˆà .Ñ0€Lô —=‘=×(Ñ(¨¸2ÄUÇ]Á]Ð(ÐS×VÑVÐW\×WbÑWbÓc€LÜ—=‘=×(Ñ(¨¸×9QÑ9QÐ\b×\kÑ\kÐ(Ðl€LÜ—,’,˜|¨UÓ3€KØ×'Ñ'¨¨1Ó-×8Ñ8Ó:€KØ×"Ñ" 2 r¨3Ó/€KÞØ˜\Ð*Ð*Øˆ>ÐrX   r
  Útarget_dtypec	           	      óÎ  • U" XUS9nUR                   [        R                  [        R                  4;  n
U
(       ad  UR                   nUR	                  U5      n[        UUUU R                  (       a  U R                  OSU R                  US9nUR	                  U5      nO5[        UUUU R                  (       a  U R                  OSU R                  US9nUR                  Xg5      4$ )NrÅ   rd   )rŽ   rž   Ú	dropout_pÚdeterministicÚwindow_size)
r{   r   Úfloat16Úbfloat16r  r   r	  rA   rM   rŠ   )rÿ   r¦   r
  rŽ   rž   rD   r  rz   r  r  Úconvert_dtypeÚ
orig_dtypeÚattns                rV   Úflash_attention_forwardr%  &  sË   € ñ S¸JÑ
G€Cà—I‘I¤e§m¡m´U·^±^Ð%DÑD€MÞð —Y‘Yˆ
Øf‰f\Ó"ˆä/ØØ!Ø!Ø28·/·/f×.Ò.ÀsØ ×9Ñ9Ø'ñ
ˆð w‰wzÓ"‰ä/ØØ!Ø!Ø28·/·/f×.Ò.ÀsØ ×9Ñ9Ø'ñ
ˆð I‰IbÓÐ Ð rX   c                 óf  • U R                  XS9u  pšUR                  SS5      R                  SS9u  p¼n[        X¼Xš5      u  p¼US:w  a  Un[        R
                  " UUUU R                  (       a  U R                  OSUS9R                  SS5      R                  5       nUR                  US	U5      nU4$ )
Nr  r   r}   r   rø   r  rd   )r  Ú	attn_maskrx   )
r
  r  r  r   ÚFÚscaled_dot_product_attentionr	  rA   r£   rŠ   )rÿ   r¦   rs   r  rt   rD   r  rz   r  r§   r¨   r  r  r  r  s                  rV   Úsdpa_attention_forwardr*  Q  s»   € ð × Ñ  Ð Ð@H€CØŸ™ a¨Ó+×2Ñ2°qÐ2Ð9Ñ€Eä% e°#Ó;J€Eà˜(Ó"Ø,ˆô 	
×&Ò&ØØØØ28·/·/f×.Ò.ÀsØ$ñ	
÷ 
‰1a‹ß	‰‹ð ð ×"Ñ" 2 r¨3Ó/€KØˆ>ÐrX   )Úflash_attention_2ÚeagerÚsdpac                   óŒ   ^ • \ rS rSrSrSS\S\\   4U 4S jjjr SS\	R                  S\\   S\	R                  4S	 jjrS
rU =r$ )r   i{  an  Performs multi-headed self attention on a batch of unpadded sequences.

If Flash Attention 2 is installed, this module uses Flash Attention to improve throughput.
If Flash Attention 2 is not installed, the implementation will use PyTorch's SDPA kernel,
which requires padding and unpadding inputs, adding some overhead.

See `forward` method for additional details.
rÔ   Úlayer_idc                 óî  >• [         TU ]  5         Xl        X l        UR                  UR
                  -  S:w  a&  [        SUR                   SUR
                   S35      eUR                  U l        UR                  U l        UR
                  U l	        UR                  UR
                  -  U l
        U R                  U R                  -  U l        [        R                  " UR                  SU R                  -  UR                  S9U l        X!R                   -  S:w  aU  UR"                  S-  UR"                  S-  4U l        UR$                  b  UR$                  OUR&                  nUR"                  nOSU l        UR(                  nUR&                  nUR*                  S	:X  a  [-        U R                  XCS
9U l        O*[0        R2                  " U5      nX5l        [7        US9U l        [        R                  " UR                  UR                  UR                  S9U l        UR                  S:”  a   [        R:                  " UR                  5      O[        R<                  " 5       U l        [A        5       U l!        g )Nr   zThe hidden size (z6) is not a multiple of the number of attention heads (Ú)r   rñ   r   r  r+  )rz   rž   r¿   )rÔ   rd   )"r4   r5   rÔ   r/  r8   r;   rR   rA   rM   Ú	num_headsr  Úall_head_sizer   rò   r@   ÚWqkvrC   rD   rE   r&   r7   Ú_attn_implementationr¾   r
  ÚcopyÚdeepcopyr%   rþ   rõ   rÞ   ÚIdentityÚout_dropÚsetÚpruned_heads)rS   rÔ   r/  r%   r7   Úconfig_copyrU   s         €rV   r5   ÚModernBertAttention.__init__…  s  ø€ Ü‰ÑÔØŒØ Œà×Ñ × :Ñ :Ñ:¸aÓ?ÜØ# F×$6Ñ$6Ð#7Ð7mÐnt÷  oIñ  oIð  nJð  JKð  Lóð ð "(×!9Ñ!9ˆÔØ(.×(GÑ(GˆÔ%Ø×3Ñ3ˆŒØ×*Ñ*¨f×.HÑ.HÑHˆŒØ!Ÿ]™]¨T¯^©^Ñ;ˆÔÜ—I’I˜f×0Ñ0°!°d×6HÑ6HÑ2HÈv×OdÑOdÑeˆŒ	à×7Ñ7Ñ7¸1Ó<Ø$*×$:Ñ$:¸aÑ$?À×AWÑAWÐ[\ÑA\Ð#]ˆDÔ Ø4:×4KÑ4KÑ4W˜×0Ò0Ð]c×]uÑ]uˆJØ&,×&<Ñ&<Ñ#à#+ˆDÔ Ø&,×&DÑ&DÐ#Ø×1Ñ1ˆJà×&Ñ&Ð*=Ó=Ü?Ø—M‘MÐ.EñˆDOô Ÿ-š-¨Ó/ˆKØ%/Ô"Ü7¸{ÑKˆDŒOä—)’)˜F×.Ñ.°×0BÑ0BÈ×I^ÑI^Ñ_ˆŒØ@F×@XÑ@XÐ[^Ó@^œŸ
š
 6×#;Ñ#;Ô<Ôdf×doÒdoÓdqˆŒÜ›EˆÕrX   rê   r  rv   c           
      ó  • U R                  U5      nUR                  S   nU R                  R                  S:X  a)  UR	                  SSU R
                  U R                  5      nO)UR	                  USSU R
                  U R                  5      n[        U R                  R                     " U 4UU R                  U R                  UU R                  US.UD6nUS   nU R                  U R                  U5      5      nU4USS  -   $ )Nr   r+  rx   r   )r¦   r
  rD   r  rz   r  r}   )r4  r‰   rÔ   r5  rŠ   r2  r  ÚMODERNBERT_ATTENTION_FUNCTIONr
  rD   r3  r9  rõ   )rS   rê   r  rT   r¦   r  Úattn_outputss          rV   r®   ÚModernBertAttention.forward¬  só   € ð i‰i˜Ó&ˆà× Ñ  Ñ#ˆØ;‰;×+Ñ+Ð/BÓBØ—(‘(˜2˜q $§.¡.°$·-±-Ó@‰Cà—(‘(˜2˜r 1 d§n¡n°d·m±mÓDˆCä4°T·[±[×5UÑ5UÒVØð	
àØ—‘Ø ×0Ñ0ØØ×"Ñ"Ø/ñ	
ð ñ	
ˆð % Q™ˆØŸ™ d§g¡g¨mÓ&<Ó=ˆàÐ ,¨q¨rÐ"2Ñ2Ð2rX   )rõ   r4  r3  rA   rÔ   rM   r  r/  rD   r2  r9  r;  r
  rÎ   ©F)rg   rh   ri   rj   rk   r"   r   rƒ   r5   r   r¹   Úboolr®   ro   rp   rq   s   @rV   r   r   {  s]   ø† ññ%"Ð/ð %"¸8ÀC¹=÷ %"ð %"ðT -2ñ3à—|‘|ð3ð $ D™>ð3ð
 
‰÷3ó 3rX   c                   ót  ^ • \ rS rSrSS\S\\   4U 4S jjjr\R                  " SS9S\R                  S\R                  4S	 j5       r      SS\R                  S
\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\R                  4S jjrSrU =r$ )ÚModernBertEncoderLayeriÊ  rÔ   r/  c                 ó  >• [         TU ]  5         Xl        US:X  a  [        R                  " 5       U l        O9[        R                  " UR                  UR                  UR                  S9U l        [        XS9U l        [        R                  " UR                  UR                  UR                  S9U l        [        U5      U l        g )Nr   r×   )rÔ   r/  )r4   r5   rÔ   r   r8  Ú	attn_normrÜ   r8   r>   r?   r   r$  Úmlp_normrï   Úmlp©rS   rÔ   r/  rU   s      €rV   r5   ÚModernBertEncoderLayer.__init__Ë  s‰   ø€ Ü‰ÑÔØŒØq‹=ÜŸ[š[›]ˆDNäŸ\š\¨&×*<Ñ*<À&Ç/Á/ÐX^×XhÑXhÑiˆDŒNÜ'¨vÑIˆŒ	ÜŸš V×%7Ñ%7¸V¿_¹_ÐSY×ScÑScÑdˆŒÜ  Ó(ˆrX   Trâ   rê   rv   c                 óB   • U R                  U R                  U5      5      $ rÎ   )rI  rH  ©rS   rê   s     rV   Úcompiled_mlpÚ#ModernBertEncoderLayer.compiled_mlpÖ  s   € àx‰x˜Ÿ™ mÓ4Ó5Ð5rX   rs   r  rt   rŽ   rž   r  c           
      ó
  • U R                  U R                  U5      UUUUUUS9nXS   -   nU R                  R                  (       a  U R	                  U5      OU R                  U R                  U5      5      n	X-   nU4USS  -   $ )N©rs   r  rt   rŽ   rž   r  r   r}   )r$  rG  rÔ   rP   rN  rI  rH  )
rS   rê   rs   r  rt   rŽ   rž   r  r@  Ú
mlp_outputs
             rV   r®   ÚModernBertEncoderLayer.forwardÚ  s›   € ð —y‘yØN‰N˜=Ó)Ø)Ø 3Ø%Ø!Ø!Ø/ð !ð 
ˆð &°Q©Ñ7ˆð {‰{×,×,ð ×Ñ˜mÔ,à—‘˜$Ÿ-™-¨Ó6Ó7ð 	ð
 &Ñ2ˆàÐ ,¨q¨rÐ"2Ñ2Ð2rX   )r$  rG  rÔ   rI  rH  rÎ   )NNNNNF)rg   rh   ri   rj   r"   r   rƒ   r5   r   rì   r¹   rN  rí   rC  r®   ro   rp   rq   s   @rV   rE  rE  Ê  sõ   ø† ñ	)Ð/ð 	)¸8ÀC¹=÷ 	)ð 	)ð ‡]‚]˜4Ñ ð6¨%¯,©,ð 6¸5¿<¹<ó 6ó !ð6ð 26Ø6:Ø37Ø-1Ø$(Ø,1ñ3à—|‘|ð3ð ! §¡Ñ.ð3ð & e§l¡lÑ3ð	3ð
 ˜u×/Ñ/Ñ0ð3ð ˜UŸ\™\Ñ*ð3ð ˜S‘Mð3ð $ D™>ð3ð 
‰÷3ó 3rX   rE  c                   ó    ^ • \ rS rSr% \\S'   SrSrSS/rSr	Sr
SrS\R                  4S	 jr SS
\\   S\S\4U 4S jjjrS rU 4S jrSrU =r$ )ÚModernBertPreTrainedModeliø  rÔ   ÚmodelTrÓ   rE  Frÿ   c                 óþ  ^• U R                   R                  mTc  SmS[        R                  S[        4U4S jjnU R                   R
                  U R                   R
                  [        R                  " SU R                   R                  -  5      -  U R                   R
                  U R                   R                  S-  S.n[        U[        5      (       a  U" UR                  US   5        g [        U[        5      (       a-  U" UR                  US	   5        U" UR                  US
   5        g [        U[         5      (       a-  U" UR"                  US	   5        U" UR                  US
   5        g [        U[$        5      (       a  U" UR&                  US
   5        g [        U[(        5      (       a  U" UR*                  US
   5        g [        U[,        [.        [0        [2        45      (       a  U" UR4                  US   5        g [        U[        R6                  5      (       aX  UR8                  R:                  R=                  S5        UR>                  b%  UR>                  R:                  RA                  5         g g g )Nr   rÿ   Ústdc                 ó  >• [         R                  R                  U R                  SUT* U-  TU-  S9  [	        U [         R
                  5      (       a8  U R                  b*  [         R                  R                  U R                  5        g g g )Nrd   )r+   rX  ÚaÚb)r   ÚinitÚtrunc_normal_ÚweightÚ
isinstancerò   rÙ   Úzeros_)rÿ   rX  Úcutoff_factors     €rV   Úinit_weightÚ<ModernBertPreTrainedModel._init_weights.<locals>.init_weight  st   ø€ ÜG‰G×!Ñ!Ø—‘ØØØ . 3Ñ&Ø #Ñ%ð "ñ ô ˜&¤"§)¡)×,Ñ,Ø—;‘;Ñ*Ü—G‘G—N‘N 6§;¡;Õ/ð +ð -rX   ra   r  )ÚinÚoutÚ	embeddingÚ	final_outrf  rd  re  rg  g      ð?)!rÔ   r=   r   ÚModulerÏ   r<   ÚmathÚsqrtr:   r8   r_  rÓ   rÛ   rï   ró   rõ   r   r4  ÚModernBertPredictionHeadÚdenseÚModernBertForMaskedLMÚdecoderÚ#ModernBertForSequenceClassificationÚModernBertForMultipleChoiceÚ ModernBertForTokenClassificationÚModernBertForQuestionAnsweringÚ
classifierrÜ   r^  ÚdataÚfill_rÙ   Úzero_)rS   rÿ   rb  Ústdsra  s       @rV   Ú_init_weightsÚ'ModernBertPreTrainedModel._init_weights  sæ  ø€ ØŸ™×=Ñ=ˆØÑ ØˆMð	0¤§	¡	ð 	0´÷ 	0ð —+‘+×/Ñ/Ø—;‘;×0Ñ0´4·9²9¸SÀ4Ç;Á;×C`ÑC`Ñ=`Ó3aÑaØŸ™×6Ñ6ØŸ™×0Ñ0°$Ñ6ñ	
ˆô fÔ2×3Ñ3Ù˜×-Ñ-¨t°KÑ/@ÕAÜ˜¤×.Ñ.Ù˜Ÿ	™	 4¨¡:Ô.Ù˜Ÿ	™	 4¨¡;Õ/Ü˜Ô 3×4Ñ4Ù˜Ÿ™ T¨$¡ZÔ0Ù˜Ÿ	™	 4¨¡;Õ/Ü˜Ô 8×9Ñ9Ù˜Ÿ™ d¨5¡kÕ2Ü˜Ô 5×6Ñ6Ù˜Ÿ™¨¨U©Õ4ÜØä3Ü+Ü0Ü.ð	÷
ñ 
ñ ˜×)Ñ)¨4°Ñ+<Õ=Ü˜¤§¡×-Ñ-ØM‰M×Ñ×$Ñ$ SÔ)Ø{‰{Ñ&Ø—‘× Ñ ×&Ñ&Õ(ð 'ð .rX   Úattn_implementationÚis_init_checkrv   c                 ó„   >•  Uc  U R                  5       (       a  SOUn[        TU ]  XS9$ ! [        [        4 a     Nf = f)zB
Checks and dispatches to hhe requested attention implementation.
r+  )rz  r{  )Ú_flash_attn_2_can_dispatchrR   ÚImportErrorr4   Ú%_check_and_adjust_attn_implementation)rS   rz  r{  rU   s      €rV   r  Ú?ModernBertPreTrainedModel._check_and_adjust_attn_implementation6  s`   ø€ ð	ð 'Ñ.°4×3RÑ3R×3TÑ3Tñ $à(ð  ô ‰wÑ<Ø 3ð =ð 
ð 	
øô œKÐ(ó 	Ùð	ús   ƒ, ¬?¾?c                 óÜ  • U R                   R                  SL a  g [        U S5      (       aZ  [        U R                  5      S:”  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                  R                  S:X  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                  R                  S:X  aA  U R                   R                  (       a  [
        R                  S5        SU R                   l        U R                   R                  c  [        5       U R                   l        g g )	NFÚhf_device_mapr}   zqIf `accelerate` split the model across devices, `torch.compile` will not work. Falling back to non-compiled mode.Úmpsz|Compiling the model with `torch.compile` and using a `torch.mps` device is not supported. Falling back to non-compiled mode.Úcpuz|Compiling the model with `torch.compile` and using a `torch.cpu` device is not supported. Falling back to non-compiled mode.)
rÔ   rP   ÚhasattrÚlenr‚  ÚloggerÚwarning_oncer—   Útyper   rË   s    rV   Ú_maybe_set_compileÚ,ModernBertPreTrainedModel._maybe_set_compileM  s  € Ø;‰;×(Ñ(¨EÒ1Øä4˜×)Ñ)¬c°$×2DÑ2DÓ.EÈÓ.IØ{‰{×,×,Ü×#Ñ#ð9ôð -2ˆDK‰KÔ)à;‰;×Ñ˜uÓ$Ø{‰{×,×,Ü×#Ñ#ð9ôð -2ˆDK‰KÔ)à;‰;×Ñ˜uÓ$Ø{‰{×,×,Ü×#Ñ#ð9ôð -2ˆDK‰KÔ)à;‰;×(Ñ(Ñ0Ü,?Ó,AˆDK‰KÕ)ð 1rX   c                 óÞ   >• [         TU ]  " U0 UD6nU R                  R                  S;   aA  U R                  R                  (       a  [        R                  S5        SU R                  l        U$ )N>   NTzcResizing token embeddings with `torch.compile` is not supported. Falling back to non-compiled mode.F)r4   Úresize_token_embeddingsrÔ   rP   r‡  rˆ  )rS   ÚargsrT   Úmodel_embedsrU   s       €rV   r  Ú1ModernBertPreTrainedModel.resize_token_embeddingsl  s[   ø€ Ü‘wÒ6¸ÐGÀÑGˆà;‰;×(Ñ(¨LÓ8Ø{‰{×,×,Ü×#Ñ#Øyôð -2ˆDK‰KÔ)àÐrX   r3   rB  )rg   rh   ri   rj   r"   Ú__annotations__Úbase_model_prefixÚsupports_gradient_checkpointingÚ_no_split_modulesÚ_supports_flash_attnÚ_supports_sdpaÚ_supports_flex_attnr   rh  rx  r   rÑ   rC  r  rŠ  r  ro   rp   rq   s   @rV   rU  rU  ø  s€   ø‡ àÓØÐØ&*Ð#Ø/Ð1IÐJÐØÐØ€NØÐð2) B§I¡Iô 2)ðj INñ
Ø#+¨C¡=ð
ØAEð
à	÷
ð 
ò.B÷>
ó 
rX   rU  c            !       óæ  ^ • \ rS rSrS\4U 4S jjrS rS r\             SS\	\
R                     S\	\
R                     S\	\
R                     S	\	\
R                     S
\	\
R                     S\	\
R                     S\	\
R                     S\	\   S\	\   S\	\   S\	\   S\	\   S\	\   S\\\
R                  S4   \4   4S jj5       rS\
R                  S\S\
R                  4S jrSrU =r$ )ÚModernBertModeliy  rÔ   c           	      óˆ  >• [         TU ]  U5        Xl        [        U5      U l        [
        R                  " [        UR                  5       Vs/ sH  n[        X5      PM     sn5      U l
        [
        R                  " UR                  UR                  UR                  S9U l        SU l        U R#                  5         g s  snf )Nr×   F)r4   r5   rÔ   rÓ   Ú
embeddingsr   Ú
ModuleListÚranger:   rE  ÚlayersrÜ   r8   r>   r?   Ú
final_normÚgradient_checkpointingÚ	post_initrJ  s      €rV   r5   ÚModernBertModel.__init__{  s•   ø€ Ü‰Ñ˜Ô ØŒÜ.¨vÓ6ˆŒÜ—m’mÜFKÈF×LdÑLdÔFeÓfÑFe¸(Ô# FÖ5ÑFeÑfó
ˆŒô Ÿ,š, v×'9Ñ'9¸v¿¹ÐU[×UeÑUeÑfˆŒØ&+ˆÔ#Ø‰Õùò	 gs   ÁB?c                 ó.   • U R                   R                  $ rÎ   ©r›  rÛ   rË   s    rV   Úget_input_embeddingsÚ$ModernBertModel.get_input_embeddings†  s   € Ø‰×-Ñ-Ð-rX   c                 ó$   • XR                   l        g rÎ   r¤  )rS   r  s     rV   Úset_input_embeddingsÚ$ModernBertModel.set_input_embeddings‰  s   € Ø).‰Õ&rX   rä   rs   r  rt   rè   rŒ   rŽ   rž   Ú
batch_sizeÚseq_lenr  Úoutput_hidden_statesÚreturn_dictrv   .c                 ó6  ^^	^
• Ub  UOU R                   R                  nUb  UOU R                   R                  nUb  UOU R                   R                  nUSL USL-  (       a  [	        S5      eU(       a  SOSnU(       a  SOSnU R                  5         Ub  U R                  X5        T	c+  T
c(  Ub  UR                  SS u  m	m
OUR                  SS u  m	m
Ub  UR                  OUR                  nUc&  [        R                  " T	T
4U[        R                  S9nSnU R                   R                  S:X  aH  TcD  UcA  Uc>  SnUc,  [        R                  " 5          [        XS	9tnmpxnSSS5        OF[        XRS	9tnmpxnO8Uc$  [        R                  " T
US
9R!                  S5      nU R#                  X+S9u  p#U R%                  XS9nU R&                   HD  nU(       a  UU4-   nU" UUUUUUUS9nUS   nU(       d  M*  [)        U5      S:”  d  M;  UUS   4-   nMF     U(       a  UU4-   nU R+                  U5      nU(       a&  [-        UTT	T
S9nUb  [/        U	UU
4S jU 5       5      nU(       d  [/        S UXï4 5       5      $ [1        UUUS9$ ! , (       d  f       Nâ= f)áF  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nz:You must specify exactly one of input_ids or inputs_embedsr3   r   rÁ   Fr+  T)rr   rs   )r—   r   )r  )rä   rè   rQ  r}   ©rr   rŒ   r   r‘   c              3   ó:   >#   • U H  n[        UTTTS 9v •  M     g7f)r°  N)r›   )Ú.0Úhsrª  rŒ   r«  s     €€€rV   Ú	<genexpr>Ú*ModernBertModel.forward.<locals>.<genexpr>ú  s$   øé € ð *á/˜ô +°"¸gÈZÐ`gÖhÚ/ùs   ƒc              3   ó,   #   • U H  oc  M  Uv •  M     g 7frÎ   r3   )r²  Úvs     rV   r´  rµ     s   é € ÐmÑ$[˜qŸ™Ò$[ùs   ‚‹	)Úlast_hidden_staterê   Ú
attentions)rÔ   r  r¬  Úuse_return_dictrR   rŠ  Ú%warn_if_padding_and_no_attention_maskr‰   r—   r   ÚonesrC  r5  Úno_gradr•   ÚarangeÚ	unsqueezeÚ_update_attention_maskr›  rž  r†  rŸ  r›   rÐ   r   )rS   rä   rs   r  rt   rè   rŒ   rŽ   rž   rª  r«  r  r¬  r­  Úall_hidden_statesÚall_self_attentionsr—   Úrepadrš   rê   Úencoder_layerÚlayer_outputss         `  ``           rV   r®   ÚModernBertModel.forwardŒ  sæ  ú€ ðB 2CÑ1NÑ-ÐTX×T_ÑT_×TqÑTqÐà$8Ñ$DÑ È$Ï+É+×JjÑJjð 	ð &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆà˜Ð -°tÐ";×<ÜÐYÓZÐZæ"6™B¸DÐÞ$5™b¸4Ðà×ÑÔ!àÑ Ø×6Ñ6°yÔQàÑ '¡/ØÑ(Ø&3×&9Ñ&9¸"¸1Ð&=Ñ#
™Gà&/§o¡o°b°qÐ&9Ñ#
˜GØ%.Ñ%:×!Ò!À×@TÑ@TˆàÑ!Ü"ŸZšZ¨°WÐ(=ÀfÔTY×T^ÑT^Ñ_ˆNàˆØ;‰;×+Ñ+Ð/BÓBØ‰ :Ñ#5¸*Ñ:LØØ Ñ(ÜŸšÜI`Ø#,ñJÐF˜	 7¨JÀQ÷ )˜ô
 JaØ,ñJÐFM 7¨JÀQøð Ñ#Ü$Ÿ|š|¨G¸FÑC×MÑMÈaÓPà26×2MÑ2MØð 3Nð 3Ñ/ˆNð Ÿ™°)˜ÐYˆà!Ÿ[œ[ˆMÞ#Ø$5¸Ð8HÑ$HÐ!á)ØØ-Ø$7Ø)Ø%Ø%Ø"3ñˆMð *¨!Ñ,ˆMß Ð ¤S¨Ó%7¸!Õ%;Ø&9¸]È1Ñ=MÐ<OÑ&OÒ#ñ )ö"  Ø 1°]Ð4DÑ DÐàŸ™¨Ó6ˆæÜ2Ø$¨g¸ZÐPWñˆMð !Ñ,Ü$)ö *á/ó*ó %Ð!ö
 ÜÑm ]Ð4EÑ$[ÓmÓmÐmÜØ+Ø+Ø*ñ
ð 	
÷i )ús   ÅJ
Ê

Jc                 ó,  • U(       aˆ  U R                   R                  S:X  a'  [        R                  S5        SU R                   l        OGU R                   R                  S:w  a-  [        R                  SU R                   R                   S35        [	        XR
                  5      n[        R                  " UR                  S   5      R                  S5      n[        R                  " XDR                  -
  5      nXPR                   R                  S-  :*  R                  S5      R                  S5      R                  UR                  5      nUR                  UR!                  5       [        R"                  " U R
                  5      R$                  5      nX74$ )Nr-  z’Outputting attentions is only supported with the 'eager' attention implementation, not with "sdpa". Falling back to `attn_implementation="eager"`.r,  zZOutputting attentions is only supported with the eager attention implementation, not with zT. Consider setting `attn_implementation="eager"`. Setting `output_attentions=False`.r   r   )rÔ   r5  r‡  rˆ  r   r{   r   r¾  r‰   r¿  ÚabsÚTrD   r  r—   Úmasked_fillÚlogical_notÚfinfoÚmin)rS   rs   r  Úglobal_attention_maskÚrowsÚdistanceÚwindow_maskr  s           rV   rÀ  Ú&ModernBertModel._update_attention_mask  sJ  € ÞØ{‰{×/Ñ/°6Ó9Ü×#Ñ#ðVôð 4;—‘Õ0Ø—‘×1Ñ1°WÓ<Ü×#Ñ#ð Ø $§¡× @Ñ @ÐAð B:ð:ôô !;¸>Ï:É:Ó VÐô |Š|Ð1×7Ñ7¸Ñ:Ó;×EÑEÀaÓHˆä—9’9˜T§F¡F™]Ó+ˆð Ÿ™×4Ñ4¸Ñ9Ñ9×DÑDÀQÓG×QÑQÐRSÓT×WÑWÐXf×XmÑXmÓnð 	ð 4×?Ñ?À×@WÑ@WÓ@YÔ[`×[fÒ[fÐgk×gqÑgqÓ[r×[vÑ[vÓwÐà$Ð9Ð9rX   )rÔ   r›  rŸ  r   rž  ©NNNNNNNNNNNNN)rg   rh   ri   rj   r"   r5   r¥  r¨  r   r   r   rí   r¹   rƒ   rC  r   rÐ   r   r®   rÀ  ro   rp   rq   s   @rV   r™  r™  y  s  ø† ð	Ð/÷ 	ò.ò/ð ð 15Ø15Ø6:Ø37Ø04Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñx
à˜E×,Ñ,Ñ-ðx
ð ! §¡Ñ.ðx
ð & e§l¡lÑ3ð	x
ð
 ˜u×/Ñ/Ñ0ðx
ð   §¡Ñ-ðx
ð ˜%Ÿ,™,Ñ'ðx
ð ˜UŸ\™\Ñ*ðx
ð ˜S‘Mðx
ð ˜S‘Mðx
ð ˜#‘ðx
ð $ D™>ðx
ð ' t™nðx
ð ˜d‘^ðx
ð 
ˆuU—\‘\ 3Ð&Ñ'¨Ð8Ñ	9ôx
ó ðx
ðt:°U·\±\ð :ÐVZð :Ð_d×_kÑ_k÷ :ò :rX   r™  c                   ój   ^ • \ rS rSrS\4U 4S jjrS\R                  S\R                  4S jrSr	U =r
$ )rk  i'  rÔ   c                 óF  >• [         TU ]  5         Xl        [        R                  " UR
                  UR
                  UR                  5      U l        [        UR                     U l
        [        R                  " UR
                  UR                  UR                  S9U l        g )Nr×   )r4   r5   rÔ   r   rò   r8   rK   rl  r   rL   rô   rÜ   r>   r?   rÝ   rà   s     €rV   r5   Ú!ModernBertPredictionHead.__init__(  so   ø€ Ü‰ÑÔØŒÜ—Y’Y˜v×1Ñ1°6×3EÑ3EÀv×G]ÑG]Ó^ˆŒ
Ü˜&×6Ñ6Ñ7ˆŒÜ—L’L ×!3Ñ!3¸¿¹Èv×O_ÑO_Ñ`ˆ	rX   rê   rv   c                 ó`   • U R                  U R                  U R                  U5      5      5      $ rÎ   )rÝ   rô   rl  rM  s     rV   r®   Ú ModernBertPredictionHead.forward/  s#   € Øy‰y˜Ÿ™ $§*¡*¨]Ó";Ó<Ó=Ð=rX   )rô   rÔ   rl  rÝ   )rg   rh   ri   rj   r"   r5   r   r¹   r®   ro   rp   rq   s   @rV   rk  rk  '  s2   ø† ðaÐ/÷ að> U§\¡\ð >°e·l±l÷ >ò >rX   rk  zd
    The ModernBert Model with a decoder head on top that is used for masked language modeling.
    )Úcustom_introc            "       óF  ^ • \ rS rSrS/rS\4U 4S jjrS rS\R                  4S jr
\R                  " SS	9S
\R                  S\R                  4S j5       r\              SS\\R"                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )rm  i3  zdecoder.weightrÔ   c                 ón  >• [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  " UR                  UR                  UR                  S9U l        U R                  R                  U l        U R                  R                  U l        U R                  5         g )Nrñ   )r4   r5   rÔ   r™  rV  rk  Úheadr   rò   r8   r6   rI   rn  rN   rO   r¡  rà   s     €rV   r5   ÚModernBertForMaskedLM.__init__;  sƒ   ø€ Ü‰Ñ˜Ô ØŒÜ$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—y’y ×!3Ñ!3°V×5FÑ5FÈV×M`ÑM`ÑaˆŒà!%§¡×!>Ñ!>ˆÔØ(,¯©×(LÑ(LˆÔ%ð 	‰ÕrX   c                 ó   • U R                   $ rÎ   ©rn  rË   s    rV   Úget_output_embeddingsÚ+ModernBertForMaskedLM.get_output_embeddingsH  s   € Ø|‰|ÐrX   Únew_embeddingsc                 ó   • Xl         g rÎ   rß  )rS   râ  s     rV   Úset_output_embeddingsÚ+ModernBertForMaskedLM.set_output_embeddingsK  s   € Ø%rX   Trâ   r\   rv   c                 óB   • U R                  U R                  U5      5      $ rÎ   )rn  rÜ  )rS   r\   s     rV   Úcompiled_headÚ#ModernBertForMaskedLM.compiled_headN  s   € à|‰|˜DŸI™I fÓ-Ó.Ð.rX   rä   rs   r  rt   rè   ru   rŒ   rŽ   rž   rª  r«  r  r¬  r­  c                 ól  • Ub  UOU R                   R                  nU R                  5         U R                   R                  S:X  a¶  Uc³  Uc°  U	c­  U
c)  Uc&  Ub  UR                  SS u  p«OUR                  SS u  p«Ub  UR
                  OUR
                  nUc%  [        R                  " X«4U[        R                  S9nUc-  [        R                  " 5          [        XXFS9u  pp‰pFSSS5        O[        XRXFS9u  pWp‰pFU R                  UUUUUUUU	U
UUUUS9nUS   nU R                  (       aK  UbH  UR                  S5      nUR                  UR                  S   S5      nX`R                  :g  nUU   nUU   nU R                   R                  (       a  U R!                  U5      OU R#                  U R%                  U5      5      nSnUb*  U R&                  " UU4S	U R                   R(                  0UD6nU R                   R                  S:X  aQ  U R                   R*                  (       d  Uc
  [-        5       O[        R                  " 5          [/        UXzUS
9nSSS5        U(       d  U4nUb  U4U-   $ U$ [1        UUUR2                  UR4                  S9$ ! , (       d  f       GN¦= f! , (       d  f       NU= f)r¯  Nr+  r   rÁ   )rr   rs   rt   ru   ©rä   rs   r  rt   rè   rŒ   rŽ   rž   rª  r«  r  r¬  r­  r   rx   r6   r°  ©ÚlossÚlogitsrê   r¹  )rÔ   rº  rŠ  r5  r‰   r—   r   r¼  rC  r½  r•   rV  rN   rŠ   rO   rP   rç  rn  rÜ  Úloss_functionr6   rQ   r   r›   r   rê   r¹  )rS   rä   rs   r  rt   rè   ru   rŒ   rŽ   rž   rª  r«  r  r¬  r­  rT   r—   Úoutputsr¸  Úmask_tokensrí  rì  r\   s                          rV   r®   ÚModernBertForMaskedLM.forwardR  sÁ  € ðF &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ×ÑÔ!à;‰;×+Ñ+Ð/BÓBØ‰ :Ñ#5¸*Ñ:LØÑ%¨'©/Ø$Ñ0Ø.;×.AÑ.AÀ"À1Ð.EÑ+˜
 Gà.7¯o©o¸b¸qÐ.AÑ+˜
Ø-6Ñ-B˜×)Ò)È×H\ÑH\à!Ñ)Ü%*§Z¢Z°Ð0EÈfÔ\a×\fÑ\fÑ%gNà Ñ(ÜŸšÜ[rØ#,ÐZfñ\ÑX˜	¨JÀL÷ )˜ô
 \sØ,ÐZfñ\ÑXM¨JÀLð —*‘*ØØ)Ø 3Ø%Ø'ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà×!×! fÑ&8à—[‘[ “_ˆFØ 1× 6Ñ 6°v·|±|ÀA±ÈÓ KÐð !×$AÑ$AÑAˆKØ 1°+Ñ >ÐØ˜KÑ(ˆFð {‰{×,×,ð ×ÑÐ0Ô1à—‘˜dŸi™iÐ(9Ó:Ó;ð 	ð ˆØÑØ×%Ò% f¨fÑbÀÇÁ×AWÑAWÐbÐ[aÑbˆDà;‰;×+Ñ+Ð/BÓBØ"&§+¡+×"D×"DÈÉ””Ô\a×\iÒ\iÓ\kÑkÜ/°vÀwÐipÑq÷ lö ØYˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEäØØØ!×/Ñ/Ø×)Ñ)ñ	
ð 	
÷m )žú÷^ lÕkús   ÃJÉ
J%Ê
J"Ê%
J3)rÔ   rn  rÜ  rV  rO   rN   ©NNNNNNNNNNNNNN)rg   rh   ri   rj   Ú_tied_weights_keysr"   r5   rà  r   rò   rä  r   rì   r¹   rç  r   r   rí   rƒ   rC  r   rÐ   r   r®   ro   rp   rq   s   @rV   rm  rm  3  s½  ø† ð +Ð+ÐðÐ/÷ òð&°B·I±Iô &ð ‡]‚]˜4Ñ ð/ E§L¡Lð /°U·\±\ó /ó !ð/ð ð 15Ø15Ø6:Ø/3Ø04Ø)-Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñm
à˜E×,Ñ,Ñ-ðm
ð ! §¡Ñ.ðm
ð & e§l¡lÑ3ð	m
ð
 ˜uŸ|™|Ñ,ðm
ð   §¡Ñ-ðm
ð ˜Ÿ™Ñ&ðm
ð ˜%Ÿ,™,Ñ'ðm
ð ˜UŸ\™\Ñ*ðm
ð ˜S‘Mðm
ð ˜S‘Mðm
ð ˜#‘ðm
ð $ D™>ðm
ð ' t™nðm
ð ˜d‘^ðm
ð" 
ˆuU—\‘\Ñ" NÐ2Ñ	3ô#m
ó öm
rX   rm  z`
    The ModernBert Model with a sequence classification head on top that performs pooling.
    c            "       ó¸  ^ • \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )ro  iÃ  rÔ   c                 ón  >• [         TU ]  U5        UR                  U l        Xl        [	        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R!                  5         g rÎ   )r4   r5   Ú
num_labelsrÔ   r™  rV  rk  rÜ  r   r   rÞ   rJ   rß   rò   r8   rs  r¡  rà   s     €rV   r5   Ú,ModernBertForSequenceClassification.__init__É  s€   ø€ Ü‰Ñ˜Ô Ø ×+Ñ+ˆŒØŒä$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—H‘H×$Ñ$ V×%>Ñ%>Ó?ˆŒ	ÜŸ)š) F×$6Ñ$6¸×8IÑ8IÓJˆŒð 	‰ÕrX   rä   rs   r  rt   rè   ru   rŒ   rŽ   rž   rª  r«  r  r¬  r­  rv   c                 ót  • Ub  UOU R                   R                  nU R                  5         U R                  UUUUUUUU	U
UUUUS9nUS   nU R                   R                  S:X  a
  USS2S4   nOLU R                   R                  S:X  a2  UUR                  S5      -  R                  SS9UR                  SS	S
9-  nU R                  U5      nU R                  U5      nU R                  U5      nSnUGb  U R                   R                  c‘  U R                  S:X  a  SU R                   l
        OoU R                  S:”  aN  UR                  [        R                  :X  d  UR                  [        R                  :X  a  SU R                   l
        OSU R                   l
        U R                   R                  S:X  aJ  [!        5       nU R                  S:X  a&  U" UR#                  5       UR#                  5       5      nOŽU" UU5      nO„U R                   R                  S:X  a=  [%        5       nU" UR'                  SU R                  5      UR'                  S5      5      nO-U R                   R                  S:X  a  [)        5       nU" UU5      nU(       d  U4nUb  U4U-   $ U$ [+        UUUR,                  UR.                  S9$ )aª  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
    config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
    `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nrê  r   r*   r+   rx   r}   rø   T©rz   ÚkeepdimÚ
regressionÚsingle_label_classificationÚmulti_label_classificationrë  )rÔ   rº  rŠ  rV  r(   r¿  r~   rÜ  rß   rs  Úproblem_typerö  r{   r   Úlongrƒ   r
   Úsqueezer	   rŠ   r   r   rê   r¹  )rS   rä   rs   r  rt   rè   ru   rŒ   rŽ   rž   rª  r«  r  r¬  r­  rT   rï  r¸  Úpooled_outputrí  rì  Úloss_fctr\   s                          rV   r®   Ú+ModernBertForSequenceClassification.forwardÖ  s€  € ðN &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ×ÑÔ!à—*‘*ØØ)Ø 3Ø%Ø'ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà;‰;×)Ñ)¨UÓ2Ø 1²!°Q°$Ñ 7ÑØ[‰[×+Ñ+¨vÓ5Ø!2°^×5MÑ5MÈbÓ5QÑ!Q× VÑ VÐ[\Ð VÐ ]Ð`n×`rÑ`rØ˜tð asð añ !Ðð Ÿ	™	Ð"3Ó4ˆØŸ	™	 -Ó0ˆØ—‘ Ó/ˆàˆØÒØ{‰{×'Ñ'Ñ/Ø—?‘? aÓ'Ø/;D—K‘KÕ,Ø—_‘_ qÓ(¨f¯l©l¼e¿j¹jÓ.HÈFÏLÉLÔ\a×\eÑ\eÓLeØ/LD—K‘KÕ,à/KD—K‘KÔ,à{‰{×'Ñ'¨<Ó7Ü"›9Ø—?‘? aÓ'Ù# F§N¡NÓ$4°f·n±nÓ6FÓG‘Dá# F¨FÓ3‘DØ—‘×)Ñ)Ð-JÓJÜ+Ó-Ù §¡¨B°·±Ó @À&Ç+Á+ÈbÃ/ÓR‘Ø—‘×)Ñ)Ð-IÓIÜ,Ó.Ù ¨Ó/æØYˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEä'ØØØ!×/Ñ/Ø×)Ñ)ñ	
ð 	
rX   )rs  rÔ   rß   rÜ  rV  rö  rò  )rg   rh   ri   rj   r"   r5   r   r   r   rí   r¹   rƒ   rC  r   rÐ   r   r®   ro   rp   rq   s   @rV   ro  ro  Ã  sk  ø† ðÐ/÷ ð ð 15Ø15Ø6:Ø/3Ø04Ø)-Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñe
à˜E×,Ñ,Ñ-ðe
ð ! §¡Ñ.ðe
ð & e§l¡lÑ3ð	e
ð
 ˜uŸ|™|Ñ,ðe
ð   §¡Ñ-ðe
ð ˜Ÿ™Ñ&ðe
ð ˜%Ÿ,™,Ñ'ðe
ð ˜UŸ\™\Ñ*ðe
ð ˜S‘Mðe
ð ˜S‘Mðe
ð ˜#‘ðe
ð $ D™>ðe
ð ' t™nðe
ð ˜d‘^ðe
ð" 
ˆuU—\‘\Ñ"Ð$<Ð<Ñ	=ô#e
ó öe
rX   ro  zv
    The ModernBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c            "       ó¸  ^ • \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )rq  i?  rÔ   c                 ób  >• [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g rÎ   ©r4   r5   rö  r™  rV  rk  rÜ  r   r   rÞ   rJ   rß   rò   r8   rs  r¡  rà   s     €rV   r5   Ú)ModernBertForTokenClassification.__init__E  s{   ø€ Ü‰Ñ˜Ô Ø ×+Ñ+ˆŒä$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—H‘H×$Ñ$ V×%>Ñ%>Ó?ˆŒ	ÜŸ)š) F×$6Ñ$6¸×8IÑ8IÓJˆŒð 	‰ÕrX   rä   rs   r  rt   rè   ru   rŒ   rŽ   rž   rª  r«  r  r¬  r­  rv   c                 óö  • Ub  UOU R                   R                  nU R                  5         U R                  UUUUUUUU	U
UUUUS9nUS   nU R	                  U5      nU R                  U5      nU R                  U5      nSnUb<  [        5       nU" UR                  SU R                  5      UR                  S5      5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                  UR                  S9$ )a  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
    Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nrê  r   rx   r}   rë  )rÔ   rº  rŠ  rV  rÜ  rß   rs  r	   rŠ   rö  r   rê   r¹  )rS   rä   rs   r  rt   rè   ru   rŒ   rŽ   rž   rª  r«  r  r¬  r­  rï  r¸  rí  rì  r  r\   s                        rV   r®   Ú(ModernBertForTokenClassification.forwardQ  s#  € ðH &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ×ÑÔ!à—*‘*ØØ)Ø 3Ø%Ø'ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà ŸI™IÐ&7Ó8ÐØ ŸI™IÐ&7Ó8ÐØ—‘Ð!2Ó3ˆàˆØÑÜ'Ó)ˆHÙ˜FŸK™K¨¨D¯O©OÓ<¸f¿k¹kÈ"»oÓNˆDæØY ¨¨ Ñ,ˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEä$ØØØ!×/Ñ/Ø×)Ñ)ñ	
ð 	
rX   ©rs  rß   rÜ  rV  rö  rò  )rg   rh   ri   rj   r"   r5   r   r   r   rí   r¹   rƒ   rC  r   rÐ   r   r®   ro   rp   rq   s   @rV   rq  rq  ?  sk  ø† ð
Ð/÷ 
ð ð 15Ø15Ø6:Ø/3Ø04Ø)-Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñI
à˜E×,Ñ,Ñ-ðI
ð ! §¡Ñ.ðI
ð & e§l¡lÑ3ð	I
ð
 ˜uŸ|™|Ñ,ðI
ð   §¡Ñ-ðI
ð ˜Ÿ™Ñ&ðI
ð ˜%Ÿ,™,Ñ'ðI
ð ˜UŸ\™\Ñ*ðI
ð ˜S‘MðI
ð ˜S‘MðI
ð ˜#‘ðI
ð $ D™>ðI
ð ' t™nðI
ð ˜d‘^ðI
ð  
ˆuU—\‘\Ñ"Ð$9Ð9Ñ	:ô!I
ó öI
rX   rq  c            "       ó¶  ^ • \ rS rSrS\4U 4S jjr\             SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\
   S\\
   S\\
   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )rr  iž  rÔ   c                 ób  >• [         TU ]  U5        UR                  U l        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  UR                  5      U l        U R                  5         g rÎ   r  rà   s     €rV   r5   Ú'ModernBertForQuestionAnswering.__init__   sy   ø€ Ü‰Ñ˜Ô Ø ×+Ñ+ˆŒä$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—H‘H×$Ñ$ V×%>Ñ%>Ó?ˆŒ	ÜŸ)š) F×$6Ñ$6¸×8IÑ8IÓJˆŒà‰ÕrX   rä   rs   r  rt   Ústart_positionsÚend_positionsrŒ   rŽ   rž   rª  r«  r  r¬  r­  rv   c                 óR  • Ub  UOU R                   R                  nU R                  5         U R                  UUUUUUU	U
UUUUS9nUS   nU R	                  U5      nU R                  U5      nU R                  U5      nUR                  SSS9u  nnUR                  S5      R                  5       nUR                  S5      R                  5       nSnUb  Ub  U R                  " UUXV40 UD6nU(       d  UU4USS -   nUb  U4U-   $ U$ [        UUUUR                  UR                  S9$ )r¯  N)rs   r  rt   rŒ   rŽ   rž   rª  r«  r  r¬  r­  r   r}   rx   rø   )rì  Ústart_logitsÚ
end_logitsrê   r¹  )rÔ   rº  rŠ  rV  rÜ  rß   rs  Úsplitr   r£   rî  r   rê   r¹  )rS   rä   rs   r  rt   r  r  rŒ   rŽ   rž   rª  r«  r  r¬  r­  rT   rï  r¸  rí  r  r  rì  r\   s                          rV   r®   Ú&ModernBertForQuestionAnswering.forward«  sc  € ðF &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ×ÑÔ!à—*‘*ØØ)Ø 3Ø%ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà ŸI™IÐ&7Ó8ÐØ ŸI™IÐ&7Ó8ÐØ—‘Ð!2Ó3ˆà#)§<¡<°°r <Ð#:Ñ ˆjØ#×+Ñ+¨BÓ/×:Ñ:Ó<ˆØ×'Ñ'¨Ó+×6Ñ6Ó8ˆ
àˆØÑ&¨=Ñ+DØ×%Ò% l°JÀÑiÐbhÑiˆDæØ" JÐ/°'¸!¸"°+Ñ=ˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEä+ØØ%Ø!Ø!×/Ñ/Ø×)Ñ)ñ
ð 	
rX   r
  rÓ  )rg   rh   ri   rj   r"   r5   r   r   r   r¹   rƒ   rC  r   rÐ   r   r®   ro   rp   rq   s   @rV   rr  rr  ž  sf  ø† ð	Ð/÷ 	ð ð 26Ø6:Ø/3Ø26Ø04Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñK
à˜EŸL™LÑ)ðK
ð ! §¡Ñ.ðK
ð & e§l¡lÑ3ð	K
ð
 ˜uŸ|™|Ñ,ðK
ð " %§,¡,Ñ/ðK
ð   §¡Ñ-ðK
ð ˜%Ÿ,™,Ñ'ðK
ð ˜UŸ\™\Ñ*ðK
ð ˜S‘MðK
ð ˜S‘MðK
ð ˜#‘ðK
ð $ D™>ðK
ð ' t™nðK
ð ˜d‘^ðK
ð" 
ˆuU—\‘\Ñ"Ð$@Ð@Ñ	Aô#K
ó öK
rX   rr  z«
    The ModernBert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a softmax) e.g. for RocStories/SWAG tasks.
    c            "       ó¸  ^ • \ rS rSrS\4U 4S jjr\              SS\\R                     S\\R                     S\\R                     S\\R                     S\\R                     S	\\R                     S
\\R                     S\\R                     S\\   S\\   S\\   S\\   S\\   S\\   S\\\R                     \4   4S jj5       rSrU =r$ )rp  iú  rÔ   c                 ó8  >• [         TU ]  U5        Xl        [        U5      U l        [        U5      U l        [        R                  R                  UR                  5      U l        [        R                  " UR                  S5      U l        U R                  5         g )Nr}   )r4   r5   rÔ   r™  rV  rk  rÜ  r   r   rÞ   rJ   rß   rò   r8   rs  r¡  rà   s     €rV   r5   Ú$ModernBertForMultipleChoice.__init__   sm   ø€ Ü‰Ñ˜Ô ØŒä$ VÓ,ˆŒ
Ü,¨VÓ4ˆŒ	Ü—H‘H×$Ñ$ V×%>Ñ%>Ó?ˆŒ	ÜŸ)š) F×$6Ñ$6¸Ó:ˆŒð 	‰ÕrX   rä   rs   r  rt   rè   ru   rŒ   rŽ   rž   rª  r«  r  r¬  r­  rv   c                 óP  • Ub  UOU R                   R                  nUb  UR                  S   OUR                  S   nUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb!  UR                  SUR	                  S5      5      OSnUb1  UR                  SUR	                  S5      UR	                  S5      5      OSnU R                  5         U R                  UUUUUUUU	U
UUUUS9nUS   nU R                   R                  S:X  a
  USS2S4   nOLU R                   R                  S:X  a2  UUR                  S5      -  R                  SS	9UR                  SS
S9-  nU R                  U5      nU R                  U5      nU R                  U5      nUR                  SU5      nSnUb  [        R                  " 5       nU" UU5      nU(       d  U4USS -   nUb  U4U-   $ U$ [        UUUR                   UR"                  S9$ )aK  
sliding_window_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
    Mask to avoid performing attention on padding or far-away tokens. In ModernBert, only every few layers
    perform global attention, while the rest perform local attention. This mask is used to avoid attending to
    far-away tokens in the local attention layers when not using Flash Attention.
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
    Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
    num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors.
indices (`torch.Tensor` of shape `(total_unpadded_tokens,)`, *optional*):
    Indices of the non-padding tokens in the input sequence. Used for unpadding the output.
cu_seqlens (`torch.Tensor` of shape `(batch + 1,)`, *optional*):
    Cumulative sequence lengths of the input sequences. Used to index the unpadded tensors.
max_seqlen (`int`, *optional*):
    Maximum sequence length in the batch excluding padding tokens. Used to unpad input_ids and pad output tensors.
batch_size (`int`, *optional*):
    Batch size of the input sequences. Used to pad the output tensors.
seq_len (`int`, *optional*):
    Sequence length of the input sequences including padding tokens. Used to pad the output tensors.
Nr}   rx   éþÿÿÿrê  r   r*   r+   rø   Trù  rë  )rÔ   rº  r‰   rŠ   ÚsizerŠ  rV  r(   r¿  r~   rÜ  rß   rs  r   r	   r   rê   r¹  )rS   rä   rs   r  rt   rè   ru   rŒ   rŽ   rž   rª  r«  r  r¬  r­  rT   Únum_choicesrï  r¸  r  rí  Úreshaped_logitsrì  r  r\   s                            rV   r®   Ú#ModernBertForMultipleChoice.forward  sa  € ðL &1Ñ%<‘kÀ$Ç+Á+×B]ÑB]ˆØ,5Ñ,Ai—o‘o aÒ(À}×GZÑGZÐ[\ÑG]ˆà>GÑ>SI—N‘N 2 y§~¡~°bÓ'9Ô:ÐY]ˆ	ØM[ÑMg˜×,Ñ,¨R°×1DÑ1DÀRÓ1HÔIÐmqˆØGSÑG_|×(Ñ(¨¨\×->Ñ->¸rÓ-BÔCÐeiˆð Ñ(ð ×Ñ˜r =×#5Ñ#5°bÓ#9¸=×;MÑ;MÈbÓ;QÔRàð 	ð 	×ÑÔ!à—*‘*ØØ)Ø 3Ø%Ø'ØØ!Ø!Ø!ØØ/Ø!5Ø#ð ð 
ˆð $ A™JÐà;‰;×)Ñ)¨UÓ2Ø 1²!°Q°$Ñ 7ÑØ[‰[×+Ñ+¨vÓ5Ø!2°^×5MÑ5MÈbÓ5QÑ!Q× VÑ VÐ[\Ð VÐ ]Ð`n×`rÑ`rØ˜tð asð añ !Ðð Ÿ	™	Ð"3Ó4ˆØŸ	™	 -Ó0ˆØ—‘ Ó/ˆà Ÿ+™+ b¨+Ó6ˆàˆØÑÜ×*Ò*Ó,ˆHÙ˜O¨VÓ4ˆDæØ%Ð'¨'°!°"¨+Ñ5ˆFØ)-Ñ)9TG˜fÑ$ÐE¸vÐEä(ØØ"Ø!×/Ñ/Ø×)Ñ)ñ	
ð 	
rX   )rs  rÔ   rß   rÜ  rV  rò  )rg   rh   ri   rj   r"   r5   r   r   r   rí   r¹   rƒ   rC  r   rÐ   r   r®   ro   rp   rq   s   @rV   rp  rp  ú  sk  ø† ð
Ð/÷ 
ð ð 15Ø15Ø6:Ø/3Ø04Ø)-Ø*.Ø-1Ø$(Ø$(Ø!%Ø,0Ø/3Ø&*ñ_
à˜E×,Ñ,Ñ-ð_
ð ! §¡Ñ.ð_
ð & e§l¡lÑ3ð	_
ð
 ˜uŸ|™|Ñ,ð_
ð   §¡Ñ-ð_
ð ˜Ÿ™Ñ&ð_
ð ˜%Ÿ,™,Ñ'ð_
ð ˜UŸ\™\Ñ*ð_
ð ˜S‘Mð_
ð ˜S‘Mð_
ð ˜#‘ð_
ð $ D™>ð_
ð ' t™nð_
ð ˜d‘^ð_
ð" 
ˆuU—\‘\Ñ"Ð$=Ð=Ñ	>ô#_
ó ö_
rX   rp  )r"   r™  rU  rm  ro  rq  rr  rp  r·   rB  )Xr6  ri  Ú
contextlibr   Útypingr   r   r   r   Útorch.nn.functionalr   r†   r(  Útorch.utils.checkpointÚtorch.nnr   r	   r
   Úactivationsr   Úconfiguration_utilsr   Úmodeling_attn_mask_utilsr   Úmodeling_layersr   Úmodeling_outputsr   r   r   r   r   r   Úmodeling_utilsr   Úutilsr   r   r   Úutils.import_utilsr   Úgemma.modeling_gemmar   r   Úflash_attn.flash_attn_interfacer   Úflash_attn.layers.rotaryr   Úflash_attn.ops.triton.rotaryr    ÚobjectÚ
get_loggerrg   r‡  r"   r¹   rÐ   rƒ   r•   r›   ÚautogradÚFunctionr   r¼   r¾   rh  rÓ   rï   rþ   rí   rC  r  r!  r{   r%  r*  r?  r   rE  rU  r™  rk  rm  ro  rq  rr  rp  Ú__all__r3   rX   rV   Ú<module>r4     sõ  ðó  Û Ý "ß +Ñ +ã ß Ð Û Ý ß AÑ Aå !Ý 3Ý BÝ 9÷÷ õ .ß GÑ GÝ 5ß Mñ ×ÑÝPÝ8Þ9à€Oð 
×	Ò	˜HÓ	%€ôBÐ'ô BðP ,0Ø%)ñ	&mØL‰Lð&mà—L‘Lð&mð ˜5Ÿ<™<Ñ(ð&mð U—\‘\Ñ"ð	&mð
 ˆ5<‰<˜Ÿ™ u§|¡|°S¸(À5Ç<Á<Ñ:PÐRZÐ[`×[gÑ[gÑRhÐhÑiõ&mðRØL‰Lðà\‰\ðð ðð ð	ð
 ‡\\ôô>46˜%Ÿ.™.×1Ñ1ô 46ðv *.Ø $ñLð ˜Ÿ™Ñ&ð	Lð
 ˜‘õLô42Q¨ô 2Qôj˜2Ÿ9™9ô ô<:B—I‘Iô :ô(	Ð 4ô 	ð ).ñ"Ø!ð"à	‰ð"ð —L‘Lð"ð Ÿ™ð	"ð
 ˜5×+Ñ+Ñ,ð"ð ˜3 ˜8‘_ð"ð 	ð"ð 
ð"ð   ‘~ð"ð ˆ5—‘˜uŸ|™|Ð+Ñ,¨e°E·L±LÑ.AÐAÑBõ"ð\ !&§¡ñ(!Ø!ð(!à	‰ð(!ð 2ð(!ð —‘ð	(!ð
 ð(!ð ˜3 ˜8‘_ð(!ð 	ð(!ð 
ð(!ð —+‘+ð(!ð ˆ5<‰<Ñõ(!ðV Ø!ð à	‰ð ð —L‘Lð ð Ÿ™ð	 ð
 ˜5×+Ñ+Ñ,ð ð ˜3 ˜8‘_ð ð 	ð ð 
ð ð ˆ5<‰<Ñô ðH 1Ø$Ø"ñ!Ð ôL3˜"Ÿ)™)ô L3ô^+3Ð7ô +3ð\ ô} ó }ó ð}ð@ ôj:Ð/ó j:ó ðj:ôZ	>˜rŸy™yô 	>ñ ðñô
H
Ð5ó H
óð
H
ñV ðñô
t
Ð*Có t
óð
t
ñn ðñô
W
Ð'@ó W
óð
W
ðt ôX
Ð%>ó X
ó ðX
ñv ðñô
m
Ð";ó m
óð
m
ò`	rX   