"""PyTorch ESM model."""

import math
from typing import Optional, Union

import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import (
    BaseModelOutputWithCrossAttentions,
    BaseModelOutputWithPoolingAndCrossAttentions,
    MaskedLMOutput,
    SequenceClassifierOutput,
    TokenClassifierOutput,
)
from ...modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
from ...utils import auto_docstring, can_return_tuple, logging
from .configuration_esm import EsmConfig


if is_flash_attn_available():
    from ...modeling_flash_attention_utils import _flash_attention_forward

logger = logging.get_logger(__name__)


def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(x, cos, sin):
    cos = cos[:, :, : x.shape[-2], :]
    sin = sin[:, :, : x.shape[-2], :]
    return (x * cos) + (rotate_half(x) * sin)


def gelu(x):
    """
    This is the gelu implementation from the original ESM repo. Using F.gelu yields subtly wrong results.
    """
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def symmetrize(x):
    """Make layer symmetric in final two dimensions, used for contact prediction."""
    return x + x.transpose(-1, -2)


def average_product_correct(x):
    """Perform average product correct, used for contact prediction."""
    a1 = x.sum(-1, keepdims=True)
    a2 = x.sum(-2, keepdims=True)
    a12 = x.sum((-1, -2), keepdims=True)

    avg = a1 * a2
    avg.div_(a12)  # in-place to reduce memory
    normalized = x - avg
    return normalized


class RotaryEmbedding(torch.nn.Module):
    """
    Rotary position embeddings based on those in
    [RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
    matrices which depend on their relative positions.
    """

    def __init__(self, dim: int):
        super().__init__()
        # Generate and save the inverse frequency buffer (non trainable)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
        self.register_buffer("inv_freq", inv_freq)

        self._seq_len_cached = None
        self._cos_cached = None
        self._sin_cached = None

    def _update_cos_sin_tables(self, x, seq_dimension=2):
        seq_len = x.shape[seq_dimension]

        # Reset the tables if the sequence length has changed, or if we're on a new device
        if seq_len != self._seq_len_cached or self._cos_cached.device != x.device:
            self._seq_len_cached = seq_len
            t = torch.arange(x.shape[seq_dimension], device=x.device).type_as(self.inv_freq)
            freqs = torch.outer(t, self.inv_freq)
            emb = torch.cat((freqs, freqs), dim=-1).to(x.device)

            self._cos_cached = emb.cos()[None, None, :, :]
            self._sin_cached = emb.sin()[None, None, :, :]

        return self._cos_cached, self._sin_cached

    def forward(self, q: torch.Tensor, k: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
        self._cos_cached, self._sin_cached = self._update_cos_sin_tables(k, seq_dimension=-2)

        return (
            apply_rotary_pos_emb(q, self._cos_cached, self._sin_cached).to(dtype=q.dtype),
            apply_rotary_pos_emb(k, self._cos_cached, self._sin_cached).to(dtype=q.dtype),
        )


class EsmContactPredictionHead(nn.Module):
    """Performs symmetrization, apc, and computes a logistic regression on the output features"""

    def __init__(
        self,
        in_features: int,
        bias=True,
        eos_idx: int = 2,
    ):
        super().__init__()
        self.in_features = in_features
        self.eos_idx = eos_idx
        self.regression = nn.Linear(in_features, 1, bias)
        self.activation = nn.Sigmoid()

    def forward(self, tokens, attentions):
        # remove eos token attentions
        eos_mask = tokens.ne(self.eos_idx).to(attentions)
        eos_mask = eos_mask.unsqueeze(1) * eos_mask.unsqueeze(2)
        attentions = attentions * eos_mask[:, None, None, :, :]
        attentions = attentions[..., :-1, :-1]
        # remove cls token attentions
        attentions = attentions[..., 1:, 1:]
        batch_size, layers, heads, seqlen, _ = attentions.size()
        attentions = attentions.view(batch_size, layers * heads, seqlen, seqlen)

        # features: batch x channels x tokens x tokens (symmetric)
        attentions = attentions.to(self.regression.weight.device)
        attentions = average_product_correct(symmetrize(attentions))
        attentions = attentions.permute(0, 2, 3, 1)
        return self.activation(self.regression(attentions).squeeze(3))


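# Shape sketch (assumptions, for orientation only): the contact head consumes raw
# token ids plus the per-layer attention maps stacked along dim 1, i.e.
#     tokens:     (batch, seq_len)
#     attentions: (batch, layers, heads, seq_len, seq_len)
# and, after the CLS/EOS rows and columns are stripped above, returns per-pair
# contact probabilities of shape (batch, seq_len - 2, seq_len - 2). See
# `EsmModel.predict_contacts` further down for the wiring.
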
class EsmEmbeddings(nn.Module):
    """
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    """

    def __init__(self, config):
        super().__init__()
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)

        if config.emb_layer_norm_before:
            self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        else:
            self.layer_norm = None
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

        self.padding_idx = config.pad_token_id
        if self.position_embedding_type == "absolute":
            self.position_embeddings = nn.Embedding(
                config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
            )
        self.token_dropout = config.token_dropout
        self.mask_token_id = config.mask_token_id

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        position_ids=None,
        inputs_embeds=None,
    ):
        if position_ids is None:
            if input_ids is not None:
                # Create the position ids from the input token ids. Any padded tokens remain padded.
                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx)
            else:
                position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

        if inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        embeddings = inputs_embeds

        # ESM applies "token dropout": masked token embeddings are zeroed out and the rest
        # are rescaled so the expected embedding magnitude matches what was seen in training.
        if self.token_dropout:
            embeddings = embeddings.masked_fill((input_ids == self.mask_token_id).unsqueeze(-1), 0.0)
            mask_ratio_train = 0.15 * 0.8  # Hardcoded as the ratio used in all ESM model training runs
            src_lengths = attention_mask.sum(-1)
            mask_ratio_observed = (input_ids == self.mask_token_id).sum(-1).float() / src_lengths
            embeddings = (embeddings * (1 - mask_ratio_train) / (1 - mask_ratio_observed)[:, None, None]).to(
                embeddings.dtype
            )

        if self.position_embedding_type == "absolute":
            position_embeddings = self.position_embeddings(position_ids)
            embeddings = embeddings + position_embeddings

        if self.layer_norm is not None:
            embeddings = self.layer_norm(embeddings)
        if attention_mask is not None:
            embeddings = (embeddings * attention_mask.unsqueeze(-1)).to(embeddings.dtype)
        return embeddings

    def create_position_ids_from_inputs_embeds(self, inputs_embeds):
        """
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        """
        input_shape = inputs_embeds.size()[:-1]
        sequence_length = input_shape[1]

        position_ids = torch.arange(
            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
        )
        return position_ids.unsqueeze(0).expand(input_shape)


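# Numeric sketch of the token-dropout rescaling above (values follow directly from
# the constants in the code): ESM pretraining masked ~15% of tokens and replaced 80%
# of those with <mask>, so mask_ratio_train = 0.15 * 0.8 = 0.12. Embeddings are
# rescaled by (1 - mask_ratio_train) / (1 - mask_ratio_observed); e.g. with no masks
# observed at inference, each embedding is simply scaled by 1 - 0.12 = 0.88.
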
class EsmSelfAttention(nn.Module):
    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__()
        self.config = config
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
            raise ValueError(
                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention heads "
                f"({config.num_attention_heads})"
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(config.hidden_size, self.all_head_size)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
        self.position_embedding_type = position_embedding_type or getattr(
            config, "position_embedding_type", "absolute"
        )
        self.rotary_embeddings = None
        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            self.max_position_embeddings = config.max_position_embeddings
            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
        elif self.position_embedding_type == "rotary":
            self.rotary_embeddings = RotaryEmbedding(dim=self.attention_head_size)

        self.is_decoder = config.is_decoder
        self.layer_idx = layer_idx

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        # (batch, seq, hidden) -> (batch, heads, seq, head_dim); used by the
        # flash-attention subclass below.
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
        return x.view(new_x_shape).permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        hidden_shape = (hidden_states.shape[0], -1, self.num_attention_heads, self.attention_head_size)
        query_layer = self.query(hidden_states).view(hidden_shape).transpose(1, 2)

        # If this is instantiated as a cross-attention module, the keys and values come
        # from an encoder; the attention mask needs to be such that the encoder's padding
        # tokens are not attended to.
        is_cross_attention = encoder_hidden_states is not None
        if is_cross_attention:
            key_layer = self.key(encoder_hidden_states).view(hidden_shape).transpose(1, 2)
            value_layer = self.value(encoder_hidden_states).view(hidden_shape).transpose(1, 2)
            attention_mask = encoder_attention_mask
        else:
            key_layer = self.key(hidden_states).view(hidden_shape).transpose(1, 2)
            value_layer = self.value(hidden_states).view(hidden_shape).transpose(1, 2)

        # ESM scales the query, instead of the attention scores, by 1/sqrt(head_dim).
        query_layer = query_layer * self.attention_head_size**-0.5

        if self.position_embedding_type == "rotary":
            query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            seq_length = hidden_states.size()[1]
            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
            distance = position_ids_l - position_ids_r
            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility

            if self.position_embedding_type == "relative_key":
                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores
            elif self.position_embedding_type == "relative_key_query":
                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key

        if attention_mask is not None:
            # Apply the attention mask (precomputed for all layers in the EsmModel forward)
            attention_scores = attention_scores + attention_mask

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might seem a bit
        # unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs.to(value_layer.dtype), value_layer)

        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
        if self.is_decoder:
            outputs = outputs + (None,)
        return outputs


class EsmSelfOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class EsmFlashAttention2(EsmSelfAttention):
    """
    ESM flash attention module. This module inherits from `EsmSelfAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    """

    def __init__(self, config, position_embedding_type=None, layer_idx=None):
        super().__init__(config, position_embedding_type, layer_idx=layer_idx)
        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
        self.dropout_prob = config.attention_probs_dropout_prob

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> tuple[torch.Tensor]:
        if output_attentions or head_mask is not None or encoder_hidden_states is not None:
            logger.warning_once(
                "EsmFlashAttention2 does not support output_attentions, head_mask, or cross_attention. Falling back"
                " to the manual attention implementation. This warning can be removed using the argument"
                ' `attn_implementation="eager"` when loading the model.'
            )
            return super().forward(
                hidden_states,
                attention_mask,
                head_mask,
                encoder_hidden_states,
                encoder_attention_mask,
                output_attentions,
            )

        bsz, q_len, _ = hidden_states.size()
        query_layer = self.transpose_for_scores(self.query(hidden_states))
        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))

        # Layer norms upcast to float32 can silently cast the hidden states to float32;
        # cast back so the flash kernel sees a supported dtype.
        input_dtype = query_layer.dtype
        device_type = query_layer.device.type if query_layer.device.type != "mps" else "cpu"
        if input_dtype == torch.float32:
            if torch.is_autocast_enabled():
                target_dtype = (
                    torch.get_autocast_dtype(device_type)
                    if hasattr(torch, "get_autocast_dtype")
                    else torch.get_autocast_gpu_dtype()
                )
            elif hasattr(self.config, "_pre_quantization_dtype"):
                target_dtype = self.config._pre_quantization_dtype
            else:
                target_dtype = self.query.weight.dtype

            logger.warning_once(
                f"The input hidden states seems to be silently casted in float32, this might be related to the fact"
                f" you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
                f" {target_dtype}."
            )

            query_layer = query_layer.to(target_dtype)
            key_layer = key_layer.to(target_dtype)
            value_layer = value_layer.to(target_dtype)

        # ESM scales the query before the attention matmul, so the flash kernel is called
        # with softmax_scale=1.0 below.
        query_layer = query_layer * self.attention_head_size**-0.5

        if self.position_embedding_type == "rotary":
            query_layer, key_layer = self.rotary_embeddings(query_layer, key_layer)
        elif self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
            raise ValueError(f"ESM flash attention does not support {self.position_embedding_type} embeddings")

        attn_output = _flash_attention_forward(
            query_layer.permute(0, 2, 1, 3),
            key_layer.permute(0, 2, 1, 3),
            value_layer.permute(0, 2, 1, 3),
            attention_mask,
            query_length=q_len,
            is_causal=self.is_decoder,
            softmax_scale=1.0,
            dropout=self.dropout_prob if self.training else 0.0,
            use_top_left_mask=self._flash_attn_uses_top_left_mask,
        )

        attn_output = attn_output.reshape(bsz, q_len, -1)
        outputs = (attn_output, None)
        if self.is_decoder:
            outputs = outputs + (None,)
        return outputs


ESM_ATTENTION_CLASSES = {
    "eager": EsmSelfAttention,
    "flash_attention_2": EsmFlashAttention2,
}


class EsmAttention(nn.Module):
    def __init__(self, config, layer_idx=None):
        super().__init__()
        self.self = ESM_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
        self.output = EsmSelfOutput(config)
        self.pruned_heads = set()
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def prune_heads(self, heads):
        if len(heads) == 0:
            return
        heads, index = find_pruneable_heads_and_indices(
            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
        )

        # Prune linear layers
        self.self.query = prune_linear_layer(self.self.query, index)
        self.self.key = prune_linear_layer(self.self.key, index)
        self.self.value = prune_linear_layer(self.self.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
        cache_position=None,
    ):
        hidden_states_ln = self.LayerNorm(hidden_states)
        self_outputs = self.self(
            hidden_states_ln,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
        )
        attention_output = self.output(self_outputs[0], hidden_states)
        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
        return outputs


class EsmIntermediate(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.dense(hidden_states)
        hidden_states = gelu(hidden_states)
        return hidden_states


class EsmOutput(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = hidden_states + input_tensor
        return hidden_states


class EsmLayer(GradientCheckpointingLayer):
    def __init__(self, config):
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = EsmAttention(config)
        self.is_decoder = config.is_decoder
        self.add_cross_attention = config.add_cross_attention
        if self.add_cross_attention:
            if not self.is_decoder:
                raise RuntimeError(f"{self} should be used as a decoder model if cross attention is added")
            self.crossattention = EsmAttention(config)
        self.intermediate = EsmIntermediate(config)
        self.output = EsmOutput(config)
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
        cache_position=None,
    ):
        self_attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
        )
        attention_output = self_attention_outputs[0]

        if self.is_decoder:
            outputs = self_attention_outputs[1:-1]
        else:
            outputs = self_attention_outputs[1:]

        if self.is_decoder and encoder_hidden_states is not None:
            if not hasattr(self, "crossattention"):
                raise AttributeError(
                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated"
                    " with cross-attention layers by setting `config.add_cross_attention=True`"
                )
            cross_attention_outputs = self.crossattention(
                attention_output,
                attention_mask=attention_mask,
                head_mask=head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )
            attention_output = cross_attention_outputs[0]
            outputs = outputs + cross_attention_outputs[1:]

        layer_output = self.feed_forward_chunk(attention_output)

        outputs = (layer_output,) + outputs
        if self.is_decoder:
            outputs = outputs + (None,)
        return outputs

    def feed_forward_chunk(self, attention_output):
        attention_output_ln = self.LayerNorm(attention_output)
        intermediate_output = self.intermediate(attention_output_ln)
        layer_output = self.output(intermediate_output, attention_output)
        return layer_output


class EsmEncoder(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([EsmLayer(config) for _ in range(config.num_hidden_layers)])
        self.emb_layer_norm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        cache_position=None,
    ):
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            layer_head_mask = head_mask[i] if head_mask is not None else None
            layer_outputs = layer_module(
                hidden_states,
                attention_mask=attention_mask,
                head_mask=layer_head_mask,
                encoder_hidden_states=encoder_hidden_states,
                encoder_attention_mask=encoder_attention_mask,
                output_attentions=output_attentions,
            )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if self.config.add_cross_attention:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        if self.emb_layer_norm_after:
            hidden_states = self.emb_layer_norm_after(hidden_states)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutputWithCrossAttentions(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class EsmPooler(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        # We "pool" the model by simply taking the hidden state corresponding to the first token.
        first_token_tensor = hidden_states[:, 0]
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)
        return pooled_output


@auto_docstring
class EsmPreTrainedModel(PreTrainedModel):
    config: EsmConfig
    base_model_prefix = "esm"
    supports_gradient_checkpointing = True
    _no_split_modules = ["EsmLayer", "EsmFoldTriangularSelfAttentionBlock", "EsmEmbeddings"]
    _keys_to_ignore_on_load_unexpected = ["position_embeddings.weight"]
    _supports_flash_attn = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, EsmLMHead):
            module.bias.data.zero_()

    def get_output_embeddings(self):
        return None


@auto_docstring
class EsmModel(EsmPreTrainedModel):
    """
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
    """

    def __init__(self, config, add_pooling_layer=True):
        r"""
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        """
        super().__init__(config)
        self.config = config

        self.embeddings = EsmEmbeddings(config)
        self.encoder = EsmEncoder(config)

        self.pooler = EsmPooler(config) if add_pooling_layer else None

        self.contact_head = EsmContactPredictionHead(
            in_features=config.num_hidden_layers * config.num_attention_heads, bias=True
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.word_embeddings

    def set_input_embeddings(self, value):
        self.embeddings.word_embeddings = value

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
        r"""
        input_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        position_ids (`torch.LongTensor` of shape `((batch_size, sequence_length))`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `((batch_size, sequence_length), hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
            input_shape = input_ids.size()
        elif inputs_embeds is not None:
            input_shape = inputs_embeds.size()[:-1]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        batch_size, seq_length = input_shape
        device = input_ids.device if input_ids is not None else inputs_embeds.device

        if attention_mask is None:
            attention_mask = torch.ones((batch_size, seq_length), device=device)

        if self.config._attn_implementation == "flash_attention_2":
            extended_attention_mask = attention_mask
        else:
            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)

        # If a 2D or 3D attention mask is provided for the cross-attention, make it
        # broadcastable to [batch_size, num_heads, seq_length, seq_length].
        if self.config.is_decoder and encoder_hidden_states is not None:
            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
            if encoder_attention_mask is None:
                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
        else:
            encoder_extended_attention_mask = None

        # Prepare head mask if needed
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output = self.embeddings(
            input_ids=input_ids,
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
        )
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=extended_attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_extended_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None

        return BaseModelOutputWithPoolingAndCrossAttentions(
            last_hidden_state=sequence_output,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
            cross_attentions=encoder_outputs.cross_attentions,
        )

    def predict_contacts(self, tokens, attention_mask):
        attns = self(tokens, attention_mask=attention_mask, return_dict=True, output_attentions=True).attentions
        attns = torch.stack(attns, dim=1)  # Matches the original model layout
        # In the original model, attentions for padding tokens are completely zeroed out.
        # This makes no difference most of the time because the other tokens won't attend to them,
        # but it does for the contact prediction task, which takes attentions as input,
        # so we have to mimic that here.
        attns *= attention_mask.unsqueeze(1).unsqueeze(2).unsqueeze(3)
        attns *= attention_mask.unsqueeze(1).unsqueeze(2).unsqueeze(4)
        return self.contact_head(tokens, attns)


@auto_docstring
class EsmForMaskedLM(EsmPreTrainedModel):
    _tied_weights_keys = ["lm_head.decoder.weight"]

    def __init__(self, config):
        super().__init__(config)

        if config.is_decoder:
            logger.warning(
                "If you want to use `EsmForMaskedLM` make sure `config.is_decoder=False` for "
                "bi-directional self-attention."
            )

        self.esm = EsmModel(config, add_pooling_layer=False)
        self.lm_head = EsmLMHead(config)

        self.init_weights()

    def get_output_embeddings(self):
        return self.lm_head.decoder

    def set_output_embeddings(self, new_embeddings):
        self.lm_head.decoder = new_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, MaskedLMOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.esm(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            labels = labels.to(prediction_scores.device)
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(
            loss=masked_lm_loss,
            logits=prediction_scores,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def predict_contacts(self, tokens, attention_mask):
        return self.esm.predict_contacts(tokens, attention_mask=attention_mask)


class EsmLMHead(nn.Module):
    """ESM Head for masked language modeling."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

    def forward(self, features, **kwargs):
        x = self.dense(features)
        x = gelu(x)
        x = self.layer_norm(x)

        # project back to size of vocabulary with bias
        x = self.decoder(x) + self.bias
        return x


@auto_docstring(
    custom_intro="""
    ESM Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
    output) e.g. for GLUE tasks.
    """
)
class EsmForSequenceClassification(EsmPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.esm = EsmModel(config, add_pooling_layer=False)
        self.classifier = EsmClassificationHead(config)

        self.init_weights()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, SequenceClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.esm(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            labels = labels.to(logits.device)

            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


@auto_docstring
class EsmForTokenClassification(EsmPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.esm = EsmModel(config, add_pooling_layer=False)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.init_weights()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.esm(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )
        sequence_output = outputs[0]

        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            labels = labels.to(logits.device)
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


class EsmClassificationHead(nn.Module):
    """Head for sentence-level classification tasks."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x


def create_position_ids_from_input_ids(input_ids, padding_idx):
    """
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    """
    # The series of casts and type-conversions here are carefully balanced to work with ONNX export and XLA.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = torch.cumsum(mask, dim=1).type_as(mask) * mask
    return incremental_indices.long() + padding_idx


__all__ = [
    "EsmForMaskedLM",
    "EsmForSequenceClassification",
    "EsmForTokenClassification",
    "EsmModel",
    "EsmPreTrainedModel",
]
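
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). The checkpoint
# name below is an assumption; any public ESM-2 checkpoint should work:
#
#     from transformers import AutoTokenizer, EsmForMaskedLM
#
#     tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
#     model = EsmForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D")
#
#     inputs = tokenizer("MKTAYIAKQR<mask>ISFVKSHFSRQLEERLGLIEVQ", return_tensors="pt")
#     logits = model(**inputs).logits  # (batch, seq_len, vocab_size)
#
#     # Contact prediction reuses the stacked self-attention maps:
#     contacts = model.predict_contacts(inputs["input_ids"], inputs["attention_mask"])
# ---------------------------------------------------------------------------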