
    <h*                       S SK r S SKrS SKJr  S SKJrJrJr  S SKrS SKJ	r	J
r
  SSKJr  SSKJrJr  SSKJr  SS	KJr  SS
KJr  SSKJrJr  SSKJr  SSKJrJrJrJrJ r   SSK!J"r"J#r#  SSK$J%r%J&r&J'r'J(r(J)r)J*r*  SSK+J,r,  SSK-J.r.J/r/J0r0J1r1  SSK2J3r3  SSK4J5r5J6r6  \" 5       (       a  SSKJ7r7  \1Rp                  " \95      r:S r; " S S\
Rx                  5      r=S r>S r? " S S\
Rx                  5      r@ " S S\
Rx                  5      rA " S S \
Rx                  5      rB " S! S"\A5      rC\A\CS#.rD " S$ S%\
Rx                  5      rES& rF " S' S(\
Rx                  5      rG " S) S*\
Rx                  5      rH " S+ S,\5      rI " S- S.\
Rx                  5      rJ " S/ S0\
Rx                  5      rK\/ " S1 S2\'5      5       rL " S3 S4\L5      rM " S5 S6\
Rx                  5      rN " S7 S8\
Rx                  5      rO " S9 S:\
Rx                  5      rP\\/ " S; S<\ 5      5       5       rQ " S= S>\
Rx                  5      rR " S? S@\
Rx                  5      rS\" SA5       " SB SC\
Rx                  5      5       rT " SD SE\
Rx                  5      rU " SF SG\
Rx                  5      rVSH rWSbSI jrXSJ\R                  SK\YSL\R                  4SM jrZ ScSN\
Rx                  SO\R                  SP\R                  SQ\R                  SR\\R                     SS\[ST\[SU\,\.   4SV jjr\ " SW SX\
Rx                  5      r] " SY SZ\5      r^\/ " S[ S\\'5      5       r_ " S] S^\_5      r` " S_ S`\_\5      ra/ SaQrbg)d    N)	dataclass)CallableOptionalUnion)Tensornn   )ACT2FN)CacheDynamicCache)GenerationMixin)use_kernel_forward_from_hub)create_causal_mask)!flash_attn_supports_top_left_maskis_flash_attn_available)GradientCheckpointingLayer)"BaseModelOutputWithCrossAttentionsBaseModelOutputWithPast,BaseModelOutputWithPoolingAndCrossAttentionsCausalLMOutputWithPastModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSModuleUtilsMixinPreTrainedModel find_pruneable_heads_and_indicesget_parameter_dtypeprune_linear_layer)Unpack)TransformersKwargsauto_docstringcan_return_tuplelogging)check_model_inputs   )EvollaConfigSaProtConfig)_flash_attention_forwardc                     U R                  U5      R                  5       n[        R                  " USS9R	                  U5      U-  nUR                  5       U-   $ )z
Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
are ignored. This is modified from fairseq's `utils.make_positions`.

Args:
    x: torch.Tensor x:

Returns: torch.Tensor
r&   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxmaskincremental_indicess       b/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/evolla/modeling_evolla.py"create_position_ids_from_input_idsr8   B   sP     <<$((*D,,t3;;DADH##%33    c                   D   ^  \ rS rSrSrU 4S jr    SS jrS rSrU =r	$ )EvollaSaProtEmbeddingsR   zN
Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
c                   > [         TU ]  5         [        R                  " UR                  UR
                  UR                  S9U l        UR                  (       a/  [        R                  " UR
                  UR                  S9U l        OS U l        [        R                  " UR                  5      U l        [        USS5      U l        U R#                  S[$        R&                  " UR(                  5      R+                  S5      SS9  UR                  U l        U R                   S:X  a9  [        R                  " UR(                  UR
                  U R,                  S9U l        UR0                  U l        UR2                  U l        S U l        g )	N)r4   epsposition_embedding_typeabsoluteposition_ids)r&   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsemb_layer_norm_before	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutgetattrr@   register_bufferr/   arangemax_position_embeddingsexpandr4   position_embeddingstoken_dropoutmask_token_idrB   selfconfig	__class__s     r7   rG   EvollaSaProtEmbeddings.__init__W   s2   !||F,=,=v?Q?Q_e_r_rs'' ll6+=+=6CXCXYDO"DOzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 "..'':5')||..0B0BPTP`P`(D$ $11#11 r9   c                    Uc*  Ub  [        XR                  5      nOU R                  U5      nUc  U R                  U5      nUnU R                  (       a  UR                  XR                  :H  R                  S5      S5      nSnUR                  S5      nXR                  :H  R                  S5      R                  5       U-  nUSU-
  -  SU-
  S S 2S S 4   -  R                  UR                  5      nU R                  S:X  a  U R                  U5      n	XY-   nU R                  b  U R                  U5      nUb,  XRR                  S5      -  R                  UR                  5      nU$ )NrC           gQ?r&   rA   )r8   r4   &create_position_ids_from_inputs_embedsrL   rZ   masked_fillr[   	unsqueezesumfloattodtyper@   rY   rP   )
r]   r3   attention_maskrB   inputs_embeds
embeddingsmask_ratio_trainsrc_lengthsmask_ratio_observedrY   s
             r7   forwardEvollaSaProtEmbeddings.forwardp   sp    $A)M]M]^#JJ=Y  00;M #
 #//>P>P1P0[0[\^0_adeJ)(,,R0K#,0B0B#B"G"G"K"Q"Q"SVa"a$,<(<=EXAXZ[]acgZg@hhll  J '':5"&":":<"H#9J??&4J%$'?'?'CCGG
HXHXYJ r9   c                    UR                  5       SS nUS   n[        R                  " U R                  S-   X0R                  -   S-   [        R                  UR
                  S9nUR                  S5      R                  U5      $ )z
We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

Args:
    inputs_embeds: torch.Tensor

Returns: torch.Tensor
NrC   r&   ri   devicer   )sizer/   rV   r4   r2   rt   re   rX   )r]   rk   input_shapesequence_lengthrB   s        r7   rc   =EvollaSaProtEmbeddings.create_position_ids_from_inputs_embeds   s~     $((*3B/%a.||q /4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r9   )	rS   rP   r[   r4   r@   rY   rB   rZ   rL   )NNNN)
__name__
__module____qualname____firstlineno____doc__rG   rp   rc   __static_attributes____classcell__r_   s   @r7   r;   r;   R   s+    !6 /b= =r9   r;   c                 V    U R                  SSS9u  p[        R                  " U* U4SS9$ )N   rC   r+   )chunkr/   catxx1x2s      r7   rotate_half_esmr      s-    WWQBWFB99rc2YB''r9   c                     US S 2S S 2S U R                   S   2S S 24   nUS S 2S S 2S U R                   S   2S S 24   nX-  [        U 5      U-  -   $ )N)shaper   )r   cossins      r7   apply_rotary_pos_emb_esmr      sW    
aMaggbkM1$
%C
aMaggbkM1$
%CG*S011r9   c                      ^  \ rS rSrSrS\4U 4S jjrSS jrS\R                  S\R                  S\
\R                  \R                  4   4S	 jrS
rU =r$ )EvollaSaProtRotaryEmbedding   z
Rotary position embeddings based on those in
[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer). Query and keys are transformed by rotation
matrices which depend on their relative positions.
r,   c           	         > [         TU ]  5         SS[        R                  " SUS[        R                  S9R                  5       U-  -  -  nUnU R                  SU5        S U l        S U l        S U l	        g )N      ?i'  r   r   ri   inv_freq)
rF   rG   r/   rV   int64rg   rU   _seq_len_cached_cos_cached_sin_cached)r]   r,   r   r_   s      r7   rG   $EvollaSaProtRotaryEmbedding.__init__   sl    %ELLC%++$N$T$T$VY\$\]^Z2#r9   c                 j   UR                   U   nX0R                  :w  d$  U R                  R                  UR                  :w  a  X0l        [        R
                  " UR                   U   UR                  S9R                  U R                  5      n[        R                  " X@R                  5      n[        R                  " XU4SS9R                  UR                  5      nUR                  5       S S S S 2S S 24   U l        UR                  5       S S S S 2S S 24   U l        U R                  U R                  4$ )Nrt   rC   r+   )r   r   r   rt   r/   rV   r1   r   outerr   rh   r   r   r   )r]   r   seq_dimensionseq_lentfreqsembs          r7   _update_cos_sin_tables2EvollaSaProtRotaryEmbedding._update_cos_sin_tables   s    ''-( ***d.>.>.E.E.Q#* QWW]3AHHEMMdmm\AKK==1E))UN366qxx@C"wwytQ)9:D"wwytQ)9:D!1!111r9   qkreturnc                     U R                  USS9u  U l        U l        [        XR                  U R                  5      [        X R                  U R                  5      4$ )Nr   )r   )r   r   r   r   )r]   r   r   s      r7   rp   #EvollaSaProtRotaryEmbedding.forward   s[    -1-H-HZ\-H-]*$* %Q(8(8$:J:JK$Q(8(8$:J:JK
 	
r9   )r   r   r   )r   )ry   rz   r{   r|   r}   r.   rG   r   r/   r   tuplerp   r~   r   r   s   @r7   r   r      sR    	 C 	 2 
 
%,, 
5u||A[;\ 
 
r9   r   c                      ^  \ rS rSrSU 4S jjr     SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S\\	   S	\
\R                     4S
 jjrSrU =r$ )EvollaSaProtSelfAttention   c                 ~  > [         TU ]  5         Xl        UR                  UR                  -  S:w  a7  [        US5      (       d&  [        SUR                   SUR                   S35      eUR                  U l        [        UR                  UR                  -  5      U l        U R                  U R                  -  U l	        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                  U R                  5      U l        [        R                  " UR                   5      U l        U=(       d    [%        USS5      U l        S U l        U R&                  S:X  d  U R&                  S	:X  aH  UR*                  U l        [        R,                  " S
UR*                  -  S-
  U R                  5      U l        O(U R&                  S:X  a  [1        U R                  S9U l        UR2                  U l        X0l        g )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r@   rA   relative_keyrelative_key_queryr   r&   rotaryr+   )rF   rG   r^   rJ   num_attention_headshasattr
ValueErrorr.   attention_head_sizeall_head_sizer   LinearquerykeyvaluerQ   attention_probs_dropout_probrS   rT   r@   rotary_embeddingsrW   rH   distance_embeddingr   
is_decoder	layer_idxr]   r^   r@   r   r_   s       r7   rG   "EvollaSaProtSelfAttention.__init__   s    : ::a?PVXhHiHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ "&''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD#))X5%@TE]E]%^D" ++"r9   hidden_statesrj   	head_maskencoder_hidden_statesencoder_attention_maskoutput_attentionsr   c                    UR                   S   SU R                  U R                  4nU R                  U5      R	                  U5      R                  SS5      nUS Ln	U	(       ac  U R                  U5      R	                  U5      R                  SS5      n
U R                  U5      R	                  U5      R                  SS5      nUnO`U R                  U5      R	                  U5      R                  SS5      n
U R                  U5      R	                  U5      R                  SS5      nXR                  S-  -  nU R                  S:X  a  U R                  X5      u  p[        R                  " XR                  SS5      5      nU R                  S:X  d  U R                  S	:X  GaC  UR                  5       S   n[        R                  " U[        R                  UR                  S
9R	                  SS5      n[        R                  " U[        R                  UR                  S
9R	                  SS5      nX-
  nU R!                  UU R"                  -   S-
  5      nUR%                  UR&                  S9nU R                  S:X  a  [        R(                  " SUU5      nUU-   nOHU R                  S	:X  a8  [        R(                  " SUU5      n[        R(                  " SU
U5      nUU-   U-   nUb  X-   n[*        R,                  R/                  USS9nU R1                  U5      nUb  UU-  n[        R                  " UR%                  UR&                  5      U5      nUR3                  SSSS5      R5                  5       nUR                  5       S S U R6                  4-   nUR	                  U5      nU(       a  UU4OU4nU R8                  (       a  US-   nU$ )Nr   rC   r&   r         r   r   r   r   rs   r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr+   r	   N)r   r   r   r   view	transposer   r   r@   r   r/   matmulru   rV   r2   rt   r   rW   rh   ri   einsumr   
functionalsoftmaxrS   permute
contiguousr   r   )r]   r   rj   r   r   r   r   hidden_shapequery_layeris_cross_attention	key_layervalue_layerattention_scores
seq_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                            r7   rp   !EvollaSaProtSelfAttention.forward  s    &++A.D4L4LdNfNfgjj/44\BLLQPQR
 3$>!67<<\JTTUVXYZI**%:;@@NXXYZ\]^K3N/44\BLLQPQRI**]388FPPQRTUVK "$<$<d$BB''83%)%;%;K%S"K !<<5H5HR5PQ''>9T=Y=Y]q=q&++-a0J"\\*EJJ}OcOcdiijlnopN"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s %/@ --//0@b/I ,,7  -	9O_%7%78I8I%JKX%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]??'Gr9   )r   r   r^   r   rS   r   r   r   rW   r   r@   r   r   r   NNNNNNF)ry   rz   r{   r|   rG   r/   r   r   FloatTensorboolr   rp   r~   r   r   s   @r7   r   r      s    #F 7;15=A>B,1O||O !!2!23O E--.	O
  ((9(9:O !)):): ;O $D>O 
u||	O Or9   r   c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )EvollaSaProtSelfOutputi]  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " UR                  5      U l        g r   )	rF   rG   r   r   rJ   denserQ   rR   rS   r\   s     r7   rG   EvollaSaProtSelfOutput.__init__^  sB    YYv1163E3EF
zz&"<"<=r9   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r   rS   r]   r   input_tensors      r7   rp   EvollaSaProtSelfOutput.forwardc  ,    

=1]3%4r9   r   ry   rz   r{   r|   rG   rp   r~   r   r   s   @r7   r   r   ]      >
 r9   r   c                     ^  \ rS rSrSrSU 4S jjr     SS\R                  S\\R                     S\\R                     S\\R                     S\\R                     S	\\
   S
\\R                     4U 4S jjjrSrU =r$ )EvollaSaProtFlashAttention2ij  aJ  
EVOLLA_SA_PROT flash attention module. This module inherits from `EvollaSaProtSelfAttention` as the weights of the module stays
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
flash attention and deal with padding tokens in case the input contains any of them.
c                 b   > [         TU ]  XUS9  [        5       U l        UR                  U l        g )N)r@   r   )rF   rG   r   _flash_attn_uses_top_left_maskr   dropout_probr   s       r7   rG   $EvollaSaProtFlashAttention2.__init__q  s2    \ef
 /P.Q+"??r9   r   rj   r   r   r   r   r   c                   > U(       d  Uc  Ub)  [         R                  S5        [        TU ]  UUUUUU5      $ UR	                  5       u  pxn	U R                  U R                  U5      5      n
U R                  U R                  U5      5      nU R                  U R                  U5      5      nU
R                  nU
R                  R                  S:w  a  U
R                  R                  OSnU[        R                  :X  a  [        R                  " 5       (       aA  [        [        S5      (       a  [        R                   " U5      O[        R"                  " 5       nOR[        U R$                  S5      (       a  U R$                  R&                  nO U R                  R(                  R                  n[         R                  SU S35        U
R+                  U5      n
UR+                  U5      nUR+                  U5      nXR,                  S-  -  n
U R.                  S	:X  a  U R1                  X5      u  pO9U R.                  S
:X  d  U R.                  S:X  a  [3        SU R.                   S35      e[5        U
R7                  SSSS5      UR7                  SSSS5      UR7                  SSSS5      UUU R8                  SU R:                  (       a  U R<                  OSU R>                  S9	nURA                  XxS5      nUS 4nU R8                  (       a  US-   nU$ )NzEvollaSaProtFlashAttention2 does not support output_attentions, head_mask, or cross_attention. Falling back to the manual attention implementation. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.mpscpuget_autocast_dtype_pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .r   r   r   r   z%ESM flash attention does not support z embeddingsr   r   r&   r	   r   rb   )query_length	is_causalsoftmax_scalerS   use_top_left_maskrC   r   )!loggerwarning_oncerF   rp   ru   transpose_for_scoresr   r   r   ri   rt   typer/   float32is_autocast_enabledr   r  get_autocast_gpu_dtyper^   r  weightrh   r   r@   r   r   r)   r   r   trainingr   r   reshape)r]   r   rj   r   r   r   r   bszq_len_r   r   r   input_dtypedevice_typetarget_dtypeattn_outputr   r_   s                     r7   rp   #EvollaSaProtFlashAttention2.forwardz  s    	 59N9ZU
 7?%&!  &**,A//

=0IJ--dhh}.EF	//

=0IJ "''1<1C1C1H1HE1Qk((--W\%--'((** u&:;; ,,[9557  &?@@#{{BB#zz0066 >$ &..6K!\2I%..6K "$<$<d$BB''83%)%;%;K%S"K))^;t?[?[_s?sDTEaEaDbbmnoo
 /1a+aAq)1a+oo)-D%%C"AA

 "))#b9%??'Gr9   )r   r   r   r   )ry   rz   r{   r|   r}   rG   r/   r   r   r   r   r   rp   r~   r   r   s   @r7   r   r   j  s    @ 7;15=A>B,1]||] !!2!23] E--.	]
  ((9(9:] !)):): ;] $D>] 
u||	] ]r9   r   )eagerflash_attention_2c                   H   ^  \ rS rSrSU 4S jjrS r      SS jrSrU =r$ )EvollaSaProtAttentioni  c                    > [         TU ]  5         [        UR                     " XS9U l        [        U5      U l        [        5       U l        [        R                  " UR                  UR                  S9U l
        g )N)r   r>   )rF   rG    EVOLLA_SA_PROT_ATTENTION_CLASSES_attn_implementationr]   r   outputsetpruned_headsr   rN   rJ   rO   r]   r^   r   r_   s      r7   rG   EvollaSaProtAttention.__init__  sZ    4V5P5PQRXn	,V4Ef&8&8f>S>STr9   c                 6   [        U5      S:X  a  g [        XR                  R                  U R                  R                  U R
                  5      u  p[        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l        [        U R                  R                  U5      U R                  l	        [        U R                  R                  USS9U R                  l        U R                  R                  [        U5      -
  U R                  l        U R                  R                  U R                  R                  -  U R                  l        U R
                  R                  U5      U l        g )Nr   r&   r+   )lenr   r]   r   r   r$  r   r   r   r   r"  r   r   union)r]   headsindexs      r7   prune_heads!EvollaSaProtAttention.prune_heads  s   u:?79900$))2O2OQUQbQb

 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r9   c           	          U R                  U5      nU R                  UUUUUUS9n	U R                  U	S   U5      n
U
4U	SS  -   nU$ )Nrj   r   r   r   r   r   r&   )rN   r]   r"  )r]   r   rj   r   r   r   r   cache_positionhidden_states_lnself_outputsattention_outputr   s               r7   rp   EvollaSaProtAttention.forward  sh      >>-8yy)"7#9/ ! 
  ;;|AF#%QR(88r9   )rN   r"  r$  r]   r   NNNNFN)	ry   rz   r{   r|   rG   r,  rp   r~   r   r   s   @r7   r  r    s,    U;* "# r9   r  c                 n    U S-  S[         R                  " U [        R                  " S5      -  5      -   -  $ )zr
This is the gelu implementation from the original EVOLLA_SA_PROT repo. Using F.gelu yields subtly wrong results.
g      ?r   g       @)r/   erfmathsqrt)r   s    r7   gelur:    s.     s7cEIIa$))C.&899::r9   c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )EvollaSaProtIntermediatei  c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        g r   )rF   rG   r   r   rJ   intermediate_sizer   r\   s     r7   rG   !EvollaSaProtIntermediate.__init__  s,    YYv1163K3KL
r9   r   r   c                 >    U R                  U5      n[        U5      nU$ r   )r   r:  )r]   r   s     r7   rp    EvollaSaProtIntermediate.forward  s     

=1]+r9   )r   
ry   rz   r{   r|   rG   r/   r   rp   r~   r   r   s   @r7   r<  r<    s)    MU\\ ell  r9   r<  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )EvollaSaProtOutputi$  c                    > [         TU ]  5         [        R                  " UR                  UR
                  5      U l        [        R                  " UR                  5      U l	        g r   )
rF   rG   r   r   r>  rJ   r   rQ   rR   rS   r\   s     r7   rG   EvollaSaProtOutput.__init__%  sB    YYv779K9KL
zz&"<"<=r9   c                 R    U R                  U5      nU R                  U5      nX-   nU$ r   r   r   s      r7   rp   EvollaSaProtOutput.forward*  r   r9   r   r   r   s   @r7   rD  rD  $  r   r9   rD  c                   D   ^  \ rS rSrU 4S jr      SS jrS rSrU =r$ )EvollaSaProtLayeri1  c                   > [         TU ]  5         UR                  U l        SU l        [	        U5      U l        UR                  U l        UR                  U l        U R                  (       a/  U R                  (       d  [        U  S35      e[	        U5      U l	        [        U5      U l        [        U5      U l        [        R                  " UR                   UR"                  S9U l        g )Nr&   z> should be used as a decoder model if cross attention is addedr>   )rF   rG   chunk_size_feed_forwardseq_len_dimr  	attentionr   add_cross_attentionRuntimeErrorcrossattentionr<  intermediaterD  r"  r   rN   rJ   rO   r\   s     r7   rG   EvollaSaProtLayer.__init__2  s    '-'E'E$.v6 ++#)#=#= ##??"dV+i#jkk"7"?D4V<(0f&8&8f>S>STr9   c           	      n   U R                  UUUUS9nUS   n	U R                  (       a  USS n
OUSS  n
U R                  (       aC  Ub@  [        U S5      (       d  [        SU  S35      eU R	                  U	UUUUUS9nUS   n	XSS -   n
U R                  U	5      nU4U
-   n
U R                  (       a  U
S	-   n
U
$ )
N)rj   r   r   r   r&   rC   rQ  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r/  r   )rN  r   r   AttributeErrorrQ  feed_forward_chunk)r]   r   rj   r   r   r   r   r0  self_attention_outputsr3  r   cross_attention_outputslayer_outputs                r7   rp   EvollaSaProtLayer.forwardA  s	    "&)/	 "0 "
 2!4 ??,Qr2G,QR0G??4@4!122$=dV D` ` 
 '+&9&9 -#&;'="3 ': '#  7q9" ==G../?@/G+ ??'Gr9   c                 l    U R                  U5      nU R                  U5      nU R                  X15      nU$ r   )rN   rR  r"  )r]   r3  attention_output_lnintermediate_outputrY  s        r7   rV  $EvollaSaProtLayer.feed_forward_chunkt  s9    "nn-=>"//0CD{{#6Ir9   )	rN   rO  rN  rL  rQ  rR  r   r"  rM  r5  )	ry   rz   r{   r|   rG   rp   rV  r~   r   r   s   @r7   rJ  rJ  1  s-    U$ "#1f r9   rJ  c                   L   ^  \ rS rSrU 4S jr\        SS j5       rSrU =r$ )EvollaSaProtEncoderi{  c                 0  > [         TU ]  5         Xl        [        R                  " [        UR                  5       Vs/ sH  n[        U5      PM     sn5      U l        [        R                  " UR                  UR                  S9U l        SU l        g s  snf )Nr>   F)rF   rG   r^   r   
ModuleListrangenum_hidden_layersrJ  layerrN   rJ   rO   emb_layer_norm_aftergradient_checkpointing)r]   r^   r  r_   s      r7   rG   EvollaSaProtEncoder.__init__|  sr    ]]uVMeMeGf#gGf!$5f$=Gf#gh
$&LL1C1CI^I^$_!&+# $hs   Bc
           
         U(       a  SOS n
U(       a  SOS nU(       a  U R                   R                  (       a  SOS n[        U R                  5       Hb  u  pU(       a  X4-   n
Ub  X=   OS nU" UUUUUUS9nUS   nU(       d  M3  UUS   4-   nU R                   R                  (       d  MY  UUS   4-   nMd     U R                  (       a  U R	                  U5      nU(       a  X4-   n
[        UU
UUS9$ )N )r   rj   r   r   r   r   r   r&   r   last_hidden_stater   
attentionscross_attentions)r^   rO  	enumeratere  rf  r   )r]   r   rj   r   r   r   r   output_hidden_statesreturn_dictr0  all_hidden_statesall_self_attentionsall_cross_attentionsilayer_modulelayer_head_masklayer_outputss                    r7   rp   EvollaSaProtEncoder.forward  s	    #7BD$5b4%64;;;Z;Zr`d(4OA#$58H$H!.7.CilO(+-)&;'="3M *!,M  &9]1=M<O&O#;;222+?=QRCSBU+U('  5* $$ 55mDM 14D D1++*1	
 	
r9   )r^   rf  rg  re  )NNNNFFTN)	ry   rz   r{   r|   rG   r#   rp   r~   r   r   s   @r7   r`  r`  {  s6    ,  "#"0
 0
r9   r`  c                   b   ^  \ rS rSrU 4S jrS\R                  S\R                  4S jrSrU =r	$ )EvollaSaProtPooleri  c                    > [         TU ]  5         [        R                  " UR                  UR                  5      U l        [        R                  " 5       U l        g r   )rF   rG   r   r   rJ   r   Tanh
activationr\   s     r7   rG   EvollaSaProtPooler.__init__  s9    YYv1163E3EF
'')r9   r   r   c                 \    US S 2S4   nU R                  U5      nU R                  U5      nU$ )Nr   )r   r~  )r]   r   first_token_tensorpooled_outputs       r7   rp   EvollaSaProtPooler.forward  s6     +1a40

#566r9   )r~  r   rB  r   s   @r7   r{  r{    s(    $
U\\ ell  r9   r{  c                   0    \ rS rSr% \\S'   S/rSrS rSr	g)EvollaSaProtPreTrainedModeli  r^   rJ  Tc                    U R                   R                  n[        U[        R                  5      (       aW  UR
                  R                  R                  SUS9  UR                  b%  UR                  R                  R                  5         gg[        U[        R                  5      (       ad  UR
                  R                  R                  SUS9  UR                  b2  UR
                  R                  UR                     R                  5         gg[        U[        R                  5      (       aJ  UR                  R                  R                  5         UR
                  R                  R                  S5        gg)zInitialize the weightsrb   meanstdNr   )r^   initializer_range
isinstancer   r   r  datanormal_biaszero_rH   r4   rN   fill_)r]   moduler  s      r7   _init_weights)EvollaSaProtPreTrainedModel._init_weights  s   kk++fbii((MM&&CS&9{{&  &&( '--MM&&CS&9!!-""6#5#56<<> .--KK""$MM$$S) .r9   rj  N)
ry   rz   r{   r|   r(   __annotations___no_split_modules_supports_flash_attnr  r~   rj  r9   r7   r  r    s    ,-*r9   r  c                     ^  \ rS rSrS\4U 4S jjrS rS rS r\	 SS\
\R                     S\
\R                     S	\\\R                     \4   4S
 jj5       r SS\S\\   S\R$                  S\R&                  S	\4
S jjrSrU =r$ )EvollaSaProtProteinEncoderi  r^   c                 d   > [         TU ]  U5        [        U5      U l        [	        U5      U l        g r   )rF   rG   r;   rl   r`  encoderr\   s     r7   rG   #EvollaSaProtProteinEncoder.__init__  s(     08*62r9   c                 .    U R                   R                  $ r   rl   rL   r]   s    r7   get_input_embeddings/EvollaSaProtProteinEncoder.get_input_embeddings  s    ...r9   c                 $    XR                   l        g r   r  r]   r   s     r7   set_input_embeddings/EvollaSaProtProteinEncoder.set_input_embeddings  s    */'r9   c                     UR                  5        H7  u  p#U R                  R                  U   R                  R	                  U5        M9     g)z
Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
class PreTrainedModel
N)itemsr  re  rN  r,  )r]   heads_to_prunere  r*  s       r7   _prune_heads'EvollaSaProtProteinEncoder._prune_heads  s<    
 +002LELLu%//;;EB 3r9   r3   rj   r   c                 0   UR                  5       nUu  pEUR                  nUc  [        R                  " XE4US9nU R	                  XS9nU R                  X#5      nU R                  XxS9n	U	S   n
[        U
U	R                  U	R                  U	R                  S9$ )Nr   r3   rj   )rj   r   rk  )ru   rt   r/   onesrl   get_extended_attention_maskr  r   r   rm  rn  )r]   r3   rj   rv   
batch_sizer   rt   rk   extended_attention_maskencoder_outputssequence_outputs              r7   rp   "EvollaSaProtProteinEncoder.forward  s      nn&!,
!!!"ZZ*)A6RN)["&"B"B>"_,,},])!,;-)77&11,==	
 	
r9   rv   rt   ri   c                 P   Uc  [        U 5      nUR                  5       S:X  a  U R                  R                  (       d  Ub  [        R
                  " S[        5        UR                  5       S:X  a  USS2SSS2SS24   nOqUR                  5       S:X  aA  U R                  R                  (       a  [        R                  " X!U5      nO*USS2SSSS24   nO[        SU SUR                   S35      eUR                  US9nS	U-
  [        R                  " U5      R                  -  nU$ )
a  
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.

Arguments:
    attention_mask (`torch.Tensor`):
        Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
    input_shape (`Tuple[int]`):
        The shape of the input to the model.

Returns:
    `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
Nr   zNThe `device` argument is deprecated and will be removed in v5 of Transformers.r	   z!Wrong shape for input_ids (shape z) or attention_mask (shape r   r   r   )r   r,   r^   r   warningswarnFutureWarningr   *create_extended_attention_mask_for_decoderr   r   rh   r/   finfomin)r]   rj   rv   rt   ri   r  s         r7   r  6EvollaSaProtProteinEncoder.get_extended_attention_mask	  s     ='-E""$)dkk.D.D!dfs
 1$&4Qa]&C#!Q& {{%%*:*e*e+' +9D$9I*J'3K=@[\j\p\p[qqrs  #:"<"<5"<"I#&)@#@EKKPUDVDZDZ"Z&&r9   )rl   r  r   r   )ry   rz   r{   r|   r(   rG   r  r  r  r#   r   r/   r   r   r   r   rp   r.   rt   rg   r  r~   r   r   s   @r7   r  r    s    3| 3
/0C  26
ELL)
 !.
 
uU\\"$PP	Q	
 
2 rv2'$2'38:2'GL||2'chcncn2'	2' 2'r9   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )!EvollaSequenceCompressorAttentioni>  c                 X  > [         TU ]  5         US-  U l        X0l        X#-  n[        R
                  " U5      U l        [        R
                  " U5      U l        [        R                  " XSS9U l	        [        R                  " XS-  SS9U l
        [        R                  " XASS9U l        g )Nr   Fr  r   )rF   rG   scaler*  r   rN   
norm_medianorm_latentsr   to_qto_kvto_out)r]   r,   dim_headr*  	inner_dimr_   s        r7   rG   *EvollaSequenceCompressorAttention.__init__?  s    t^

$	,,s+LL-IIc59	YYsM>
ii	U;r9   c                 &   U R                  U5      nU R                  U5      nU R                  nU R                  U5      n[        R
                  " X4SS9nU R                  U5      R                  SSS9u  pxUR                  UR                  S5      UR                  S5      US5      R                  SSSS5      nUR                  UR                  S5      UR                  S5      US5      R                  SSSS5      nUR                  UR                  S5      UR                  S5      US5      R                  SSSS5      nXPR                  -  n[        R                  " XWR                  SS5      5      n	XR                  SSS	9R                  5       -
  n	U	R                   u  pp[        R"                  " X5      R%                  UR&                  5      nUS
S
2S
S
S
S
24   nUS
S
S
2S
S
2S
4   nUU-  nU	R)                  SU-
  R+                  5       S5      n	U	R-                  SS9n[        R                  " UU5      nUR                  SSSS5      nUR/                  UR                  S5      UR                  S5      S5      nU R1                  U5      $ )z
Args:
    x (torch.Tensor): image features
        shape (b, n1, D)
    latent (torch.Tensor): latent features
        shape (b, n2, D);  n2: num of latent tokens
r   r+   r   rC   r   r&   r	   Tr,   keepdimNg     )r  r  r*  r  r/   r   r  r   r   ru   r   r  r   r   amaxdetachr   r  rh   rt   rd   r   r   r  r  )r]   r   latentsr5   hr   kv_inputr   vsimbsnhskdokdr  mask_expones_expattnouts                      r7   rp   )EvollaSequenceCompressorAttention.forwardL  s2    OOA##G,JJIIg99a\r2zz(#))2 * 
 FF166!9affQiB/771aCFF166!9affQiB/771aCFF166!9affQiB/771aC

N ll1kk"b12HHTH299;;99zz""%%dkk24q()aD()("ooq4xoo/6{{r{"ll4#kk!Q1% kk#((1+sxx{B7{{3r9   )r*  r  r  r  r  r  r  )@      r   r   s   @r7   r  r  >  s    <)  ) r9   r  c                   2   ^  \ rS rSrSU 4S jjrS rSrU =r$ )EvollaFeedForwardix  c                   > [         TU ]  5         [        X-  5      n[        R                  " U5      U l        [        R                  " XSS9U l        [        R                  " 5       U l	        [        R                  " X1SS9U l
        g NFr  )rF   rG   r.   r   rN   normr   fc1GELUr~  fc2)r]   r,   multr  r_   s       r7   rG   EvollaFeedForward.__init__y  sZ    
O	LL%	99S%8'')99Y%8r9   c           	      ~    U R                  U R                  U R                  U R                  U5      5      5      5      $ r   )r  r~  r  r  )r]   r   s     r7   rp   EvollaFeedForward.forward  s+    xx1(>?@@r9   )r~  r  r  r  )   r   r   s   @r7   r  r  x  s    9A Ar9   r  c                   6   ^  \ rS rSrS\4U 4S jjrS rSrU =r$ )!EvollaSequenceCompressorResampleri  r^   c                   > [         TU ]  5         UR                  R                  nUR                  U l        [        R                  " [        R                  " U R
                  U5      SS9U l
        [        R                  " / 5      U l        [        UR                  5       Ha  nU R                  R                  [        R                  " [!        X!R"                  UR$                  S9['        X!R(                  S9/5      5        Mc     [        R*                  " UR                  5      U l        [        R.                  " X!R                  5      U l        g )NT)requires_grad)r,   r  r*  )r,   r  )rF   rG   protein_encoder_configrJ   resampler_num_latentsnum_latentsr   	Parameterr/   randnr  rb  layersrc  resampler_depthappendr  resampler_dim_headresampler_headsr  resampler_ff_multrN   r  r   protein_projector)r]   r^   protein_repr_dimr  r_   s       r7   rG   *EvollaSequenceCompressorResampler.__init__  s    !88DD!77||EKK0@0@BR$ScghmmB'v--.AKK9 0;T;T\b\r\r *.>E]E]^		 / LL!3!34	!#+;=O=O!Pr9   c                 d   UR                   S   nUR                   u  pE[        R                  " X@R                  5      R	                  UR
                  5      n[        R                  " X&4SS9n[        R                  " U5      R	                  U R                  R
                  5      nU R                  S    UR                  SSS5      -  nUR	                  UR                  5      nU R                   H  u  pU	" XU5      U-   nU
" U5      U-   nM     U R                  U5      nU R                  U5      $ )Nr   r&   r+   rC   )r   r/   r  r  rh   rt   r   r  r   ri   r  r  r  )r]   embedsr5   br  r  latent_maskr  r  r  fftransformed_features               r7   rp   )EvollaSequenceCompressorResampler.forward  s    LLO

jj%5%5699$++Fyy$,!4 zz!} 3 34,,t$tyyQ'::**V\\*HD6D1G;GkG+G $ #44W=yy,--r9   )r  r  r  r  r  )	ry   rz   r{   r|   r'   rG   rp   r~   r   r   s   @r7   r  r    s    Q| Q*. .r9   r  c                       \ rS rSr% Sr\R                  \S'   Sr\	\R                     \S'   Sr
\	\\R                  S4      \S'   Sr\	\\R                  S4      \S'   Srg)	EvollaProteinEncoderModelOutputi  Nsequence_compressor_outputrl  .r   rm  rj  )ry   rz   r{   r|   r  r/   r   r  rl  r   r   r   rm  r~   rj  r9   r7   r  r    si     59 1 1859x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r9   r  c                   t   ^  \ rS rSrS\4U 4S jjr\S\R                  S\R                  4S j5       r
SrU =r$ )EvollaProteinEncoderi  r^   c                 n   > [         TU ]  5         [        UR                  S9U l        [        US9U l        g )Nr^   )rF   rG   r  r  modelr  sequence_compressor_resamplerr\   s     r7   rG   EvollaProteinEncoder.__init__  s.    /v7T7TU
-NV\-]*r9   r3   rj   c                     U R                  XS9nUR                  nU R                  XR5      n[        UUR                  S9$ )Nr  )r  rl  )r
  rl  r  r  )r]   r3   rj   kwargsprotein_outputprotein_embedssequence_reprs          r7   rp   EvollaProteinEncoder.forward  sF    iW'99::>Z.'4,>>
 	
r9   )r
  r  )ry   rz   r{   r|   r'   rG   r#   r/   
LongTensorr   rp   r~   r   r   s   @r7   r  r    s?    ^| ^
 
!1!1 
5CTCT 
 
r9   r  c                   r   ^  \ rS rSr   S	S\\   S\\   S\\   4U 4S jjjrS r       S
S jrSr	U =r
$ )#EvollaSequenceAlignerCrossAttentioni  protein_encoder_dimstructure_encoder_dimmsa_encoder_dimc                   > [         TU ]  5         UR                  U l        UR                  U l        U R                  S-  U l        [        U R                  U R                  -  5      U l        U R                  U R                  -  U l        UR                  nUR                  nUR                  n[        R                  " U R                  U R                  5      U l        UbK  [        R                  " X R                  5      U l        [        R                  " X R                  5      U l        OS U l        S U l        UbK  [        R                  " X0R                  5      U l        [        R                  " X0R                  5      U l        OS U l        S U l        UbK  [        R                  " X@R                  5      U l        [        R                  " X@R                  5      U l        OS U l        S U l        [)        U R                  5      U l        [        R,                  " U5      U l        [        R                  " U R                  U R                  US9U l        [3        U R                  U5      U l        [        R6                  " [8        R:                  " S/5      5      U l        [        R6                  " [8        R:                  " S/5      5      U l        g )Nr   r  rb   ) rF   rG   rJ   r   r  r.   r   r   $aligner_attention_probs_dropout_probaligner_enable_biasaligner_ffn_multr   r   r   key_proteinvalue_proteinkey_structurevalue_structurekey_msa	value_msaEvollaRMSNormattention_normrQ   rS   out_projr  r   r  r/   tensorgate_attentiongate_ffw)	r]   r^   r  r  r  r   enable_biasffn_multr_   s	           r7   rG   ,EvollaSequenceAlignerCrossAttention.__init__  s    	!--#)#=#= --t3
#&t'7'7$:R:R'R#S !558P8PP'-'R'R$00**YYt//1C1CD
*!yy)<>P>PQD!#+>@R@R!SD#D!%D ,!#+@BTBT!UD#%99-BDVDV#WD !%D#'D &99_6H6HIDLYY8J8JKDNDL!DN+D,<,<=zz">?		$"2"2D4D4D;W#D$4$4h? ll5<<+>?U\\3%%89r9   c	                    XgU/n	U	 V
s/ sH	  oc  M  U
PM     n	n
U	(       d  [        S5      e[        R                  " U	SS9n	U R                  U5      nU R	                  U5      nU R
                  bA  U R                  b4  UR                  U5      nU R                  U5      nU R                  U5      nOSnSnU R                  bA  U R                  b4  UR                  U5      nU R                  U5      nU R                  U5      nOSnSnU R                  bA  U R                  b4  UR                  U5      nU R                  U5      nU R                  U5      nOSnSnXU/nU V
s/ sH	  oc  M  U
PM     nn
[        R                  " USS9nXU/nU V
s/ sH	  oc  M  U
PM     nn
[        R                  " USS9nUR                  5       SS U R                  U R                  4-   nUR                  " U6 R!                  SSSS5      nUR                  5       SS U R                  U R                  4-   nUR                  " U6 R!                  SSSS5      nUR                  5       SS U R                  U R                  4-   nUR                  " U6 R!                  SSSS5      nXR"                  -  nUcN  [        R$                  " UR                  S5      UR                  S5      5      R                  UR&                  5      nUSS2SSS2S4   U	SS2SSSS24   -  n[        R(                  " UUR+                  SS	5      5      nUUR-                  SS
S9R/                  5       -
  nUR1                  SU-
  R3                  5       [        R4                  " UR6                  5      R8                  5      n[:        R<                  " SS9" U5      n[        R(                  " UU5      nUR!                  SSSS5      R?                  5       nUR                  5       SS	 U R@                  4-   nUR                  " U6 nU RC                  U5      nU$ s  sn
f s  sn
f s  sn
f )z
query_states: text
key_value_states: protein
query_states: [bs, query_seq_len, dim]
key_value_states: [bs, kv_seq_len, dim]
query_attn_mask: [bs, query_seq_len]
kv_attn_mask: [bs, kv_seq_len]
Nz=At least one modality should be provided for cross attention.r&   r+   rC   r   r   r	   r   Tr  )"r   r/   r   r$  r   r  r  rh   r  r   r!  r"  ru   r   r   r   r   r  r  rt   r   r   r  r  rd   r   r  ri   r  r   Softmaxr   r   r%  )r]   query_statesprotein_key_value_statesstructure_key_value_statesmsa_key_value_statesquery_attn_maskprotein_kv_attn_maskstructure_kv_attn_maskmsa_kv_attn_maskkv_attn_maskr  r   key_layer_proteinvalue_layer_proteinkey_layer_structurevalue_layer_structurekey_layer_msavalue_layer_msar   r   new_query_layer_shapenew_key_layer_shapenew_value_layer_shaperj   attn_weightsr   r   r   r   s                                r7   cross_attention3EvollaSequenceAlignerCrossAttention.cross_attention  sK   * -FVW#/A<a<A\]]yy15)),7 jj-'D,>,>,J'?'B'B<'P$ $ 0 01I J"&"4"45M"N $"&)d.B.B.N)C)F)F|)T&"&"4"45O"P$($8$89S$T!"&$(!<<#(B#7#:#:<#H  LL)=>M"nn-ABO M"O&]K	 );	1Q		;IIiQ/	*?S"-?+Qq+?ii3 + 0 0 23B 7$$$$;
 !
 "&&(=>FFq!QPQR'nn.s3$$$$7
 
 NN$78@@Aq!L	 + 0 0 23B 7$$$$;
 !
 "&&(=>FFq!QPQR!JJ. "#jj):):1)=|?P?PQR?STWWXdXkXklO(D!T)9:\!TSWYZJZ=[[||K1D1DR1LM#l&7&7B&7&M&T&T&VV'33%%'\5G5G)H)L)L
 **,-=> _kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**,CDm4q BL < @s"   P?P?!Q*QQ	Q	c                 z   Ubv  UR                   u  pnUcc  [        R                  " X5      R                  U	R                  5      U	R                  X4S9R                  -  R                  UR                  5      nOS nUby  UR                   u  nnnUce  [        R                  " UU5      R                  U	R                  5      U
R                  UU4S9R                  -  R                  UR                  5      nOS nUby  UR                   u  nnnUce  [        R                  " UU5      R                  U	R                  5      UR                  UU4S9R                  -  R                  UR                  5      nOS nUnUb  UR                  5       (       d0  Ub  UR                  5       (       d  Ub  UR                  5       (       ay  UnU R                  UUUUUUUUS9n[        R                  " U R                  5      U-  nUU-   nUnU R                  U5      [        R                  " U R                  5      -  nUU-   nU$ )N)ru   )r.  r/  r0  r1  r2  r3  r4  r5  )r   r/   r  rh   rt   rX   TanyrA  tanhr'  r   r(  )r]   r.  protein_kv_statesstructure_kv_statesmsa_kv_statesr2  r3  r4  r5  protein_batch_maskstructure_batch_maskmsa_batch_maskpast_key_valuer  protein_kv_seq_lenr,   structure_kv_seq_lenmsa_kv_seq_lenr   residuals                       r7   rp   +EvollaSequenceAlignerCrossAttention.forwardo  sP    (*;*A*A'BC#+JJr699:L:S:ST(//6H5M/NPPQ"&--. %
 $( *,?,E,E)B$c%-JJr#78;;<N<U<UV*118Lb7Q1RTTU"(//0 '
 &*"$&3&9&9#B'JJr>2556H6O6OP$++."1E+FHHI"]))* !
  $$ */C/G/G/I/I#/4J4N4N4P4P).>.B.B.D.D$H 00*):+>%2 /%9'=!1 1 	M "JJt':':;mKM$}4M$H GGM2UZZ5NNM$}4Mr9   )r   r   r$  rS   r   r'  r(  rJ   r!  r  r  r   r%  r   r  r"  r  r   )NNNNNNNNNN)ry   rz   r{   r|   r   r.   rG   rA  rp   r~   r   r   s   @r7   r  r    sm     .2/3)-1: &c]1:  (}	1:
 "#1: 1:fnn "#!G Gr9   r  RMSNormc                   8   ^  \ rS rSrSU 4S jjrS rS rSrU =r$ )r#  i  c                    > [         TU ]  5         [        R                  " [        R
                  " U5      5      U l        X l        g)z,
EvollaRMSNorm is equivalent to T5LayerNorm
N)rF   rG   r   r  r/   r  r  variance_epsilon)r]   rJ   r?   r_   s      r7   rG   EvollaRMSNorm.__init__  s/     	ll5::k#:; #r9   c                    UR                   nUR                  [        R                  5      nUR	                  S5      R                  SSS9nU[        R                  " X0R                  -   5      -  nU R                  UR                  U5      -  $ )Nr   rC   T)r  )	ri   rh   r/   r  powr  rsqrtrW  r  )r]   r   r  variances       r7   rp   EvollaRMSNorm.forward  sw    #))%((7 $$Q',,R,>%H?T?T4T(UU{{]--k:::r9   c                 ^    [        U R                  R                  5       SU R                   3$ )Nz, eps=)r   r  r   rW  r  s    r7   
extra_reprEvollaRMSNorm.extra_repr  s*    ))*+6$2G2G1HIIr9   )rW  r  )gư>)	ry   rz   r{   r|   rG   rp   r_  r~   r   r   s   @r7   r#  r#    s    $;J Jr9   r#  c                   l   ^  \ rS rSrSS\4U 4S jjjr\R                  " 5       \S 5       5       r	Sr
U =r$ )EvollaRotaryEmbeddingi  r^   c                   > [         TU ]  5         [        US5      (       aZ  [        UR                  [
        5      (       a;  UR                  R                  SUR                  R                  S5      5      U l        OSU l        UR                  U l	        UR                  U l
        Xl        [        U R                     U l        U R                  U R                  U5      u  o0l        U R                  SUSS9  U R                   U l        g )Nrope_scaling	rope_typer  defaultr   FrD   )rF   rG   r   r  rd  dictgetre  rW   max_seq_len_cachedoriginal_max_seq_lenr^   r   rope_init_fnattention_scalingrU   r   original_inv_freq)r]   r^   rt   r   r_   s       r7   rG   EvollaRotaryEmbedding.__init__  s    6>**z&:M:Mt/T/T#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q((ZeD!%r9   c                 b   U R                   S S S 2S 4   R                  5       R                  UR                  S   SS5      R	                  UR
                  5      nUS S 2S S S 24   R                  5       n[        UR
                  R                  [        5      (       a0  UR
                  R                  S:w  a  UR
                  R                  OSn[        R                  " USS9   UR                  5       UR                  5       -  R                  SS5      n[        R                  " Xf4SS	9nUR                  5       U R                  -  nUR                  5       U R                  -  n	S S S 5        WR	                  UR                   S
9W	R	                  UR                   S
94$ ! , (       d  f       N@= f)Nr   rC   r&   r   r  F)r  enabledr   r+   r   )r   rg   rX   r   rh   rt   r  r  strr/   autocastr   r   r   rl  r   ri   )
r]   r   rB   inv_freq_expandedposition_ids_expandedr  r   r   r   r   s
             r7   rp   EvollaRotaryEmbedding.forward  sR    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E'E!((--[`J`ahhmmfk^^UC&,,.1F1L1L1NNYYZ[]^_E))UN3C'')d444C'')d444C	 D vvAGGv$cff177f&;;; DCs   $BF  
F.)rl  r^   ri  rm  rj  rk  re  r   )ry   rz   r{   r|   r'   rG   r/   no_gradr   rp   r~   r   r   s   @r7   rb  rb    s6    /| / /" ]]_<  <r9   rb  c                   .   ^  \ rS rSrU 4S jrS rSrU =r$ )	EvollaMLPi  c                   > [         TU ]  5         Xl        UR                  U l        UR                  U l        [
        R                  " U R                  U R                  UR                  S9U l        [
        R                  " U R                  U R                  UR                  S9U l	        [
        R                  " U R                  U R                  UR                  S9U l
        [        UR                     U l        g )Nr  )rF   rG   r^   rJ   r>  r   r   mlp_bias	gate_projup_proj	down_projr
   
hidden_actact_fnr\   s     r7   rG   EvollaMLP.__init__  s    !--!'!9!94#3#3T5K5KRXRaRabyy!1!143I3IPVP_P_`4#9#94;K;KRXRaRabV../r9   c                     U R                  U R                  U R                  U5      5      U R                  U5      -  5      nU$ r   )r}  r  r{  r|  )r]   r   r}  s      r7   rp   EvollaMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r9   )r  r^   r}  r{  rJ   r>  r|  r   r   s   @r7   rx  rx    s    0 r9   rx  c                     U SSU R                   S   S-  24   nU SU R                   S   S-  S24   n[        R                  " U* U4SS9$ )z*Rotates half the hidden dims of the input..NrC   r   r+   )r   r/   r   r   s      r7   rotate_halfr     sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r9   c                     UR                  U5      nUR                  U5      nX-  [        U 5      U-  -   nX-  [        U5      U-  -   nXg4$ )a  Applies Rotary Position Embedding to the query and key tensors.

Args:
    q (`torch.Tensor`): The query tensor.
    k (`torch.Tensor`): The key tensor.
    cos (`torch.Tensor`): The cosine part of the rotary embedding.
    sin (`torch.Tensor`): The sine part of the rotary embedding.
    position_ids (`torch.Tensor`, *optional*):
        Deprecated and unused.
    unsqueeze_dim (`int`, *optional*, defaults to 1):
        The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
        sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
        that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
        k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
        cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
        the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
Returns:
    `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
)re   r  )r   r   r   r   rB   unsqueeze_dimq_embedk_embeds           r7   apply_rotary_pos_embr    sS    ( --
&C
--
&Cw;q>C/0Gw;q>C/0Gr9   r   n_repr   c                     U R                   u  p#pEUS:X  a  U $ U SS2SS2SSS2SS24   R                  X#XU5      n U R                  X#U-  XE5      $ )z
This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
r&   N)r   rX   r  )r   r  batchnum_key_value_headsslenhead_dims         r7   	repeat_kvr  "  s_    
 2?1D1D.Ez!!Qa"23::5W\dlmM  e(CTTTr9   r  r   r   r   rj   scalingrS   r  c                 @   [        X R                  5      n[        X0R                  5      n	[        R                  " XR	                  SS5      5      U-  n
Ub"  US S 2S S 2S S 2S UR
                  S   24   nX-   n
[        R                  R                  U
S[        R                  S9R                  UR                  5      n
[        R                  R                  XU R                  S9n
[        R                  " X5      nUR	                  SS5      R                  5       nX4$ )Nr   r	   r   rC   )r,   ri   )pr  r&   )r  num_key_value_groupsr/   r   r   r   r   r   r   r  rh   ri   rS   r  r   )r  r   r   r   rj   r  rS   r  
key_statesvalue_statesr@  causal_maskr  s                r7   eager_attention_forwardr  .  s     3 ; ;<JU$?$?@L<<';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#1==((2U]](SVVW\WbWbcL==((6??([L,,|:K''1-88:K$$r9   c                     ^  \ rS rSrSrS\S\4U 4S jjr  SS\R                  S\
\R                  \R                  4   S\\R                     S	\\   S
\\R                     S\\   S\
\R                  \R                  4   4S jjrSrU =r$ )EvollaAttentioniH  z=Multi-headed attention from 'Attention Is All You Need' paperr^   r   c                 P  > [         TU ]  5         Xl        X l        [	        USUR
                  UR                  -  5      U l        UR                  UR                  -  U l	        U R                  S-  U l
        UR                  U l        SU l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR
                  UR                  U R                  -  UR                  S9U l        [        R                  " UR                  U R                  -  UR
                  UR                  S9U l        g )Nr  r   Tr  )rF   rG   r^   r   rT   rJ   r   r  r  r  r  attention_dropoutr  r   r   attention_biasq_projk_projv_projo_projr%  s      r7   rG   EvollaAttention.__init__K  sI   "
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii&&68J8JQWQfQf
r9   r   rY   rj   rM  r0  r  r   c                 4   UR                   S S n/ UQSPU R                  P7nU R                  U5      R                  U5      R	                  SS5      n	U R                  U5      R                  U5      R	                  SS5      n
U R                  U5      R                  U5      R	                  SS5      nUu  p[        XX5      u  pUb$  XUS.nUR                  XU R                  U5      u  p[        nU R                  R                  S:w  a  [        U R                  R                     nU" U U	U
UU4U R                  (       d  SOU R                  U R                   S.UD6u  nnUR"                  " / UQSP76 R%                  5       nU R'                  U5      nUU4$ )NrC   r&   r   )r   r   r0  r  rb   )rS   r  )r   r  r  r   r   r  r  r  updater   r  r^   r!  r   r  r  r  r  r   r  )r]   r   rY   rj   rM  r0  r  rv   r   r.  r  r  r   r   cache_kwargsattention_interfacer  r@  s                     r7   rp   EvollaAttention.forwardb  s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&#7RU#[ %#&nUL'5'<'<ZW[WeWegs't$J(?;;++w6"9$++:Z:Z"[$7	%
  $}}C$2H2HLL	%
 	%
!\ "));;;;FFHkk+.L((r9   )r  r^   r  r  r  r   r  r  r  r  r  r   )ry   rz   r{   r|   r}   r'   r.   rG   r/   r   r   r   r   r  r    r!   rp   r~   r   r   s   @r7   r  r  H  s    G
| 
 
8 +/59))||)) #5<<#=>)) !.	))
 !)) !!1!12)) +,)) 
u||U\\)	*)) ))r9   r  c                      ^  \ rS rSrS\S\4U 4S jjr            SS\R                  S\	\R                  \R                  4   S\
\R                     S\
\R                     S	\
\   S
\
\   S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\
\R                     S\	\R                     4S jjrSrU =r$ )EvollaDecoderLayeri  r^   r   c                   > [         TU ]  5         UR                  U l        [        XS9U l        [        U5      U l        [        UR                  UR                  S9U l	        [        UR                  UR                  S9U l
        US-   [        UR                  UR                  -  S5      -  S:X  a  [        UUR                  S9U l        g g )Nr^   r   r>   r&   r   )r  )rF   rG   rJ   r  	self_attnrx  mlpr#  rms_norm_epsinput_layernormpost_attention_layernormmaxrd  aligner_num_add_layersr  adapterr%  s      r7   rG   EvollaDecoderLayer.__init__  s    !--(LV$,V-?-?VEXEXY(5f6H6HfNaNa(b%MS!9!9V=Z=Z!Z\]^^bcc>$*$6$6DL dr9   r   rY   rj   rB   rM  	use_cacher0  rG  rH  rI  rJ  rK  rL  r2  r   c                    UnU R                  U5      nU R                  " SUUUUUUUS.UD6u  nnUU-   nUnU R                  U5      nU R                  U5      nUU-   n[	        U S5      (       a  U R                  UUU	U
UUUUS9nU$ )N)r   rj   rB   rM  r  r0  rY   r  )r.  rG  rH  rI  r2  rJ  rK  rL  rj  )r  r  r  r  r   r  )r]   r   rY   rj   rB   rM  r  r0  rG  rH  rI  rJ  rK  rL  r2  r  rQ  r  s                     r7   rp   EvollaDecoderLayer.forward  s    $ !,,];  >> 	
')%)) 3	
 	
q !=0 !55mD/ =04## LL*"3$7+ /#5%9- ) 	M r9   )r  rJ   r  r  r  r  )NNNFNNNNNNNN)ry   rz   r{   r|   r'   r.   rG   r/   r   r   r   r  r   r   rp   r~   r   r   s   @r7   r  r    s_   |  & 2637*.$)59486:04597;15265||5 #5<<#=>5 !.	5
 u//05 !5 D>5 !!1!125 $ELL15 &ell35  -5 %U\\25 'u||45 !.5 "%,,/5" 
u||	#5 5r9   r  c                   h   ^  \ rS rSr% \\S'   SrSr/ SQrS/r	Sr
SrSrSrSr\\S.rU 4S	 jrS
rU =r$ )EvollaPreTrainedModeli  r^   r
  T)r  r  r  past_key_valuesF)r   rm  c                   > U R                   R                  n[        TU ]  U5        [	        U[
        5      (       ad  UR                  R                  5         UR                  R                  5         UR                  R                  R                  R                  S5        g [	        U[        5      (       a%  UR                  R                  R                  SUS9  g g )Nr   rb   r  )r^   r  rF   r  r  r  r'  r  r(  r$  r  r  r  r  r  r  )r]   r  r  r_   s      r7   r  #EvollaPreTrainedModel._init_weights  s    kk++f%fABB!!'')OO!!#!!((--33C8 ABBNN''Sc': Cr9   rj  )ry   rz   r{   r|   r'   r  base_model_prefixsupports_gradient_checkpointingr  _skip_keys_device_placementr  _supports_sdpa_supports_flex_attn_can_compile_fullgraph_supports_attention_backendr  r  _can_record_outputsr  r~   r   r   s   @r7   r  r    s]    &*#
 $5"5N!"'+%
; ;r9   r  c            !         ^  \ rS rSrS\4U 4S jjrS rS r\\	             SS\
R                  S\\
R                     S\\
R                     S	\\   S
\\
R                     S\\   S\\
R                     S\\
R                     S\\
R                     S\\
R                     S\\
R                     S\\
R                     S\\
R                     S\\\4   4S jj5       5       rSrU =r$ )EvollaModeli  r^   c           
      6  > [         TU ]  U5        UR                  U l        UR                  U l        [
        R                  " U R                  UR                  U R                  5      U l        [        US9U l
        [
        R                  " [        UR                  5       Vs/ sH  n[        UUS9PM     sn5      U l        [!        UR                  UR"                  S9U l        ['        US9U l        [+        USS5      U l        U R/                  5         g s  snf )Nr	  r  r>   rg  F)rF   rG   rK   r4   rI   r   rH   rJ   embed_tokensr  protein_encoderrb  rc  rd  r  r  r#  r  r  rb  
rotary_embrT   rg  	post_initr%  s      r7   rG   EvollaModel.__init__  s     !.. ++LL&:L:LdN^N^_36Bmm "'v'?'?!@
 "AI	 #!' "A
 "&"4"4&:M:MN	/v>&-f6NPU&V#s   #Dc                     U R                   $ r   r  r  s    r7   r   EvollaModel.get_input_embeddings  s       r9   c                     Xl         g r   r  r  s     r7   r   EvollaModel.set_input_embeddings  s    !r9   r3   rj   rB   r  rk   r  r0  protein_input_idsprotein_attention_maskstructure_feats	msa_featsrK  rL  r   c                    USL USL-  (       a  [        S5      eUc  U R                  U5      nU(       a  Uc
  [        5       nUcD  Ub  UR                  5       OSn[        R
                  " XUR                  S   -   UR                  S9nUc  UR                  S5      nSnSnUbO  U	bL  U R                  UU	S9nUR                  n[        R                  " S/UR                  S   -  UR                  S9n[        U R                  UUUUS9nUnU R                  UU5      nU R                   H  nU" U4UUUUUUUU
UUUUUS	.UD6nM     U R!                  U5      n[#        UUS
9nU$ )a  
protein_input_ids (torch.LongTensor):
    The input IDs for the protein sequence in structure-aware tokens. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
protein_attention_mask (torch.Tensor):
    The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.
structure_feats (torch.FloatTensor):
    The input IDs for purely structure-based features. Should be of shape `(batch_size, structure_seq_length, structure_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
msa_feats (torch.FloatTensor):
    The input IDs for purely MSA-based features. Should be of shape `(batch_size, msa_seq_length, msa_feat_dim)` and type `torch.FloatTensor`. Dummy input for now.
structure_batch_mask (torch.Tensor):
    The batch mask to decide which protein sequences are purely structure-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `structure_feats`. Dummpy input for now.
msa_batch_mask (torch.Tensor):
    The batch mask to decide which protein sequences are purely MSA-based. Should be of shape `(batch_size)` and type `torch.Tensor`. Should be paired with `msa_feats`. Dummpy input for now.
Nz:You must specify exactly one of input_ids or inputs_embedsr   r&   r   r  T)r^   input_embedsrj   r0  r  )rj   rB   rM  r  r0  rY   rG  rH  rI  rJ  rK  rL  r2  )rl  r  )r   r  r   get_seq_lengthr/   rV   r   rt   re   r  r  r&  r   r^   r  r  r  r   )r]   r3   rj   rB   r  rk   r  r0  r  r  r  r  rK  rL  r  past_seen_tokensprotein_featsrJ  protein_outputsr  r   rY   decoder_layerr"  s                           r7   rp   EvollaModel.forward  s   B -t";<YZZ  --i8M0*nO!CRC^==?de"\\ ]5H5H5K"KTaThThN )33A6L!(-C-O"22+5 3 O ,FFM!&tf7H7N7Nq7Q.QZkZrZr!s(;;&))+
 & #oom\J![[M)*).#-$7"/$3'#5%9- . M )& 		-0(++
 r9   )r  rg  r  r  r4   r  r  rI   )NNNNNNNNNNNNN)ry   rz   r{   r|   r'   rG   r  r  r"   r%   r/   r  r   r   r   r   r   r   r   r   rp   r~   r   r   s   @r7   r  r    sw   | *!"  '+1537+/59$(598<9=7;157;15b##b !.b u//0	b
 "%b   1 12b D>b !!1!12b $E$4$45b !) 6b "%"3"34b E--.b 'u||4b !.b  
u--	.!b  br9   r  c                     ^  \ rS rSrU 4S jrS rS r\\       SS\	R                  S\\	R                     S\\	R                     S\\	R                     S	\	R                  S
\\	R                     S\\   4S jj5       5       rSrU =r$ )EvollaForProteinText2Textiz  c                    > [         TU ]  U5        [        U5      U l        UR                  U l        [
        R                  " UR                  U R                  SS9U l        U R                  5         g r  )
rF   rG   r  r
  rI   r   r   rJ   lm_headr  r\   s     r7   rG   "EvollaForProteinText2Text.__init__{  sQ      (
 ++yy!3!3T__5Qr9   c                 6    U R                   R                  5       $ r   )r
  r  r  s    r7   r  .EvollaForProteinText2Text.get_input_embeddings  s    zz..00r9   c                 8    U R                   R                  U5      $ r   )r
  r  r  s     r7   r  .EvollaForProteinText2Text.set_input_embeddings  s    zz..u55r9   r3   rj   rk   labelsr  r  r  c           
         U R                   " SUUUUUUS.UD6n	U	S   n
U R                  U
5      nSnUb  U R                  " SXU R                  S.UD6n[	        UUU	R
                  U	R                  U	R                  S9nU$ )a|  
protein_input_ids (torch.LongTensor):
    The input IDs for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.LongTensor`.
protein_attention_mask (torch.Tensor):
    The attention mask for the protein sequence. Should be of shape `(batch_size, protein_seq_length)` and type `torch.Tensor`.

Example:

```python
>>> from transformers import EvollaProcessor, EvollaForProteinText2Text
>>> model = EvollaForProteinText2Text.from_pretrained("westlake/Evolla-10B-hf")
>>> processor = EvollaProcessor.from_pretrained("westlake/Evolla-10B-hf")

>>> protein_information = {
    "aa_seq": "your amino acid sequence",
    "foldseek": "your foldseek sequence",
}
>>> question = "What is the function of this protein?"
>>> message = [
    {"role": "system", "content": "You are an AI expert that can answer any questions about protein."},
    {"role": "user", "content": question},
]

>>> inputs = processor(proteins=[protein_information], messages_list=[message], return_tensors="pt", padding="longest")
>>> outputs = model.generate(**inputs)

>>> print(processor.batch_decode(outputs, skip_special_tokens=True))
```)r3   rj   rk   r  r  r  r   N)logitsr  rI   )lossr  r  r   rm  rj  )r
  r  loss_functionrI   r   r  r   rm  )r]   r3   rj   rk   r  r  r  r  r  r   r   r  r  
lm_outputss                 r7   rp   !EvollaForProteinText2Text.forward  s    T ** 
)'/#9
 
  
m,%%iVtibhiD+#33!//))

 r9   )r  r
  rI   rS  )ry   rz   r{   r|   rG   r  r  r#   r"   r/   r  r   r   r   r   rp   r~   r   r   s   @r7   r  r  z  s    16  '+1559-1.29=$(?##? !.?   1 12	?
 ))*? !++? !) 6? D>?  ?r9   r  )r  r  r  )Nr&   )rb   )cr8  r  dataclassesr   typingr   r   r   r/   r   r   activationsr
   cache_utilsr   r   
generationr   integrationsr   masking_utilsr   modeling_flash_attention_utilsr   r   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   r   r   r   r   processing_utilsr    utilsr!   r"   r#   r$   utils.genericr%   configuration_evollar'   r(   r)   
get_loggerry   r	  r8   Moduler;   r   r   r   r   r   r   r   r  r:  r<  rD  rJ  r`  r{  r  r  r  r  r  r  r  r  r#  rb  rx  r  r  r.   r  rg   r  r  r  r  r  r  __all__rj  r9   r7   <module>r     sT  ,   ! , ,   ! . ) 7 / h 9  L  ' R R / < J 
		H	%4 ^=RYY ^=B(
2(
")) (
Vp		 pf
RYY 
m"; mb '4$  /BII /d;ryy 
 
G2 GT9
")) 9
x  */ * **_'!< _'D7 		 7 tA		 A'.		 '.T ?k ?  ?
299 
$k")) k\ Y'JBII J (J(<BII <D		  (6	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % % '(%4C)bii C)LE3 EP ;O ; ;@@' @FP 5 Pf Pr9   