
    <h,                     h    S r SSKrSSKJrJr  SSKJr  SSKJr  SSK	J
r
  / S	Qr " S
 S\5      rS/rg)z
Processor class for EVOLLA.
    N)OptionalUnion   )BatchFeature)ProcessorMixin   )AutoTokenizer)aa_seqfoldseekmsac            
         ^  \ rS rSrSrSS/rS/rSrSrSr	SU 4S jjr
SS jr SS	\4S
 jjr    SS\\\\   \4      S\\\\\      \\   4      S\\   S	\\   4S jjrS rS rS rS rU 4S jr\U 4S j5       rSrU =r$ )EvollaProcessor    a:  
Constructs a EVOLLA processor which wraps a LLama tokenizer and SaProt tokenizer (EsmTokenizer) into a single processor.

[`EvollaProcessor`] offers all the functionalities of [`EsmTokenizer`] and [`LlamaTokenizerFast`]. See the
docstring of [`~EvollaProcessor.__call__`] and [`~EvollaProcessor.decode`] for more information.

Args:
    protein_tokenizer (`EsmTokenizer`):
        An instance of [`EsmTokenizer`]. The protein tokenizer is a required input.
    tokenizer (`LlamaTokenizerFast`, *optional*):
        An instance of [`LlamaTokenizerFast`]. The tokenizer is a required input.
    protein_max_length (`int`, *optional*, defaults to 1024):
        The maximum length of the sequence to be generated.
    text_max_length (`int`, *optional*, defaults to 512):
        The maximum length of the text to be generated.
protein_tokenizer	tokenizersequence_max_lengthr	   c                    > Uc  [        S5      eUc  [        S5      e[        TU ]	  X5        SU R                  l        X0l        X@l        g )Nz+You need to specify an `protein_tokenizer`.z"You need to specify a `tokenizer`.z<|reserved_special_token_0|>)
ValueErrorsuper__init__r   	pad_tokenprotein_max_lengthtext_max_length)selfr   r   r   r   kwargs	__class__s         d/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/evolla/processing_evolla.pyr   EvollaProcessor.__init__;   sM    $JKKABB*6#A "4.    c           
      b   / nU H  nUR                  S5      nUR                  S5      nSR                  [        XV5       VVs/ sH&  u  pxUR                  5       UR	                  5       -   PM(     snn5      n	UR                  U	5        M     U R                  R                  USSUSS9n
U
$ s  snnf )Nr
   r    ptT)return_tensors
truncation
max_lengthpadding)getjoinzipupperlowerappendr   batch_encode_plus)r   proteinsr   sa_sequencesproteinr
   r   sfsa_sequence	sa_tokenss              r   process_proteins EvollaProcessor.process_proteinsG   s    G[[*F{{:.H''SEZ"[EZTQ1779qwwy#8EZ"[\K,	   **<<$K]gk = 
	  #\s   ,B+r   c           	          / nU H/  nU R                   R                  USSS9nUR                  U5        M1     U R                  USSSSUS9nU$ )NFT)tokenizeadd_generation_promptr"   longest)add_special_tokensr#   r&   r$   r%   )r   apply_chat_templater,   )r   textsr   promptsmessagespromptprompt_inputss          r   process_textEvollaProcessor.process_textT   sr    
 H^^77&* 8 F
 NN6"  $& ' 
 r   r.   messages_listr   c                 J   Ub  Uc  [        S5      eUb  UOU R                  nUb  UOU R                  n[        U[        5      (       a  U/n[        U[
        [        45      (       a!  [        US   [
        [        45      (       d  U/n[        U[
        [        45      (       a"  [        S U 5       5      (       d  [        S5      e[        U[
        [        45      (       a;  [        S U 5       5      (       d$  [        SSR                  [        5       SU 35      e[        U[
        [        45      (       a  U H  n[        U[
        [        45      (       d  [        S	[        U5       S
35      e[        S U 5       5      (       d  [        S5      e[        S U 5       5      (       d  [        S U 5       5      (       d  M  [        SU 35      e   O[        S[        U5       S
35      eU R                  X5      nU R                  X$5      n[        US   US   US   US   S.S9$ )a  This method takes batched or non-batched proteins and messages_list and converts them into format that can be used by
the model.

Args:
    proteins (`Union[List[dict], dict]`):
        A list of dictionaries or a single dictionary containing the following keys:
            - `"aa_seq"` (`str`) -- The amino acid sequence of the protein.
            - `"foldseek"` (`str`) -- The foldseek string of the protein.
    messages_list (`Union[List[List[dict]], List[dict]]`):
        A list of lists of dictionaries or a list of dictionaries containing the following keys:
            - `"role"` (`str`) -- The role of the message.
            - `"content"` (`str`) -- The content of the message.
    protein_max_length (`int`, *optional*, defaults to 1024):
        The maximum length of the sequence to be generated.
    text_max_length (`int`, *optional*, defaults to 512):
        The maximum length of the text.

Return:
    a dict with following keys:
        - `protein_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the protein sequence.
        - `protein_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the protein sequence.
        - `text_input_ids` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The input IDs for the text sequence.
        - `text_attention_mask` (`torch.Tensor` of shape `(batch_size, sequence_length)`) -- The attention mask for the text sequence.
z3You need to specify `messages_list` and `proteins`.r   c              3   @   #    U H  n[        U[        5      v   M     g 7fN
isinstancedict.0ps     r   	<genexpr>+EvollaProcessor.__call__.<locals>.<genexpr>   s     :aX`ST:a;N;NX`   zUThe proteins should be a list of dictionaries, but not all elements are dictionaries.c              3   `   #    U H%  n[        S  UR                  5        5       5      v   M'     g7f)c              3   0   #    U H  o[         ;   v   M     g 7frG   )PROTEIN_VALID_KEYS)rL   ks     r   rN   5EvollaProcessor.__call__.<locals>.<genexpr>.<genexpr>   s     :A''s   N)allkeysrK   s     r   rN   rO      s'      ;
DLqC::::Hs   ,.z2There should be a list of dictionaries with keys: z, z for each protein.But got: z;Each messages in messages_list should be a list instead of .c              3   @   #    U H  n[        U[        5      v   M     g 7frG   rH   rL   ms     r   rN   rO      s     A1:a..rP   zfEach message in messages_list should be a list of dictionaries, but not all elements are dictionaries.c              3   X   #    U H!  n[        UR                  5       5      S :g  v   M#     g7f)r   N)lenrW   rZ   s     r   rN   rO      s     <8as1668})8s   (*c              3   \   #    U H#  n[        UR                  5       5      S S1:g  v   M%     g7f)rolecontentN)setrW   rZ   s     r   rN   rO      s(      DBJQCMfi%88(s   *,zlEach message in messages_list should be a list of dictionaries with two keys: 'role' and 'content'.But got: zFThe messages_list should be a list of lists of dictionaries, but it's 	input_idsattention_mask)protein_input_idsprotein_attention_maskrb   rc   )data)r   r   r   rI   rJ   listtuplerV   r(   rS   typeanyr5   rB   r   )	r   r.   rD   r   r   r   r?   r4   text_tokenss	            r   __call__EvollaProcessor.__call__l   sK   B }4RSS3E3Q/W[WnWn-<-H/dNbNb h%% zHmdE]33J}UVGWZ^`eYf<g<g*OMhu..s:aX`:a7a7atuuhu..s ;
DL;
 8
 8
 D99/01 2$:'  mdE]33)!(T5M::$'bcghpcqbrrs%tuuAAAA$ A  <8<<< DBJD A A %$$,:/  * XY]^kYlXmmno  ))(G	''G%.{%;*34D*E(5"-.>"?	
 	
r   c                 :    U R                   R                  " U0 UD6$ rG   )r   batch_decoder   argsr   s      r   ro   EvollaProcessor.batch_decode   s    ~~**D;F;;r   c                 :    U R                   R                  " U0 UD6$ rG   )r   decoderp   s      r   rt   EvollaProcessor.decode   s    ~~$$d5f55r   c                 :    U R                   R                  " U0 UD6$ rG   )r   ro   rp   s      r   protein_batch_decode$EvollaProcessor.protein_batch_decode   s    %%22DCFCCr   c                 :    U R                   R                  " U0 UD6$ rG   )r   rt   rp   s      r   protein_decodeEvollaProcessor.protein_decode   s    %%,,d=f==r   c                   > U R                   R                  [        R                  R	                  XR
                  5      5        SU R                  ;   nU(       a  U R                  R                  S5      OS nU(       a  Ub  U R                  R                  S5        [        TU ]  " U40 UD6nU(       a  Ub  U R                  R                  US5        U$ )Nr   )r   save_pretrainedospathr(   protein_tokenizer_dir_name
attributesindexremover   insert)r   save_directoryr   protein_tokenizer_presentprotein_tokenizer_indexoutputsr   s         r   r}   EvollaProcessor.save_pretrained   s    ..rww||NLkLk/lm %84??$J!Pi$//"7"78K"Los$)@)LOO""#67').CFC$)@)LOO""#:<OPr   c                    > [         TU ]  " U40 UD6n[        U[        5      (       a  US   n[        R                  " XR
                  S9nXCl        U$ )Nr   )	subfolder)r   from_pretrainedrI   rh   r	   r   r   )clspretrained_model_name_or_pathr   	processorr   r   s        r   r   EvollaProcessor.from_pretrained   sX    G+,ITVT	 i''!!I)99)5S5S
 '8#r   )r   r   )N      )r   )r   )NNNN)__name__
__module____qualname____firstlineno____doc__r   valid_kwargsprotein_tokenizer_classtokenizer_classr   r   r5   intrB   r   r   rg   rJ   rl   ro   rt   rw   rz   r}   classmethodr   __static_attributes____classcell__)r   s   @r   r   r       s    " &{3J)*L .%O!4
/   # 4 7;GK,0)-W
5dT!123W
  d4:&6T
&B CDW
 %SM	W

 "#W
r<6D>
(  r   r   )r   r~   typingr   r   feature_extraction_utilsr   processing_utilsr   autor	   rS   r   __all__ r   r   <module>r      s@    
 " 4 ! 3 Tn Tn 
r   