
    <h                         S r SSKrSSKJr  SSKJr  SSKJr  \R                  " \	5      r
SS0rS	 r " S
 S\5      rS/rg)zTokenization classes for ESM.    N)Optional   )PreTrainedTokenizer)logging
vocab_file	vocab.txtc                     [        U S5       nUR                  5       R                  5       nU Vs/ sH  o3R                  5       PM     snsS S S 5        $ s  snf ! , (       d  f       g = f)Nr)openread
splitlinesstrip)r   flinesls       `/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/esm/tokenization_esm.pyload_vocab_filer      sL    	j#	!##%#()5a	5) 
	) 
	s   #AAAA
A'c            
         ^  \ rS rSrSr\rSS/r     SU 4S jjrS\	S\
4S jrS	\
S\	4S
 jrS rS rS	\
S\	4S jrS\	S\
4S jr SS\\	   S\\\	      S\\	   4S jjr SS\S\\   S\S\\	   4S jjrS r\S\	4S j5       rSrU =r$ )EsmTokenizer#   z
Constructs an ESM tokenizer.
	input_idsattention_maskc           	      X  > [        U5      U l        [        [        U R                  5      5      U l        [        U R                  5       VV	s0 sH  u  pX_M	     sn	nU l        [        T
U ]  " SUUUUUS.UD6  U R                  U l        U R                  U R                  5        g s  sn	nf )N)	unk_token	cls_token	pad_token
mask_token	eos_token )
r   
all_tokensdict	enumerate_id_to_token_token_to_idsuper__init__unique_no_split_tokens_update_trie)selfr   r   r   r   r   r   kwargsindtok	__class__s             r   r&   EsmTokenizer.__init__+   s     **5 4??!;<6?6PQ6P(#SX6PQ 	
!	
 	
 '+oo#$556 Rs   B&indexreturnc                 L    U R                   R                  XR                  5      $ Nr#   getr   r)   r/   s     r   _convert_id_to_token!EsmTokenizer._convert_id_to_tokenG         $$UNN;;    tokenc                 ~    U R                   R                  XR                   R                  U R                  5      5      $ r2   r$   r4   r   r)   r:   s     r   _convert_token_to_id!EsmTokenizer._convert_token_to_idJ   .      $$U,=,=,A,A$..,QRRr9   c                 "    UR                  5       $ r2   )split)r)   textr*   s      r   	_tokenizeEsmTokenizer._tokenizeM   s    zz|r9   c                 p    U R                   R                  5       nUR                  U R                  5        U$ r2   )r$   copyupdateadded_tokens_encoder)r)   
base_vocabs     r   	get_vocabEsmTokenizer.get_vocabP   s0    &&++-
$334r9   c                 ~    U R                   R                  XR                   R                  U R                  5      5      $ r2   r<   r=   s     r   token_to_idEsmTokenizer.token_to_idU   r@   r9   c                 L    U R                   R                  XR                  5      $ r2   r3   r5   s     r   id_to_tokenEsmTokenizer.id_to_tokenX   r8   r9   token_ids_0token_ids_1c                     U R                   /nU R                  /nUc  U R                  c  X1-   $ X1-   U-   $ U R                  c  [        S5      eX1-   U-   U-   U-   $ )Nz=Cannot tokenize multiple sequences when EOS token is not set!)cls_token_ideos_token_id
ValueError)r)   rS   rT   clsseps        r    build_inputs_with_special_tokens-EsmTokenizer.build_inputs_with_special_tokens[   sy       !  !  ((((3..&\]] 3&4s::r9   already_has_special_tokensc                     U(       a1  Ub  [        S5      eU Vs/ sH  oDU R                  ;   a  SOSPM     sn$ S/S/[        U5      -  -   S/-   nUb  US/[        U5      -  S/-   -  nU$ s  snf )at  
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods.

Args:
    token_ids_0 (`list[int]`):
        List of ids of the first sequence.
    token_ids_1 (`list[int]`, *optional*):
        List of ids of the second sequence.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
zYou should not supply a second sequence if the provided sequence of ids is already formatted with special tokens for the model.   r   )rX   all_special_idslen)r)   rS   rT   r]   r:   masks         r   get_special_tokens_mask$EsmTokenizer.get_special_tokens_maski   s    $ && R 
 LWW;%$"6"66AA=;WWsqcC,,-3"QC#k**aS00D	 Xs   A+c                    [         R                  R                  X(       a  US-   OSS-   5      n[        US5       nUR	                  SR                  U R
                  5      5        S S S 5        U4$ ! , (       d  f       U4$ = f)N- r   w
)ospathjoinr   writer    )r)   save_directoryfilename_prefixr   r   s        r   save_vocabularyEsmTokenizer.save_vocabulary   si    WW\\.O?S3Hacgr2rs
*c"aGGDIIdoo./ #} #"}s   +A11
Bc                 ,    [        U R                  5      $ r2   )ra   r    )r)   s    r   
vocab_sizeEsmTokenizer.vocab_size   s    4??##r9   )r#   r$   r    r'   )z<unk>z<cls>z<pad>z<mask>z<eos>r2   )NF)__name__
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESvocab_files_namesmodel_input_namesr&   intstrr6   r>   rD   rK   rN   rQ   listr   r[   boolrc   rp   propertyrs   __static_attributes____classcell__)r-   s   @r   r   r   #   s    *$&67
 78<# <# <S# S# S
S S S< < < JN;9;3;DI3F;	c; in.6tnae	c> $C $ $r9   r   )ry   rj   typingr   tokenization_utilsr   utilsr   
get_loggerru   loggerrz   r   r   __all__r   r9   r   <module>r      sR    $ 	  5  
		H	%!;/ *m$& m$` 
r9   