"""Tokenization class for Dia."""

from typing import Optional

from ...tokenization_utils import AddedToken, PreTrainedTokenizer
from ...utils import logging


logger = logging.get_logger(__name__)


class DiaTokenizer(PreTrainedTokenizer):
    """
    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should
    refer to this superclass for more information regarding those methods.

    Args:
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        unk_token (`str`, *optional*, defaults to `"<pad>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        max_length (`int`, *optional*, defaults to 1024):
            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
        offset (`int`, *optional*, defaults to 0):
            The offset of the tokenizer.
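
    Example (illustrative; the ids shown assume the default `offset=0`, under which each character maps to
    its raw UTF-8 byte value and `[S1]`/`[S2]` are matched as whole added tokens):

    ```python
    >>> tokenizer = DiaTokenizer()
    >>> tokenizer("[S1] Hi")["input_ids"]
    [1, 32, 72, 105]
    ```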
    """

    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        pad_token: Optional[str] = "<pad>",
        unk_token: Optional[str] = "<pad>",
        max_length: Optional[int] = 1024,
        offset: int = 0,
        **kwargs,
    ):
        pad_token = AddedToken(pad_token) if isinstance(pad_token, str) else pad_token
        unk_token = AddedToken(unk_token) if isinstance(unk_token, str) else unk_token

        self._utf_vocab_size = 2**8  # utf is 8 bits

        # The pad token and the two speaker tags get fixed ids in the decoder
        self._added_tokens_decoder = {0: pad_token, 1: AddedToken("[S1]"), 2: AddedToken("[S2]")}
        self.offset = offset
        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            max_length=max_length,
            **kwargs,
        )
    @property
    def vocab_size(self):
        return self._utf_vocab_size

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> list[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        tokens = [chr(i) for i in text.encode("utf-8")]
        return tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if len(token) != 1:
            token_id = None
        else:
            token_id = ord(token) + self.offset
        return token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = chr(index - self.offset)
        return token

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Converts a sequence of tokens (string) in a single string."""
        bstring = b""
        for token in tokens:
            if token in self.added_tokens_decoder:
                added_token_obj = self.added_tokens_decoder[token]
                tok_string = str(added_token_obj).encode("utf-8")
            elif token in self.added_tokens_encoder:
                tok_string = token.encode("utf-8")
            else:
                tok_string = token.encode("utf-8")
            bstring += tok_string
        string = bstring.decode("utf-8", errors="ignore")
        return string

    # DiaTokenizer has no vocabulary file, so there is nothing to write out
    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        return ()


__all__ = ["DiaTokenizer"]
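

# A minimal usage sketch (illustrative, not part of the library module itself); it assumes the
# default `offset=0`, under which every non-special character encodes to its raw UTF-8 byte value.
if __name__ == "__main__":
    tokenizer = DiaTokenizer()

    # "[S1]" is matched as a whole added token; the rest of the text is byte-level encoded.
    encoded = tokenizer("[S1] Hi")["input_ids"]
    print(encoded)  # expected: [1, 32, 72, 105]

    # Walking back through the token strings reconstructs the original text.
    tokens = tokenizer.convert_ids_to_tokens(encoded)
    print(tokenizer.convert_tokens_to_string(tokens))  # expected: "[S1] Hi"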