
"""Tokenization class for SpeechT5."""

import os
from shutil import copyfile
from typing import Any, Optional

import sentencepiece as spm

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires
from .number_normalizer import EnglishNumberNormalizer


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "spm_char.model"}


@requires(backends=("sentencepiece",))
class SpeechT5Tokenizer(PreTrainedTokenizer):
    """
    Construct a SpeechT5 tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
    to this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning-of-sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end-of-sequence token.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
            this token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        normalize (`bool`, *optional*, defaults to `False`):
            Whether to convert numeric quantities in the text to their spelled-out English counterparts.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other
            things, to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: Samples from the `nbest_size` best results.
              - `nbest_size < 0`: Assumes the candidate set is infinite and samples from the whole hypothesis
                (lattice) using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens, and IDs).
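
    Example (a minimal usage sketch; the checkpoint name is illustrative, and any SpeechT5 checkpoint that ships a
    `spm_char.model` vocabulary behaves the same way):

    ```python
    >>> from transformers import SpeechT5Tokenizer

    >>> tokenizer = SpeechT5Tokenizer.from_pretrained("microsoft/speecht5_tts")
    >>> inputs = tokenizer("I owe you 12 dollars.", normalize=True)  # "12" is spelled out as "twelve"
    >>> tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])  # character-level pieces plus a trailing </s>
    ```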
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        unk_token="<unk>",
        pad_token="<pad>",
        normalize=False,
        sp_model_kwargs: Optional[dict[str, Any]] = None,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        self.vocab_file = vocab_file
        self.normalize = normalize
        self._normalizer = None

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(vocab_file)

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            normalize=normalize,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
        normalize = kwargs.pop("normalize", self.normalize)
        if is_split_into_words:
            text = " " + text
        if normalize:
            text = self.normalizer(text)
        return (text, kwargs)

    @property
    def vocab_size(self):
        return self.sp_model.get_piece_size()

    @property
    def normalizer(self):
        if self._normalizer is None:
            self._normalizer = EnglishNumberNormalizer()
        return self._normalizer

    @normalizer.setter
    def normalizer(self, value):
        self._normalizer = value

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def __getstate__(self):
        state = self.__dict__.copy()
        # the SentencePiece processor is not picklable; it is rebuilt in __setstate__
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(self.vocab_file)

    def _tokenize(self, text: str) -> list[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.sp_model.piece_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        return token

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using the sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        suffix_ones = [1]
        if token_ids_1 is None:
            return ([0] * len(token_ids_0)) + suffix_ones
        return ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)


__all__ = ["SpeechT5Tokenizer"]
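
# Illustrative sketch (not part of the library): `sp_model_kwargs`, documented in the class
# docstring above, is forwarded verbatim to `spm.SentencePieceProcessor`, so subword
# regularization can be enabled at construction time. The vocabulary path below is hypothetical.
#
#   tokenizer = SpeechT5Tokenizer(
#       "spm_char.model",
#       sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
#   )
#   tokenizer.tokenize("hello world")  # sampled segmentations can vary between calls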