
"""Tokenization class for VITS."""

import json
import os
import re
from typing import Any, Optional, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_phonemizer_available, is_uroman_available, logging


if is_phonemizer_available():
    import phonemizer

if is_uroman_available():
    import uroman as ur

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}


def has_non_roman_characters(input_string):
    # Find any character outside the ASCII (Roman) range
    non_roman_pattern = re.compile(r"[^\x00-\x7F]")

    # Search the input string for non-Roman characters
    match = non_roman_pattern.search(input_string)
    has_non_roman = match is not None
    return has_non_roman


class VitsTokenizer(PreTrainedTokenizer):
    """
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
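
    Example (a minimal usage sketch; the checkpoint name below is illustrative, and any VITS or MMS-TTS
    checkpoint that ships a `vocab.json` should behave the same way):

    ```python
    >>> from transformers import VitsTokenizer

    >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
    >>> inputs = tokenizer("hello world")
    >>> # With the default `add_blank=True`, token id 0 is interleaved between the character
    >>> # ids, so `input_ids` is roughly twice as long as the input text.
    >>> input_ids = inputs["input_ids"]
    ```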
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        pad_token="<pad>",
        unk_token="<unk>",
        language=None,
        add_blank=True,
        normalize=True,
        phonemize=True,
        is_uroman=False,
        **kwargs,
    ) -> None:
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)

        self.decoder = {v: k for k, v in self.encoder.items()}
        self.language = language
        self.add_blank = add_blank
        self.normalize = normalize
        self.phonemize = phonemize
        self.is_uroman = is_uroman

        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            language=language,
            add_blank=add_blank,
            normalize=normalize,
            phonemize=phonemize,
            is_uroman=is_uroman,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def normalize_text(self, input_string):
        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
        filtered_text = ""

        i = 0
        while i < len(input_string):
            # Greedily match vocabulary tokens (which may be upper-cased) before falling back to lower-casing.
            found_match = False
            for word in all_vocabulary:
                if input_string[i : i + len(word)] == word:
                    filtered_text += word
                    i += len(word)
                    found_match = True
                    break

            if not found_match:
                filtered_text += input_string[i].lower()
                i += 1

        return filtered_text

    def _preprocess_char(self, text):
        """Special treatment of characters in certain languages"""
        if self.language == "ron":
            text = text.replace("ț", "ţ")
        return text

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
    ) -> tuple[str, dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
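
        Example (illustrative only; the exact output depends on the loaded vocabulary and on the
        checkpoint's `normalize`/`phonemize` settings):

        ```python
        >>> # For a character-based MMS-TTS checkpoint (`phonemize=False`), normalization lower-cases
        >>> # the input and strips characters that are missing from the vocabulary:
        >>> text, unused_kwargs = tokenizer.prepare_for_tokenization("Hello, World!", normalize=True)
        ```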
        """
        normalize = normalize if normalize is not None else self.normalize

        if normalize:
            # normalise for casing
            text = self.normalize_text(text)

        filtered_text = self._preprocess_char(text)

        if has_non_roman_characters(filtered_text) and self.is_uroman:
            if not is_uroman_available():
                logger.warning(
                    "Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing "
                    "step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` "
                    "Note `uroman` requires python version >= 3.10"
                    "Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uroman"
                )
            else:
                uroman = ur.Uroman()
                filtered_text = uroman.romanize_string(filtered_text)

        if self.phonemize:
            if not is_phonemizer_available():
                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")

            filtered_text = phonemizer.phonemize(
                filtered_text,
                language="en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=True,
                with_stress=True,
            )
            filtered_text = re.sub(r"\s+", " ", filtered_text)
        elif normalize:
            # strip any chars outside of the vocab (punctuation)
            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()

        return filtered_text, kwargs
         [         R                  R                  U5      (       d  [        R	                  SU S35        g [         R                  R                  X(       a  US-   OS[        S   -   5      n[        USSS9 nUR                  [        R                  " U R                  S	S
SS9S-   5        S S S 5        U4$ ! , (       d  f       U4$ = f)NzVocabulary path (z) should be a directory-rG   r   wr   r   rr   TF)indent	sort_keysensure_ascii
)ospathisdirrd   errorrl   VOCAB_FILES_NAMESr)   writer*   dumpsr,   )r1   r   r   r   fs        r   save_vocabularyVitsTokenizer.save_vocabulary   s    ww}}^,,LL,^,<<STUWW\\o_s22QbcoQpp

 *cG4GGDJJt||ATYZ]aab 5 } 54 }s   ?4B>>
C)r$   r.   r,   r'   r#   r%   r&   )z<pad>z<unk>NTTTF)r   N)FNr9   )__name__
__module____qualname____firstlineno____doc__r   vocab_files_namesmodel_input_namesr0   propertyr;   rD   rP   rV   strboolr   tupledictr   ro   rI   rv   ry   r~   rs   r   r   __static_attributes____classcell__)r6   s   @r   r   r   /   s   * *$&67
  
 
 
  
D ! !
* Y]?%?%.2?%GOPT~?%	sDcN"	#?%B	c 	d3i 	tCy S 
I'c HSM ]bchilcmoscs]t  r   r   )r   r*   r   r   typingr   r   r   tokenization_utilsr   utilsr   r	   r
   rj   rn   rf   
get_loggerr   rd   r   r   r   __all__r(   r   r   <module>r      sv    #  	 	 ' ' 5 J J 			H	%!<0 D' DN 
r   