
"""Tokenization class for VITS."""

import json
import os
import re
from typing import Any, Optional, Union

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import is_phonemizer_available, is_uroman_available, logging


if is_phonemizer_available():
    import phonemizer

if is_uroman_available():
    import uroman as ur

logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}


def has_non_roman_characters(input_string):
    # Find any character outside the ASCII (Roman) range
    non_roman_pattern = re.compile(r"[^\x00-\x7F]")

    # Search the input string for non-Roman characters
    match = non_roman_pattern.search(input_string)
    has_non_roman = match is not None
    return has_non_roman


class VitsTokenizer(PreTrainedTokenizer):
    """
    Construct a VITS tokenizer. Also supports MMS-TTS.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        language (`str`, *optional*):
            Language identifier.
        add_blank (`bool`, *optional*, defaults to `True`):
            Whether to insert token id 0 in between the other tokens.
        normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the input text by removing all casing and punctuation.
        phonemize (`bool`, *optional*, defaults to `True`):
            Whether to convert the input text into phonemes.
        is_uroman (`bool`, *optional*, defaults to `False`):
            Whether the `uroman` Romanizer needs to be applied to the input text prior to tokenizing.
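
    Example (a minimal usage sketch; the checkpoint name below is illustrative, and any VITS or MMS-TTS
    checkpoint that ships a `vocab.json` should behave the same way):

    ```python
    >>> from transformers import VitsTokenizer

    >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
    >>> inputs = tokenizer("hello world")
    >>> # With the default `add_blank=True`, token id 0 is interleaved between the character
    >>> # ids, so `input_ids` is roughly twice as long as the input text.
    >>> input_ids = inputs["input_ids"]
    ```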
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        pad_token="<pad>",
        unk_token="<unk>",
        language=None,
        add_blank=True,
        normalize=True,
        phonemize=True,
        is_uroman=False,
        **kwargs,
    ) -> None:
        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)

        self.decoder = {v: k for k, v in self.encoder.items()}
        self.language = language
        self.add_blank = add_blank
        self.normalize = normalize
        self.phonemize = phonemize
        self.is_uroman = is_uroman

        super().__init__(
            pad_token=pad_token,
            unk_token=unk_token,
            language=language,
            add_blank=add_blank,
            normalize=normalize,
            phonemize=phonemize,
            is_uroman=is_uroman,
            **kwargs,
        )

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def normalize_text(self, input_string):
        """Lowercase the input string, respecting any special token ids that may be part or entirely upper-cased."""
        all_vocabulary = list(self.encoder.keys()) + list(self.added_tokens_encoder.keys())
        filtered_text = ""

        i = 0
        while i < len(input_string):
            # Greedily match vocabulary tokens (which may be upper-cased) before falling back to lower-casing.
            found_match = False
            for word in all_vocabulary:
                if input_string[i : i + len(word)] == word:
                    filtered_text += word
                    i += len(word)
                    found_match = True
                    break

            if not found_match:
                filtered_text += input_string[i].lower()
                i += 1

        return filtered_text

    def _preprocess_char(self, text):
        """Special treatment of characters in certain languages"""
        if self.language == "ron":
            text = text.replace("ț", "ţ")
        return text

    def prepare_for_tokenization(
        self, text: str, is_split_into_words: bool = False, normalize: Optional[bool] = None, **kwargs
    ) -> tuple[str, dict[str, Any]]:
        """
        Performs any necessary transformations before tokenization.

        This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the
        `kwargs` at the end of the encoding process to be sure all the arguments have been used.

        Args:
            text (`str`):
                The text to prepare.
            is_split_into_words (`bool`, *optional*, defaults to `False`):
                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
                which it will tokenize.
            normalize (`bool`, *optional*, defaults to `None`):
                Whether or not to apply punctuation and casing normalization to the text inputs. Typically, VITS is
                trained on lower-cased and un-punctuated text. Hence, normalization is used to ensure that the input
                text consists only of lower-case characters.
            kwargs (`dict[str, Any]`, *optional*):
                Keyword arguments to use for the tokenization.

        Returns:
            `tuple[str, dict[str, Any]]`: The prepared text and the unused kwargs.
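
        Example (illustrative only; the exact output depends on the loaded vocabulary and on the
        checkpoint's `normalize`/`phonemize` settings):

        ```python
        >>> # For a character-based MMS-TTS checkpoint (`phonemize=False`), normalization lower-cases
        >>> # the input and strips characters that are missing from the vocabulary:
        >>> text, unused_kwargs = tokenizer.prepare_for_tokenization("Hello, World!", normalize=True)
        ```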
        """
        normalize = normalize if normalize is not None else self.normalize

        if normalize:
            # normalise for casing
            text = self.normalize_text(text)

        filtered_text = self._preprocess_char(text)

        if has_non_roman_characters(filtered_text) and self.is_uroman:
            if not is_uroman_available():
                logger.warning(
                    "Text to the tokenizer contains non-Roman characters. To apply the `uroman` pre-processing "
                    "step automatically, ensure the `uroman` Romanizer is installed with: `pip install uroman` "
                    "Note `uroman` requires python version >= 3.10"
                    "Otherwise, apply the Romanizer manually as per the instructions: https://github.com/isi-nlp/uroman"
                )
            else:
                uroman = ur.Uroman()
                filtered_text = uroman.romanize_string(filtered_text)

        if self.phonemize:
            if not is_phonemizer_available():
                raise ImportError("Please install the `phonemizer` Python package to use this tokenizer.")

            filtered_text = phonemizer.phonemize(
                filtered_text,
                language="en-us",
                backend="espeak",
                strip=True,
                preserve_punctuation=True,
                with_stress=True,
            )
            filtered_text = re.sub(r"\s+", " ", filtered_text)
        elif normalize:
            # strip any chars outside of the vocab (punctuation)
            filtered_text = "".join(list(filter(lambda char: char in self.encoder, filtered_text))).strip()

        return filtered_text, kwargs
         [         R                  R                  U5      (       d  [        R	                  SU S35        g [         R                  R                  X(       a  US-   OS[        S   -   5      n[        USSS9 nUR                  [        R                  " U R                  S	S
SS9S-   5        S S S 5        U4$ ! , (       d  f       U4$ = f)NzVocabulary path (z) should be a directory-rG   r   wr   r   rr   TF)indent	sort_keysensure_ascii
)ospathisdirrd   errorrl   VOCAB_FILES_NAMESr)   writer*   dumpsr,   )r1   r   r   r   fs        r   save_vocabularyVitsTokenizer.save_vocabulary   s    ww}}^,,LL,^,<<STUWW\\o_s22QbcoQpp

 *cG4GGDJJt||ATYZ]aab 5 } 54 }s   ?4B>>
C)r$   r.   r,   r'   r#   r%   r&   )z<pad>z<unk>NTTTF)r   N)FNr9   )__name__
__module____qualname____firstlineno____doc__r   vocab_files_namesmodel_input_namesr0   propertyr;   rD   rP   rV   strboolr   tupledictr   ro   rI   rv   ry   r~   rs   r   r   __static_attributes____classcell__)r6   s   @r   r   r   /   s   * *$&67
  
 
 
  
D ! !
* Y]?%?%.2?%GOPT~?%	sDcN"	#?%B	c 	d3i 	tCy S 
I'c HSM ]bchilcmoscs]t  r   r   )r   r*   r   r   typingr   r   r   tokenization_utilsr   utilsr   r	   r
   rj   rn   rf   
get_loggerr   rd   r   r   r   __all__r(   r   r   <module>r      sv    #  	 	 ' ' 5 J J 			H	%!<0 D' DN 
r   