
"""
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
"""

import copy
import json
import os
from collections import defaultdict
from collections.abc import Iterable
from typing import Any, Optional, Union

import tokenizers.pre_tokenizers as pre_tokenizers_fast
from tokenizers import Encoding as EncodingFast
from tokenizers import Tokenizer as TokenizerFast
from tokenizers.decoders import Decoder as DecoderFast
from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer

from .convert_slow_tokenizer import convert_slow_tokenizer
from .integrations.ggml import convert_gguf_tokenizer
from .modeling_gguf_pytorch_utils import load_gguf_checkpoint
from .tokenization_utils import PreTrainedTokenizer
from .tokenization_utils_base import (
    INIT_TOKENIZER_DOCSTRING,
    AddedToken,
    BatchEncoding,
    PreTokenizedInput,
    PreTokenizedInputPair,
    PreTrainedTokenizerBase,
    SpecialTokensMixin,
    TextInput,
    TextInputPair,
    TruncationStrategy,
)
from .utils import PaddingStrategy, add_end_docstrings, logging


logger = logging.get_logger(__name__)

# Fast tokenizers (provided by the HuggingFace tokenizers library) can be saved in a single file
TOKENIZER_FILE = "tokenizer.json"
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
TIKTOKEN_VOCAB_FILE = "tokenizer.model"

# Slow tokenizers additionally keep their added tokens in a separate file
ADDED_TOKENS_FILE = "added_tokens.json"

INIT_TOKENIZER_DOCSTRING += """
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
"""

MODEL_TO_TRAINER_MAPPING = {
    "BPE": BpeTrainer,
    "Unigram": UnigramTrainer,
    "WordLevel": WordLevelTrainer,
    "WordPiece": WordPieceTrainer,
}

VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE}


@add_end_docstrings(INIT_TOKENIZER_DOCSTRING)
class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
    """
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
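
    Example (an illustrative sketch rather than an exhaustive reference; it assumes a local `tokenizer.json`
    produced by the 🤗 tokenizers library or by `save_pretrained`, and a model that uses an `[UNK]` token):

    ```python
    >>> from transformers import PreTrainedTokenizerFast

    >>> # wrap a serialized tokenizers.Tokenizer file directly
    >>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json", unk_token="[UNK]")
    >>> encoding = fast_tokenizer("Hello world")  # a `BatchEncoding` with `input_ids`, `attention_mask`, ...
    ```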
Nslow_tokenizer_classc           	        > UR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUR                  SS 5      nUR                  SS5      nUR                  S0 5      nUR                  SS5      U l        U(       a  Uc  U R                  c  [	        S	5      eUb  [
        R                  " U5      n	GOUb  U(       d  [        R                  " U5      n	OU(       a  [        U5      n	OUbk  [        UR                  S
5      5      n
U
S   S   nU
S   nU
S   n[        X5      u  pUR                  U5        [        U5      S:  a  UR                  U5        O|U R                  b#  USLa  U R                  " U0 UD6n[        U5      n	OLU(       d:  UR                  S
5      U l        UR                  S/ 5      U l        [        U SS9n	S nO[	        S5      eXl        Ub  UR                  UR"                  5        SU l        U R                   R&                  nUbq  U R                   R(                  " S$0 UD6  UR+                  SUS   5        UR+                  SUS   5        UR+                  SUS   5        UR+                  SUS   5        OU R                   R-                  5         U R                   R.                  nUb  U R                   R0                  " S$0 UD6  UR+                  SUS   5        UR+                  SUS   5        UR+                  SUS   5        UR+                  SUS   5        UR+                  SUS   5        [2        TU ]h  " S$0 UD6  U R6                  U R                   l        U R:                   Vs1 sH  n[=        [?        U5      5      iM     nn[A        URC                  5       S  S!9 VVs/ sH"  u  nn[=        [?        U5      5      U;  d  M   UPM$     nnn[E        U RF                  RI                  5       5      U Vs/ sH  n[K        U5      PM     sn-   nUU RL                   Vs/ sH  nUU;  d  M  UU;  d  M  UPM     sn-  n[        U5      S:  a  / nU RN                  nU H  n[Q        U[R        5      (       a!  URT                  =(       d    [K        U5      U;   O[K        U5      U;   n[Q        U[J        5      (       a  [S        UUS"9nOUUl*        URW                  U5        M     U(       a  U RY                  U5         [Z        R\                  " U R^                  R`                  Rc                  5       5      nUR                  SU R                  5      U R                  :w  aF  [e        [f        UR                  S#5      5      nU R                  US'   U" S$0 UD6U R^                  l0        g g s  snf s  snnf s  snf s  snf ! [h         a     g f = f)%Ntokenizer_object__slow_tokenizer	gguf_filer%   	from_slowFadded_tokens_decoderadd_prefix_spacezCannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you have sentencepiece installed.r&   config
model_type	tokenizertokenizer_configr   additional_special_tokensT)from_tiktokena9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.
max_lengthtruncation_side	directionstridetruncation_strategystrategy	pad_tokenpad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofc                     U S   $ Nr    )xs    \/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/tokenization_utils_fast.py<lambda>2PreTrainedTokenizerFast.__init__.<locals>.<lambda>   s    STUVSW    key)specialtyperF   )5popgetr1   r*   
ValueErrorcopydeepcopyTokenizerFast	from_filer   r   r   updatelenr&   r6   
_tokenizerinit_kwargs_decode_use_source_tokenizer
truncationenable_truncation
setdefaultno_truncationpaddingenable_paddingsuper__init__split_special_tokensencode_special_tokensr0   hashreprsorteditemslistadded_tokens_encoderkeysstrall_special_tokens_extendedall_special_tokens
isinstancer   rN   append
add_tokensjsonloadsbackend_tokenizerpre_tokenizer__getstate__getattrpre_tokenizers_fast	Exception)selfargskwargsr,   slow_tokenizerr.   fast_tokenizer_filer/   r0   fast_tokenizer
gguf_paramarchitecturetokenizer_dictr5   additional_kwargs_truncation_paddingtokenadded_tokens_decoder_hashindextokens_to_addencodertokensspecial_tokens
is_specialpre_tok_statepre_tok_class	__class__s                              rH   rc    PreTrainedTokenizerFast.__init__b   s~   !::&8$?$6=JJ{D1	$jj)94@JJ{E2	%zz*@"E &

+=u E/D4M4M4U0 
 '!]]+;<N ,Y*445HIN3NCN"-fjj.FGJ%h/=L'4N)*<=0F|0d-NMM*+$%)/0&&2~U7R!66GGN3NCN$jj6DO-3ZZ8SUW-XD*3DMN!Nr  )%MM.445,1)oo00"OO--<<lK,EF/[1IJhH(=>3[5LMOO))+??**OO**6X6k8K+@A18M3JKnh{.CDlHX,>?2H=Q4RS 	"6"040I0I-DHD]D]$^D]5T$u+%6D]!$^ !'';'A'A'C X
 XuDK (AA  X 	 

 t005578Ta;bTa5CJTa;bb#??
?e5PWCWE\aiv\vE?
 	
 }!F!44N& "%44 ]]Bc%jN&BU~5 
 eS))&ujAE$.EMe$ ' '
	 JJt'='='K'K'X'X'Z[M  !3T5J5JKtOdOdd '(;]=N=Nv=V W484I4I017D7U}7U&&4 e? %_

 <c
6  	 		s=   V>W#WW	:	WWWB'W 
W W returnc                     g)NTrF   r{   s    rH   is_fastPreTrainedTokenizerFast.is_fast   s    rK   c                    SU R                   ;   ao  U R                   S   R                  S5      (       aL  [        U S5      (       a:  U R                  (       a)  [        R
                  R                  U R                  5      $ gg)z
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
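
        Example (illustrative sketch; assumes a sentencepiece-based checkpoint whose original `tokenizer.model`
        file is still on disk):

        ```python
        >>> # only attempt a legacy (slow-format) export when the original vocabulary file is available
        >>> if tokenizer.can_save_slow_tokenizer:
        ...     tokenizer.save_pretrained("exported-tokenizer", legacy_format=True)
        ```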
        """
        if "vocab_file" in self.vocab_files_names and self.vocab_files_names["vocab_file"].endswith(".model"):
            # Sentencepiece-based tokenizers need the original `.model` file to be re-exported in the slow format
            if hasattr(self, "vocab_file") and self.vocab_file:
                return os.path.isfile(self.vocab_file)
            return False
        return True

    @property
    def vocab_size(self) -> int:
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=False)

    def get_vocab(self) -> dict[str, int]:
        return self._tokenizer.get_vocab(with_added_tokens=True)

    @property
    def vocab(self) -> dict[str, int]:
        return self.get_vocab()

    @property
    def added_tokens_encoder(self) -> dict[str, int]:
        """
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    @property
    def added_tokens_decoder(self) -> dict[int, AddedToken]:
        """
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        """
        return self._tokenizer.get_added_tokens_decoder()

    def get_added_vocab(self) -> dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        """
        return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])}

    def __bool__(self) -> bool:
        """
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        """
        return True

    def __len__(self) -> int:
        """
        Size of the full vocabulary with the added tokens.
        """
        return self._tokenizer.get_vocab_size(with_added_tokens=True)

    @property
    def backend_tokenizer(self) -> TokenizerFast:
        """
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        """
        return self._tokenizer

    @property
    def decoder(self) -> DecoderFast:
        """
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        """
        return self._tokenizer.decoder

    def _convert_encoding(
        self,
        encoding: EncodingFast,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
    ) -> tuple[dict[str, Any], list[EncodingFast]]:
        """
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
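
        Example of the returned structure (illustrative sketch for one encoding that produced a single overflow;
        the actual ids depend on the vocabulary):

        ```python
        >>> # encoding_dict, encodings = self._convert_encoding(encoding, return_length=True)
        >>> # encoding_dict == {"input_ids": [[...first window...], [...overflow window...]],
        >>> #                   "attention_mask": [[1, 1, ...], [1, 1, ...]],
        >>> #                   "length": [..., ...]}
        ```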
        """
        if return_token_type_ids is None:
            return_token_type_ids = "token_type_ids" in self.model_input_names
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        if return_overflowing_tokens and encoding.overflowing is not None:
            encodings = [encoding] + encoding.overflowing
        else:
            encodings = [encoding]

        encoding_dict = defaultdict(list)
        for e in encodings:
            encoding_dict["input_ids"].append(e.ids)
            if return_token_type_ids:
                encoding_dict["token_type_ids"].append(e.type_ids)
            if return_attention_mask:
                encoding_dict["attention_mask"].append(e.attention_mask)
            if return_special_tokens_mask:
                encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
            if return_offsets_mapping:
                encoding_dict["offset_mapping"].append(e.offsets)
            if return_length:
                encoding_dict["length"].append(len(e.ids))

        return encoding_dict, encodings

    def convert_tokens_to_ids(self, tokens: Union[str, Iterable[str]]) -> Union[int, list[int]]:
        """
        Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the
        vocabulary.

        Args:
            tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `int` or `list[int]`: The token id or list of token ids.
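
        Example (illustrative; the returned ids depend on the loaded vocabulary):

        ```python
        >>> tokenizer.convert_tokens_to_ids("hello")  # -> a single `int`
        >>> tokenizer.convert_tokens_to_ids(["hello", "world"])  # -> `list[int]`
        ```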
        """
        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        return [self._convert_token_to_id_with_added_voc(token) for token in tokens]

    def _convert_token_to_id_with_added_voc(self, token: str) -> int:
        index = self._tokenizer.token_to_id(token)
        if index is None:
            return self.unk_token_id
        return index

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        return self._tokenizer.id_to_token(int(index))

    def _add_tokens(self, new_tokens: list[Union[str, AddedToken]], special_tokens: bool = False) -> int:
        if special_tokens:
            return self._tokenizer.add_special_tokens(new_tokens)

        return self._tokenizer.add_tokens(new_tokens)

    def num_special_tokens_to_add(self, pair: bool = False) -> int:
        """
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
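
        Example (illustrative; the exact count depends on the post-processing template, e.g. 2 for a
        BERT-style `[CLS] ... [SEP]` single sequence):

        ```python
        >>> budget = tokenizer.model_max_length - tokenizer.num_special_tokens_to_add(pair=False)
        >>> # `budget` is how many content tokens fit once the special tokens are accounted for
        ```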
        """
        return self._tokenizer.num_special_tokens_to_add(pair)

    def convert_ids_to_tokens(
        self, ids: Union[int, list[int]], skip_special_tokens: bool = False
    ) -> Union[str, list[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
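
        Example (illustrative; ids stand for whatever the vocabulary contains):

        ```python
        >>> ids = tokenizer("Hello world")["input_ids"]
        >>> tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True)
        ```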
        """
        if isinstance(ids, int):
            return self._tokenizer.id_to_token(ids)
        tokens = []
        ids_to_skip = set(self.all_special_ids) if skip_special_tokens else set()
        for index in ids:
            index = int(index)
            if index in ids_to_skip:
                continue
            tokens.append(self._tokenizer.id_to_token(index))
        return tokens

    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
        return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens()

    def set_truncation_and_padding(
        self,
        padding_strategy: PaddingStrategy,
        truncation_strategy: TruncationStrategy,
        max_length: int,
        stride: int,
        pad_to_multiple_of: Optional[int],
        padding_side: Optional[str],
    ):
        """
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        """
        _truncation = self._tokenizer.truncation
        _padding = self._tokenizer.padding

        # Set truncation on the backend tokenizer only when the requested target differs from the current state
        if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE:
            if _truncation is not None:
                self._tokenizer.no_truncation()
        else:
            target = {
                "max_length": max_length,
                "stride": stride,
                "strategy": truncation_strategy.value,
                "direction": self.truncation_side,
            }
            # `_truncation` might contain more keys than the target supports; compare only the target keys
            if _truncation is None:
                current = None
            else:
                current = {k: _truncation.get(k, None) for k in target}
            if current != target:
                self._tokenizer.enable_truncation(**target)

        if padding_strategy == PaddingStrategy.DO_NOT_PAD:
            if _padding is not None:
                self._tokenizer.no_padding()
        else:
            length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None
            target = {
                "length": length,
                "direction": padding_side if padding_side is not None else self.padding_side,
                "pad_id": self.pad_token_id,
                "pad_token": self.pad_token,
                "pad_type_id": self.pad_token_type_id,
                "pad_to_multiple_of": pad_to_multiple_of,
            }
            if _padding != target:
                self._tokenizer.enable_padding(**target)

    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs: Union[
            list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair]
        ],
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
    ) -> BatchEncoding:
        if not isinstance(batch_text_or_text_pairs, (tuple, list)):
            raise TypeError(
                f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})"
            )

        # Set the truncation and padding strategy on the backend tokenizer
        self.set_truncation_and_padding(
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
        )

        if self._tokenizer.encode_special_tokens != split_special_tokens:
            self._tokenizer.encode_special_tokens = split_special_tokens

        encodings = self._tokenizer.encode_batch(
            batch_text_or_text_pairs,
            add_special_tokens=add_special_tokens,
            is_pretokenized=is_split_into_words,
        )

        # Convert each low-level Encoding (plus its overflows) into a python dict
        tokens_and_encodings = [
            self._convert_encoding(
                encoding=encoding,
                return_token_type_ids=return_token_type_ids,
                return_attention_mask=return_attention_mask,
                return_overflowing_tokens=return_overflowing_tokens,
                return_special_tokens_mask=return_special_tokens_mask,
                return_offsets_mapping=return_offsets_mapping,
                return_length=return_length,
                verbose=verbose,
            )
            for encoding in encodings
        ]

        # Flatten from (batch, overflows, sequence length) to roughly (batch * overflows, sequence length)
        sanitized_tokens = {}
        for key in tokens_and_encodings[0][0].keys():
            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
            sanitized_tokens[key] = stack
        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]

        # When returning overflowing tokens, also return a mapping from each overflow back to its original sample
        if return_overflowing_tokens:
            overflow_to_sample_mapping = []
            for i, (toks, _) in enumerate(tokens_and_encodings):
                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping

        for input_ids in sanitized_tokens["input_ids"]:
            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)

    def _encode_plus(
        self,
        text: Union[TextInput, PreTokenizedInput],
        text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None,
        add_special_tokens: bool = True,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = None,
        return_overflowing_tokens: bool = False,
        return_special_tokens_mask: bool = False,
        return_offsets_mapping: bool = False,
        return_length: bool = False,
        verbose: bool = True,
        split_special_tokens: bool = False,
        **kwargs,
    ) -> BatchEncoding:
        batched_input = [(text, text_pair)] if text_pair else [text]
        batched_output = self._batch_encode_plus(
            batched_input,
            is_split_into_words=is_split_into_words,
            add_special_tokens=add_special_tokens,
            padding_strategy=padding_strategy,
            truncation_strategy=truncation_strategy,
            max_length=max_length,
            stride=stride,
            pad_to_multiple_of=pad_to_multiple_of,
            padding_side=padding_side,
            return_tensors=return_tensors,
            return_token_type_ids=return_token_type_ids,
            return_attention_mask=return_attention_mask,
            return_overflowing_tokens=return_overflowing_tokens,
            return_special_tokens_mask=return_special_tokens_mask,
            return_offsets_mapping=return_offsets_mapping,
            return_length=return_length,
            verbose=verbose,
            split_special_tokens=split_special_tokens,
            **kwargs,
        )

        # If no tensor type is requested and no overflow is returned, strip the leading batch axis that
        # `_batch_encode_plus` added.
        if return_tensors is None and not return_overflowing_tokens:
            batched_output = BatchEncoding(
                {
                    key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value)
                    for key, value in batched_output.items()
                },
                batched_output.encodings,
            )

        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)

        return batched_output

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        return (
            self.backend_tokenizer.decoder.decode(tokens)
            if self.backend_tokenizer.decoder is not None
            else " ".join(tokens)
        )

    def _decode(
        self,
        token_ids: Union[int, list[int]],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: Optional[bool] = None,
        **kwargs,
    ) -> str:
        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

        if isinstance(token_ids, int):
            token_ids = [token_ids]
        text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

        clean_up_tokenization_spaces = (
            clean_up_tokenization_spaces
            if clean_up_tokenization_spaces is not None
            else self.clean_up_tokenization_spaces
        )
        if clean_up_tokenization_spaces:
            clean_text = self.clean_up_tokenization(text)
            return clean_text
        return text

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON
        file containing {config + vocab + added-tokens}.
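
        Example of the files typically produced (illustrative; exact names depend on `filename_prefix` and on the
        concrete tokenizer class):

        ```python
        >>> tokenizer.save_pretrained("exported", legacy_format=True)
        >>> # exported/special_tokens_map.json, exported/tokenizer_config.json,
        >>> # exported/added_tokens.json, plus the class-specific vocabulary files
        ```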
        """
        save_directory = str(save_directory)

        if self.slow_tokenizer_class is None and legacy_format is True:
            raise ValueError(
                "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You"
                " might consider leaving the legacy_format at `None` or setting it to `False`."
            )

        save_slow = (
            (legacy_format is None or legacy_format is True)
            and self.slow_tokenizer_class is not None
            and self.can_save_slow_tokenizer
        )
        save_fast = legacy_format is None or legacy_format is False

        if save_slow:
            added_tokens_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
            )
            # only the tokens added on top of the base vocabulary go into added_tokens.json
            added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
            if added_vocab:
                with open(added_tokens_file, "w", encoding="utf-8") as f:
                    out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                    f.write(out_str)

            vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
            file_names = file_names + vocab_files + (added_tokens_file,)

        if save_fast:
            tokenizer_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE
            )
            self.backend_tokenizer.save(tokenizer_file)
            file_names = file_names + (tokenizer_file,)

        return file_names

    def train_new_from_iterator(
        self,
        text_iterator,
        vocab_size,
        length=None,
        new_special_tokens=None,
        special_tokens_map=None,
        **kwargs,
    ):
        """
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.
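
        Example (illustrative sketch; `text_batches` stands for any iterator over batches of strings):

        ```python
        >>> text_batches = (batch for batch in [["some text", "more text"], ["even more text"]])
        >>> new_tokenizer = tokenizer.train_new_from_iterator(text_batches, vocab_size=32000)
        >>> new_tokenizer.save_pretrained("retrained-tokenizer")
        ```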

        """
        # NOTE: condensed reconstruction of the training flow; the upstream implementation additionally forwards
        # BPE-specific trainer options (`continuing_subword_prefix`, `end_of_word_suffix`), rebuilds AddedToken
        # flags for the carried-over special tokens, and fails loudly when a token referenced by the post
        # processor does not exist in the retrained vocabulary.
        tokenizer_json = json.loads(self._tokenizer.to_str())
        added_tokens = tokenizer_json.pop("added_tokens")
        post_processor = tokenizer_json.pop("post_processor")

        # Reset the learned state so training starts from an empty model of the same type
        model_type = tokenizer_json["model"]["type"]
        unk_token = None
        if model_type == "BPE":
            tokenizer_json["model"]["vocab"] = {}
            tokenizer_json["model"]["merges"] = []
        elif model_type == "Unigram" and tokenizer_json["model"]["unk_id"] is not None:
            unk_id = tokenizer_json["model"]["unk_id"]
            unk_token = tokenizer_json["model"]["vocab"][unk_id][0]
            if special_tokens_map is not None and unk_token in special_tokens_map:
                unk_token = special_tokens_map[unk_token]
            tokenizer_json["model"]["unk_id"] = 0
            tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]]
        elif model_type in ["WordLevel", "WordPiece"]:
            tokenizer_json["model"]["vocab"] = {}
        else:
            raise ValueError(
                f"This method does not support this type of tokenizer (found {model_type}) "
                "only BPE, Unigram, WordLevel and WordPiece."
            )
        if (
            special_tokens_map is not None
            and "unk_token" in tokenizer_json["model"]
            and tokenizer_json["model"]["unk_token"] in special_tokens_map
        ):
            tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]]
        tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json))

        # Keep the special tokens of the current tokenizer (renamed through `special_tokens_map` if requested)
        special_tokens = []
        for added_token in added_tokens:
            special = added_token.pop("special", None)
            added_token.pop("id", None)
            if model_type != "Unigram" and not special:
                continue
            if special_tokens_map is not None and added_token["content"] in special_tokens_map:
                added_token["content"] = special_tokens_map[added_token["content"]]
            special_tokens.append(AddedToken(**added_token))
        if new_special_tokens is not None:
            special_tokens.extend(new_special_tokens)

        if model_type == "Unigram" and unk_token is not None:
            kwargs["unk_token"] = unk_token
        if tokenizer_json["pre_tokenizer"] is not None and (
            tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel"
            or (
                tokenizer_json["pre_tokenizer"]["type"] == "Sequence"
                and any(p["type"] == "ByteLevel" for p in tokenizer_json["pre_tokenizer"].get("pretokenizers", []))
            )
        ):
            kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet()

        trainer_class = MODEL_TO_TRAINER_MAPPING[model_type]
        trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs)
        tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer)

        if post_processor is not None:
            # Re-attach the post processor, remapping the referenced special tokens onto their retrained ids
            if "special_tokens" in post_processor:
                for key in post_processor["special_tokens"]:
                    tokens = post_processor["special_tokens"][key]["tokens"]
                    if special_tokens_map is not None:
                        tokens = [special_tokens_map.get(token, token) for token in tokens]
                    post_processor["special_tokens"][key]["tokens"] = tokens
                    post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens]
            for key in ["cls", "sep"]:
                if key in post_processor:
                    token, _ = post_processor[key]
                    if special_tokens_map is not None and token in special_tokens_map:
                        token = special_tokens_map[token]
                    post_processor[key] = [token, tokenizer.token_to_id(token)]
            trained_tokenizer_json = json.loads(tokenizer.to_str())
            trained_tokenizer_json["post_processor"] = post_processor
            tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json))

        # Build the kwargs of the new wrapper from the current one, renaming special tokens where requested
        kwargs = self.init_kwargs.copy()
        special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy()
        special_tokens_list.remove("additional_special_tokens")
        for token_name in special_tokens_list:
            special_token = getattr(self, token_name, None)
            if special_token is not None:
                if special_tokens_map is not None and special_token in special_tokens_map:
                    special_token = special_tokens_map[special_token]
                kwargs[token_name] = special_token
        additional_special_tokens = self.additional_special_tokens
        if new_special_tokens is not None:
            additional_special_tokens.extend(new_special_tokens)
        if len(additional_special_tokens) > 0:
            kwargs["additional_special_tokens"] = additional_special_tokens

        return self.__class__(tokenizer_object=tokenizer, **kwargs)
__module____qualname____firstlineno____doc__VOCAB_FILES_NAMESr   r*   r   __annotations__rc   propertyboolr   r   r   r   dictrm   r   r   rk   r   r0   r   r   r   rU   ru   DecoderFastr   EncodingFastr   r  r   rj   r   r   r   r   r   r   r   r   r  r  r   r   r  r  r  r   r   r   r   r   r+  r0  r6  r>  r   PathLiker\  r  __static_attributes____classcell__)r   s   @rH   r(   r(   Q   s   
 *04-4zx       GC G GA4S> A  tCH~     nd38n n n :d3
?&; : :nc3h n$ F F =   ' ' ' 1504*/+0',#-(-(  (~-(  (~	-(
 $(-( %)-( !%-( -( -( 
tCH~tL11	2-(^UE#x}2D,E U%PSUYZ]U^P^J_ U   7# 7(3- 76d5j+A&B 6]` 6?d ?s ?, GLd3i(?C	sDI~	8uS u uRV umqrumv uI9)I9 0I9 	I9
 I9 %SMI9 smI9` $(,;,F,F2D2T2T$($),0&*(,0404*/+0',#%*+Y`"'OT-0$7H2I4PeKff#
Y`
 !Y` *Y` 0Y` SMY` Y` "Y` %SMY` smY` !Y`  (~Y`  (~Y`  $(!Y`" %)#Y`$ !%%Y`& 'Y`( )Y`* #+Y`, 
-Y`| DH#',;,F,F2D2T2T$($),0&*)-0404*/+0',#%*);I001; E)->">?@; !	;
 *; 0; SM; ; "; %SM; sm; !;  (~;  (~; $(;  %)!;" !%#;$ %;& ';( #);, 
-;z
tCy 
S 
 %*7;	d3i( " '/tn	 
8 )-)-/c2;;.// #J/  ~	/
 "#/ 
s/j rD rDrK   r(   )=r  rS   rs   r   collectionsr   collections.abcr   typingr   r   r   tokenizers.pre_tokenizerspre_tokenizersry   
tokenizersr   r  r	   rU   tokenizers.decodersr
   r  tokenizers.trainersr   r   r   r   r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utilsr   tokenization_utils_baser   r   r   r   r   r   r   r   r   r   utilsr   r   r    
get_loggerr  loggerrR  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILErM  r}  r  r(   rF   rK   rH   <module>r     s   
   	 # $ ' ' 7 / 1 6 ^ ^ : 5 = 3   @ ? 
		H	% "3 / '  (      !!	  (6EXY  ,-HD5 HD .HDrK   