
    <h7                         S SK r S SKJr  S SKJrJrJr  S SKrSSK	J
r
Jr  SSKJr  SSKJr  \(       a  SSKJr  \R$                  " \5      rS	S
0rSr\" SS9 " S S\5      5       rS/rg)    N)copyfile)TYPE_CHECKINGAnyOptional   )
AddedTokenPreTrainedTokenizer)logging)requires)	TextInput
vocab_fileztokenizer.modelu   ▁)sentencepiece)backendsc            
         ^  \ rS rSrSr\rSS/r          S S\\	\
\4      4U 4S jjjrS rS r\S	 5       rS
 rSSS\\
   4U 4S jjrS rS rS rS rS!S\\
   S\\
   4S jjrS!S jr S"S\\   S\\\      S\S\\   4U 4S jjjr S!S\\   S\\\      S\\   4S jjr  S#S\\   S\S\S\
4S jjrSr U =r!$ )$GemmaTokenizer+   aa
  
Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
no padding token in the original model.

Args:
    vocab_file (`str`):
        Path to the vocabulary file.
    unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
        token instead.
    bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
        The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
    eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
        The end of sequence token.
    pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
        A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
        attention mechanisms or loss computation.
    sp_model_kwargs (`dict[str, Any]`, `Optional`, *optional*):
        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
        to set:

        - `enable_sampling`: Enable subword regularization.
        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

          - `nbest_size = {0,1}`: No sampling is performed.
          - `nbest_size > 1`: samples from the nbest_size results.
          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
            using forward-filtering-and-backward-sampling algorithm.

        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
          BPE-dropout.

    add_bos_token (`bool`, *optional*, defaults to `True`):
        Whether or not to add an `bos_token` at the start of sequences.
    add_eos_token (`bool`, *optional*, defaults to `False`):
        Whether or not to add an `eos_token` at the end of sequences.
    clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
        Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
        extra spaces.
    use_default_system_prompt (`bool`, *optional*, defaults to `False`):
        Whether or not the default system prompt for Gemma should be used.
    spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not to add spaces between special tokens.
	input_idsattention_masksp_model_kwargsc                   > Uc  0 OUU l         [        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUnXl        Xpl        Xl        Xl        [        R                  " S0 U R                   D6U l
        U R                  R                  U5        [        TU ]4  " SUUUUUUUU	U
US.
UD6  g )NFT)
normalizedspecial)
	bos_token	eos_token	unk_token	pad_tokenadd_bos_tokenadd_eos_tokenr   clean_up_tokenization_spacesuse_default_system_promptspaces_between_special_tokens )r   
isinstancestrr   r   r   r   r    spmSentencePieceProcessorsp_modelLoadsuper__init__)selfr   r   r   r   r   r   r   r   r   r    r!   kwargs	__class__s                d/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/gemma/tokenization_gemma.pyr*   GemmaTokenizer.__init__^   s    &5%<r/MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	MWXacfMgMgJyUDImv	$**)B&22JT5I5IJ:& 	
''+)E&?*G	
 	
    c                 ~    U R                   R                  5       nS US'   U R                  R                  5       US'   U$ )Nr'   sp_model_proto)__dict__copyr'   serialized_model_proto)r+   states     r.   __getstate__GemmaTokenizer.__getstate__   s;    ""$ j"&--"F"F"Hr0   c                     U R                   R                  U5        [        R                  " S0 U R                  D6U l        U R
                  R                  U R                  5        g )Nr"   )r3   updater%   r&   r   r'   LoadFromSerializedProtor2   )r+   ds     r.   __setstate__GemmaTokenizer.__setstate__   sG    Q22JT5I5IJ--d.A.ABr0   c                 6    U R                   R                  5       $ )zReturns vocab size)r'   get_piece_size)r+   s    r.   
vocab_sizeGemmaTokenizer.vocab_size   s     }}++--r0   c                     [        U R                  5       Vs0 sH  oR                  U5      U_M     nnUR                  U R                  5        U$ s  snf )zReturns vocab as a dict)rangerA   convert_ids_to_tokensr:   added_tokens_encoder)r+   ivocabs      r.   	get_vocabGemmaTokenizer.get_vocab   sL    ;@;QR;Qa++A.1;QRT../ Ss   Atextr   returnc                 &   > [         TU ]  " U40 UD6$ )zE
Args:
    text: TextInput
Simply calls PreTrainedTokenizer's method
)r)   tokenize)r+   rK   r,   r-   s      r.   rN   GemmaTokenizer.tokenize   s     w///r0   c                 >    U R                   R                  U[        S9$ )zf
Args:
    text: TextInput
Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
)out_type)r'   encoder$   )r+   rK   r,   s      r.   	_tokenizeGemmaTokenizer._tokenize   s     }}##D3#77r0   c                 8    U R                   R                  U5      $ )z0Converts a token (str) in an id using the vocab.)r'   piece_to_id)r+   tokens     r.   _convert_token_to_id#GemmaTokenizer._convert_token_to_id   s    }}((//r0   c                 <    U R                   R                  U5      nU$ )z=Converts an index (integer) in a token (str) using the vocab.)r'   	IdToPiece)r+   indexrW   s      r.   _convert_id_to_token#GemmaTokenizer._convert_id_to_token   s    ''.r0   c                     / nSnU HG  nX@R                   ;   a$  X0R                  R                  U5      U-   -  n/ nM6  UR                  U5        MI     X0R                  R                  U5      -  nU$ )z:Converts a sequence of tokens (string) in a single string. )_added_tokens_encoderr'   decodeappend)r+   tokenscurrent_sub_tokens
out_stringrW   s        r.   convert_tokens_to_string'GemmaTokenizer.convert_tokens_to_string   st    
E222mm223EFNN
%'""))%0  	mm**+=>>
r0   filename_prefixc                    [         R                  R                  U5      (       d  [        R	                  SU S35        g[         R                  R                  X(       a  US-   OS[        S   -   5      n[         R                  R                  U R                  5      [         R                  R                  U5      :w  aG  [         R                  R                  U R                  5      (       a  [        U R                  U5        U4$ [         R                  R                  U R                  5      (       dC  [        US5       nU R                  R                  5       nUR                  U5        SSS5        U4$ U4$ ! , (       d  f       U4$ = f)z
Save the vocabulary and special tokens file to a directory.

Args:
    save_directory (`str`):
        The directory in which to save the vocabulary.

Returns:
    `Tuple(str)`: Paths to the files saved.
zVocabulary path (z) should be a directoryN-r`   r   wb)ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   isfiler   openr'   r5   write)r+   save_directoryri   out_vocab_fileficontent_spiece_models         r.   save_vocabularyGemmaTokenizer.save_vocabulary   s.    ww}}^,,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNSUSZSZSaSabfbqbqSrSrT__n5    00nd+r'+}}'K'K'M$-. ,     	 ,+   s   ?,E99
F	c                     U R                   (       a  U R                  /O/ nU R                  (       a  U R                  /O/ nX1-   U-   nUb
  XS-   U-   U-   nU$ N)r   bos_token_idr   eos_token_idr+   token_ids_0token_ids_1r   r   outputs         r.    build_inputs_with_special_tokens/GemmaTokenizer.build_inputs_with_special_tokens   s\    .2.@.@))*b.2.@.@))*b+l:"*[8<GFr0   r   r   already_has_special_tokensc                   > U(       a  [         TU ]  XSS9$ U R                  (       a  S/O/ nU R                  (       a  S/O/ nUc  US/[	        U5      -  -   U-   $ US/[	        U5      -  -   U-   U-   S/[	        U5      -  -   U-   $ )ad  
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`list[int]`):
        List of IDs.
    token_ids_1 (`list[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
T)r   r   r      r   )r)   get_special_tokens_maskr   r   len)r+   r   r   r   r   r   r-   s         r.   r   &GemmaTokenizer.get_special_tokens_mask   s    $ &72']a 3   #00sb"00sbA3[)9#9:\IIsS%%'  sS%%	'
 	
r0   c                     U R                   (       a  U R                  /O/ nU R                  (       a  U R                  /O/ nS/[	        X1-   U-   5      -  nUb  US/[	        X2-   U-   5      -  -  nU$ )aM  
Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
sequence pair mask has the following format:

```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence    | second sequence |
```

if token_ids_1 is None, only returns the first portion of the mask (0s).

Args:
    token_ids_0 (`list[int]`):
        List of ids.
    token_ids_1 (`list[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `list[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
r   r   )r   r   r   r   r   r   s         r.   $create_token_type_ids_from_sequences3GemmaTokenizer.create_token_type_ids_from_sequences  sv    . /3.@.@))*b.2.@.@))*bs<5DEE"qcC :\ IJJJFr0   	token_idsskip_special_tokensr!   c                 (   / n/ nU H  nU(       a  XpR                   ;   a  M  XpR                  ;   a]  U(       a*  UR                  U R                  R	                  U5      5        UR                  U R                  U   R
                  5        / nM  UR                  U5        M     U(       a*  UR                  U R                  R	                  U5      5        U(       a  SR                  U5      nOSR                  U5      nUR                  [        S5      $ )N r`   )	all_special_ids_added_tokens_decoderrc   r'   rb   contentrr   replaceSPIECE_UNDERLINE)r+   r   r   r!   r,   	sub_textscurrent_sub_textidss           r.   _decodeGemmaTokenizer._decode1  s     	C"s.B.B'B000#$$T]]%9%9:J%KL  !;!;C!@!H!HI#%  '',  T]]112BCD(+I	*I  !1377r0   )r   r   r'   r   r    r   )
z<unk>z<bos>z<eos>z<pad>NTFFFFr   )NF)FF)"__name__
__module____qualname____firstlineno____doc__rs   vocab_files_namesmodel_input_namesr   dictr$   r   r*   r7   r=   propertyrA   rI   listrN   rS   rX   r]   rg   tupler|   r   intboolr   r   r   __static_attributes____classcell__)r-   s   @r.   r   r   +   s   ,\ *$&67
 48%*"'&+(
 "$sCx.1(
 (
TC
 . .0[ 0tCy 080
!x} !X]^aXb !6	 sx#
9#
3;DI3F#
ko#
	c#
 #
L JN93;DI3F	cH %*.3	898 "8 (,	8 
8 8r0   r   )rm   shutilr   typingr   r   r   r   r%   tokenization_utilsr   r	   utilsr
   utils.import_utilsr   tokenization_utils_baser   
get_loggerr   rp   rs   r   r   __all__r"   r0   r.   <module>r      sy   , 
  / /  A  * 4			H	%!#45   
%&`8( `8 '`8F	 
r0   