
    <h                      &   % S r SSKJr  SSKJrJr  SSKJr  \R                  " \	5      r
SrSrSrSrS	rS
rSr\S\S\S\S\S\S0r\\\4   \S'   \R/                  5        V Vs0 sH  u  pX_M	     snn r\\\4   \S'    " S S\5      rS/rgs  snn f )z Tokenization classes for CANINE.    )Optional   )
AddedTokenPreTrainedTokenizer)loggingi   i   i  i  i  i  z[CLS]z[SEP]z[BOS]z[MASK]z[PAD]z
[RESERVED]SPECIAL_CODEPOINTSSPECIAL_CODEPOINTS_BY_NAMEc            
       v  ^  \ rS rSrSr\" \5      \" \5      \" \5      \" \5      \" \5      \" \	5      SS4U 4S jjr
\S\4S j5       rS rS	\S\\   4S
 jrS\S\4S jrS\S\4S jrS r SS\\   S\\\      S\\   4S jjr SS\\   S\\\      S\S\\   4U 4S jjjrSS\S\\   4S jjrSrU =r$ )CanineTokenizer:   a  
Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then
converts each character into its Unicode code point.

[`CanineTokenizer`] inherits from [`PreTrainedTokenizer`].

Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters.

Args:
    model_max_length (`int`, *optional*, defaults to 2048):
            The maximum sentence length the model accepts.
Fi   c	                   > [        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn[        U[        5      (       a  [        USSS9OUn0 U l        [        R                  5        H  u  pXR                  U'   M     U R                  R                  5        VV
s0 sH  u  pX_M	     sn
nU l        [        U l        [        U R                  5      U l
        [        TU ]0  " SUUUUUUUUS.U	D6  g s  sn
nf )NF)lstriprstripT)	bos_token	eos_token	sep_token	cls_token	pad_token
mask_tokenadd_prefix_spacemodel_max_length )
isinstancestrr   _special_codepointsr   items_special_codepoint_stringsUNICODE_VOCAB_SIZE_unicode_vocab_sizelen_num_special_tokenssuper__init__)selfr   r   r   r   r   r   r   r   kwargs	codepointname	__class__s               f/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/canine/tokenization_canine.pyr#   CanineTokenizer.__init__H   sz    JTT]_bIcIcJyuEir	IST]_bIcIcJyuEir	IST]_bIcIcJyuEir	IST]_bIcIcJyuEir	IST]_bIcIcJyuEir	 KUU_adJeJeZ
4Fku
 46 1779OI-6$$T*  :
 483K3K3Q3Q3S;
3SIO3S;
' $6 #&t'?'?#@  
	
!--
	
 
	
;
s   E2returnc                     U R                   $ N)r   )r$   s    r)   
vocab_sizeCanineTokenizer.vocab_sizev   s    '''    c                     [        U R                  5       Vs0 sH  n[        U5      U_M     nnUR                  U R                  5        U$ s  snf r-   )ranger.   chrupdateadded_tokens_encoder)r$   ivocabs      r)   	get_vocabCanineTokenizer.get_vocabz   sE    $)$//$:;$:qQ$:;T../ <s   Atextc                     [        U5      $ )z5Tokenize a string (i.e. perform character splitting).)list)r$   r:   s     r)   	_tokenizeCanineTokenizer._tokenize   s    Dzr0   tokenc                 T     [        U5      $ ! [         a    [        SU S35      ef = f)zaConverts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).zinvalid token: '')ord	TypeError
ValueError)r$   r?   s     r)   _convert_token_to_id$CanineTokenizer._convert_token_to_id   s5    	:u: 	:/wa899	:s   
 'indexc                 x     U[         ;   a	  [         U   $ [        U5      $ ! [         a    [        SU 35      ef = f)z
Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to
human-readable format.
zinvalid id: )r   r3   rC   rD   )r$   rG   s     r)   _convert_id_to_token$CanineTokenizer._convert_id_to_token   sF    
	5**)%00u: 	5|E7344	5s     
  9c                 $    SR                  U5      $ )N )join)r$   tokenss     r)   convert_tokens_to_string(CanineTokenizer.convert_tokens_to_string   s    wwvr0   token_ids_0token_ids_1c                 \    U R                   /nU R                  /nXA-   U-   nUb  XRU-   -  nU$ )a8  
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A CANINE sequence has the following format:

- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`

Args:
    token_ids_0 (`List[int]`):
        List of IDs to which the special tokens will be added.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.

Returns:
    `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
)sep_token_idcls_token_id)r$   rQ   rR   sepclsresults         r)    build_inputs_with_special_tokens0CanineTokenizer.build_inputs_with_special_tokens   sE    &   !  !"S("C''Fr0   already_has_special_tokensc                    > U(       a  [         TU ]  XSS9$ S/S/[        U5      -  -   S/-   nUb  US/[        U5      -  S/-   -  nU$ )ad  
Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
special tokens using the tokenizer `prepare_for_model` method.

Args:
    token_ids_0 (`List[int]`):
        List of IDs.
    token_ids_1 (`List[int]`, *optional*):
        Optional second list of IDs for sequence pairs.
    already_has_special_tokens (`bool`, *optional*, defaults to `False`):
        Whether or not the token list is already formatted with special tokens for the model.

Returns:
    `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
T)rQ   rR   r[      r   )r"   get_special_tokens_maskr    )r$   rQ   rR   r[   rX   r(   s        r)   r^   'CanineTokenizer.get_special_tokens_mask   sn    $ &72']a 3   c+../1#5"sS--!44Fr0   save_directoryfilename_prefixc                     g)Nr   r   )r$   r`   ra   s      r)   save_vocabularyCanineTokenizer.save_vocabulary   s    r0   )r!   r   r   r   r-   )NF)__name__
__module____qualname____firstlineno____doc__r3   CLSSEPPADMASKr#   propertyintr.   r8   r   r<   r=   rE   rI   rO   r   rY   boolr^   rc   __static_attributes____classcell__)r(   s   @r)   r   r   :   s5    c(c(c(c(c(t9,
\ (C ( (
c d3i :# :# :
5# 
5# 
5 JN93;DI3F	c8 sx93;DI3Fko	c :c HSM  r0   r   N)ri   typingr   tokenization_utilsr   r   utilsr   
get_loggerre   loggerr   rl   rj   rk   BOSrm   RESERVEDr   dictro   r   __annotations__r   r	   r   __all__)r&   r'   s   00r)   <module>r}      s    '  A  
		H	%    (l& DcN   VhUmUmUo-pUo/)doUo-p DcN pX) Xv 
} .qs   #B