
    <h&                         S r SSKrSSKrSSKJr  SSKJrJr  \" 5       (       a  SSKrSSK	J
r
  SSKJr  \R                  " \5      rSS	0rS
 r " S S5      r " S S\
5      rS/rg)z Tokenization classes for CPMAnt.    N)Optional)is_jieba_availablerequires_backends   )PreTrainedTokenizer)logging
vocab_filez	vocab.txtc                     [         R                  " 5       n[        U SSS9 nUR                  5       nSSS5        [	        W5       H  u  pEUR                  S5      nXAU'   M     U$ ! , (       d  f       N9= f)z*Loads a vocabulary file into a dictionary.rutf-8encodingN
)collectionsOrderedDictopen	readlines	enumeraterstrip)r	   vocabreadertokensindextokens         f/var/www/html/shao/venv/lib/python3.13/site-packages/transformers/models/cpmant/tokenization_cpmant.py
load_vocabr   $   sg    ##%E	j#	0F!!# 
1!&)T"e * L 
1	0s   A%%
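
# Illustrative `load_vocab` behavior (hypothetical file contents, shown only
# as documentation): a `vocab.txt` whose lines are "<pad>", "<unk>", "今天"
# loads as OrderedDict([("<pad>", 0), ("<unk>", 1), ("今天", 2)]).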
A3c                   $    \ rS rSrSS jrS rSrg)WordpieceTokenizer/   c                 (    Xl         X l        X0l        g N)r   	unk_tokenmax_input_chars_per_word)selfr   r"   r#   s       r   __init__WordpieceTokenizer.__init__0   s    
"(@%    c                    [        U5      n[        U5      U R                  :  a  U R                  /$ Sn/ nU[        U5      :  a  [        U5      nS nX5:  a1  SR	                  X#U 5      nXpR
                  ;   a  UnOUS-  nX5:  a  M1  Uc!  UR                  U R                  5        US-  nOUR                  U5        UnU[        U5      :  a  M  U$ )Nr       )listlenr#   r"   joinr   append)r$   r   charsstart
sub_tokensend
cur_substrsubstrs           r   tokenizeWordpieceTokenizer.tokenize5   s    Uu:555NN##
c%j e*CJ+S!12ZZ'!'Jq + !!!$..1
!!*- c%j   r'   )r#   r"   r   N)<unk>   )__name__
__module____qualname____firstlineno__r%   r5   __static_attributes__ r'   r   r   r   /   s    A
r'   r   c            
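
# A quick illustration of the greedy matching above (hypothetical vocab,
# shown only as documentation):
#
#     tok = WordpieceTokenizer({"a": 0, "ab": 1, "abc": 2})
#     tok.tokenize("abcd")  # -> ["abc", "<unk>"]: "abc" is the longest
#                           #    prefix in the vocab, "d" is unknown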


class CpmAntTokenizer(PreTrainedTokenizer):
    r"""
    Construct a CPMAnt tokenizer. Tokenization uses jieba word segmentation followed by a greedy
    longest-match-first lookup (`WordpieceTokenizer`) against the vocabulary.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        bod_token (`str`, *optional*, defaults to `"<d>"`):
            The beginning of document token.
        eod_token (`str`, *optional*, defaults to `"</d>"`):
            The end of document token.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token.
        line_token (`str`, *optional*, defaults to `"</n>"`):
            The token used to represent a line break.
        space_token (`str`, *optional*, defaults to `"</_>"`):
            The token used to represent a space.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]
    add_prefix_space = False

    def __init__(
        self,
        vocab_file,
        bod_token="<d>",
        eod_token="</d>",
        bos_token="<s>",
        eos_token="</s>",
        pad_token="<pad>",
        unk_token="<unk>",
        line_token="</n>",
        space_token="</_>",
        padding_side="left",
        **kwargs,
    ):
        requires_backends(self, ["jieba"])
        self.bod_token = bod_token
        self.eod_token = eod_token
        self.encoder = load_vocab(vocab_file)
        # Remap the placeholder space/newline tokens onto the literal
        # characters so raw text can be looked up directly.
        self.encoder[" "] = self.encoder[space_token]
        self.encoder["\n"] = self.encoder[line_token]

        del self.encoder[space_token]
        del self.encoder[line_token]

        self.encoder = collections.OrderedDict(sorted(self.encoder.items(), key=lambda x: x[1]))
        self.decoder = {v: k for k, v in self.encoder.items()}

        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.encoder, unk_token=unk_token)

        super().__init__(
            bod_token=bod_token,
            eod_token=eod_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            unk_token=unk_token,
            line_token=line_token,
            space_token=space_token,
            padding_side=padding_side,
            **kwargs,
        )

    @property
    def bod_token_id(self):
        return self.encoder[self.bod_token]

    @property
    def eod_token_id(self):
        return self.encoder[self.eod_token]

    @property
    def newline_id(self):
        return self.encoder["\n"]

    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def _tokenize(self, text):
        """Tokenize a string."""
        output_tokens = []
        for x in jieba.cut(text, cut_all=False):
            output_tokens.extend(self.wordpiece_tokenizer.tokenize(x))
        return output_tokens
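
    # Illustrative `_tokenize` flow (the segmentation depends on the installed
    # jieba version; vocab contents are hypothetical):
    #     jieba.cut("今天天气真好", cut_all=False)  # -> ["今天", "天气", "真好"]
    # Each segment is then split by `self.wordpiece_tokenizer`, so segments
    # absent from the vocab fall back to per-character pieces or "<unk>".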
  UPM     nnU Vs/ sH8  oDU R                   :w  d  M  X@R                  :w  d  M%  X@R                  :w  d  M6  UPM:     nn[        TU ]  " U40 UD6$ s  snf s  snf )zDecode ids into a string.r   )pad_token_ideos_token_idbos_token_idr]   _decode)r$   	token_idsr^   irJ   ra   s        r   r   CpmAntTokenizer._decode   s}     )4	1!VQ		4 
 !):):$:AqDUDU?UAZ[_p_pZpAy 	 
 wy3F33	 5
s    A/A/A4A4A4A4c                     XR                   ;   $ r!   rl   r$   r   s     r   checkCpmAntTokenizer.check   s    $$r'   r   c                 $    SR                  U5      $ )Nr)   )r-   )r$   r   s     r   convert_tokens_to_string(CpmAntTokenizer.convert_tokens_to_string   s    wwvr'   c                 ~    U R                   R                  XR                   R                  U R                  5      5      $ )z0Converts a token (str) in an id using the vocab.)rX   getr"   r   s     r   _convert_token_to_id$CpmAntTokenizer._convert_token_to_id   s*    ||||'7'7'GHHr'   c                 L    U R                   R                  XR                  5      $ )z=Converts an index (integer) in a token (str) using the vocab.)r[   r   r"   )r$   r   s     r   _convert_id_to_token$CpmAntTokenizer._convert_id_to_token   s    ||~~66r'   save_directoryfilename_prefixc                 D   [         R                  R                  U5      (       a6  [         R                  R                  X(       a  US-   OS[        S   -   5      nOU(       a  US-   OSU-   nSnSU R
                  ;   a)  U R
                  S   U R
                  S'   U R
                  S	 SU R
                  ;   a)  U R
                  S   U R
                  S'   U R
                  S	 [        R                  " [        U R
                  R                  5       S	 S
95      U l        [        USSS9 nU R
                  R                  5        H>  u  pgXG:w  a  [        R                  SU S35        UnUR                  US-   5        US-  nM@     S S S 5        U4$ ! , (       d  f       U4$ = f)N-r)   r	   r   rF   </_>r   </n>c                     U S   $ rH   r>   rI   s    r   rK   1CpmAntTokenizer.save_vocabulary.<locals>.<lambda>   rM   r'   rN   wr   r   zSaving vocabulary to z\: vocabulary indices are not consecutive. Please check that the vocabulary is not corrupted!r*   )ospathisdirr-   VOCAB_FILES_NAMESrX   r   r   rY   rZ   r   loggerwarningwrite)r$   r   r   r	   r   writerr   token_indexs           r   save_vocabularyCpmAntTokenizer.save_vocabulary   sp   77==((/3!6rUfgsUt tJ 4C/C/n\J$,,#'<<#4DLL S!4<<#'<<#5DLL T""..vdll6H6H6JP^/_`*cG4&*ll&8&8&:"'NN/
| <N N (EUT\*
 '; 5 } 54 }s   'AF
Ftoken_ids_0token_ids_1c                 j    Uc  U R                   /U-   $ U R                   /U-   U R                   /-   U-   $ )a  
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A CPMAnt sequence has the following format:

        - single sequence: `[BOS] Sequence`.

        Args:
            token_ids_0 (`list[int]`): The first tokenized sequence that special tokens will be added.
            token_ids_1 (`list[int]`): The optional second tokenized sequence that special tokens will be added.

        Returns:
            `list[int]`: The model input with special tokens.
        """
        if token_ids_1 is None:
            return [self.bos_token_id] + token_ids_0
        return [self.bos_token_id] + token_ids_0 + [self.bos_token_id] + token_ids_1
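
    # For example, with `bos_token_id == 6` (the actual id depends on the
    # loaded vocab):
    #     build_inputs_with_special_tokens([1, 2])       # -> [6, 1, 2]
    #     build_inputs_with_special_tokens([1, 2], [3])  # -> [6, 1, 2, 6, 3]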

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`): List of IDs.
            token_ids_1 (`list[int]`, *optional*): Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is not None:
            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1))
        return [1] + ([0] * len(token_ids_0))
 (
T , , , , " " !C ! !?4%tCy S I7c HSM ]bcf]g 8 JNU9U3;DI3FU	cU* sx.9.3;DI3F.ko.	c. .r'   r@   )r   r   r   typingr   transformers.utilsr   r   rE   tokenization_utilsr   utilsr   
get_loggerr9   r   r   r   r   r@   __all__r>   r'   r   <module>r      sq    '  	  D  5  
		H	%!;/  @~.) ~.B 
r'   
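
# Minimal usage sketch (illustrative, not part of the upstream module); it
# assumes jieba is installed and that a checkpoint such as
# "openbmb/cpm-ant-10b" ships a compatible vocab.txt:
#
#     from transformers import CpmAntTokenizer
#
#     tokenizer = CpmAntTokenizer.from_pretrained("openbmb/cpm-ant-10b")
#     ids = tokenizer("今天天气真好!")["input_ids"]  # starts with bos_token_id
#     print(tokenizer.decode(ids))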