"""Tokenization classes for OpenAI GPT."""

import json
import os
import re
import unicodedata
from typing import Optional

from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
from ...utils import logging


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.json",
    "merges_file": "merges.txt",
}


def whitespace_tokenize(text):
    """Runs basic whitespace cleaning and splitting on a piece of text."""
    text = text.strip()
    if not text:
        return []
    tokens = text.split()
    return tokens


class BasicTokenizer:
    """
    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).

    Args:
        do_lower_case (`bool`, *optional*, defaults to `True`):
            Whether or not to lowercase the input when tokenizing.
        never_split (`Iterable`, *optional*):
            Collection of tokens which will never be split during tokenization. Only has an effect when
            `do_basic_tokenize=True`
        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
            Whether or not to tokenize Chinese characters.

            This should likely be deactivated for Japanese (see this
            [issue](https://github.com/huggingface/transformers/issues/328)).
        strip_accents (`bool`, *optional*):
            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
            value for `lowercase` (as in the original BERT).
        do_split_on_punc (`bool`, *optional*, defaults to `True`):
            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
            the full context of the words, such as contractions.
    """

    def __init__(
        self,
        do_lower_case=True,
        never_split=None,
        tokenize_chinese_chars=True,
        strip_accents=None,
        do_split_on_punc=True,
    ):
        if never_split is None:
            never_split = []
        self.do_lower_case = do_lower_case
        self.never_split = set(never_split)
        self.tokenize_chinese_chars = tokenize_chinese_chars
        self.strip_accents = strip_accents
        self.do_split_on_punc = do_split_on_punc

    def tokenize(self, text, never_split=None):
        """
        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.

        Args:
            never_split (`List[str]`, *optional*)
                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
        """
        # union() returns a new set containing the tokens from both collections.
        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
        text = self._clean_text(text)

        if self.tokenize_chinese_chars:
            text = self._tokenize_chinese_chars(text)
        # prevents treating the same character with different unicode codepoints as different characters
        unicode_normalized_text = unicodedata.normalize("NFC", text)
        orig_tokens = whitespace_tokenize(unicode_normalized_text)
        split_tokens = []
        for token in orig_tokens:
            if token not in never_split:
                if self.do_lower_case:
                    token = token.lower()
                    if self.strip_accents is not False:
                        token = self._run_strip_accents(token)
                elif self.strip_accents:
                    token = self._run_strip_accents(token)
            split_tokens.extend(self._run_split_on_punc(token, never_split))

        output_tokens = whitespace_tokenize(" ".join(split_tokens))
        return output_tokens

    def _run_strip_accents(self, text):
        """Strips accents from a piece of text."""
        text = unicodedata.normalize("NFD", text)
        output = []
        for char in text:
            cat = unicodedata.category(char)
            if cat == "Mn":
                continue
            output.append(char)
        return "".join(output)

    def _run_split_on_punc(self, text, never_split=None):
        """Splits punctuation on a piece of text."""
        if not self.do_split_on_punc or (never_split is not None and text in never_split):
            return [text]
        chars = list(text)
        i = 0
        start_new_word = True
        output = []
        while i < len(chars):
            char = chars[i]
            if _is_punctuation(char):
                output.append([char])
                start_new_word = True
            else:
                if start_new_word:
                    output.append([])
                start_new_word = False
                output[-1].append(char)
            i += 1

        return ["".join(x) for x in output]

    def _tokenize_chinese_chars(self, text):
        """Adds whitespace around any CJK character."""
        output = []
        for char in text:
            cp = ord(char)
            if self._is_chinese_char(cp):
                output.append(" ")
                output.append(char)
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)

    def _is_chinese_char(self, cp):
        """Checks whether CP is the codepoint of a CJK character."""
        # This defines a "chinese character" as anything in the CJK Unicode block.
        # Note that the CJK Unicode block is NOT all Japanese and Korean characters:
        # the modern Korean Hangul alphabet is a different block, as are Japanese
        # Hiragana and Katakana, so those scripts are handled like other languages.
        if (
            (cp >= 0x4E00 and cp <= 0x9FFF)
            or (cp >= 0x3400 and cp <= 0x4DBF)
            or (cp >= 0x20000 and cp <= 0x2A6DF)
            or (cp >= 0x2A700 and cp <= 0x2B73F)
            or (cp >= 0x2B740 and cp <= 0x2B81F)
            or (cp >= 0x2B820 and cp <= 0x2CEAF)
            or (cp >= 0xF900 and cp <= 0xFAFF)
            or (cp >= 0x2F800 and cp <= 0x2FA1F)
        ):
            return True

        return False

    def _clean_text(self, text):
        """Performs invalid character removal and whitespace cleanup on text."""
        output = []
        for char in text:
            cp = ord(char)
            if cp == 0 or cp == 0xFFFD or _is_control(char):
                continue
            if _is_whitespace(char):
                output.append(" ")
            else:
                output.append(char)
        return "".join(output)
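
# Illustrative sketch of the fallback tokenizer above, assuming the default
# constructor arguments (lower casing on, punctuation splitting on):
#
#   >>> BasicTokenizer().tokenize("Hello, WORLD!")
#   ['hello', ',', 'world', '!']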


def get_pairs(word):
    """
    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
    strings)
    """
    pairs = set()
    prev_char = word[0]
    for char in word[1:]:
        pairs.add((prev_char, char))
        prev_char = char
    return pairs


def text_standardize(text):
    """
    fixes some issues the spacy tokenizer had on books corpus also does some whitespace standardization
    """
    text = text.replace("—", "-")
    text = text.replace("–", "-")
    text = text.replace("―", "-")
    text = text.replace("…", "...")
    text = text.replace("´", "'")
    text = re.sub(r"""(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)""", r" \1 ", text)
    text = re.sub(r"\s*\n\s*", " \n ", text)
    text = re.sub(r"[^\S\n]+", " ", text)
    return text.strip()


class OpenAIGPTTokenizer(PreTrainedTokenizer):
    """
    Construct a GPT Tokenizer. Based on Byte-Pair-Encoding with the following peculiarities:

    - lowercases all inputs,
    - uses `SpaCy` tokenizer and `ftfy` for pre-BPE tokenization if they are installed, fallback to BERT's
      `BasicTokenizer` if not.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        merges_file (`str`):
            Path to the merges file.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
        try:
            import ftfy
            from spacy.lang.en import English

            _nlp = English()
            self.nlp = _nlp.tokenizer
            self.fix_text = ftfy.fix_text
        except ImportError:
            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
            self.nlp = BasicTokenizer(do_lower_case=True)
            self.fix_text = None

        with open(vocab_file, encoding="utf-8") as vocab_handle:
            self.encoder = json.load(vocab_handle)
        self.decoder = {v: k for k, v in self.encoder.items()}
        with open(merges_file, encoding="utf-8") as merges_handle:
            merges = merges_handle.read().split("\n")[1:-1]
        merges = [tuple(merge.split()) for merge in merges]
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        self.cache = {}

        super().__init__(unk_token=unk_token, **kwargs)

    @property
    def do_lower_case(self):
        return True

    @property
    def vocab_size(self):
        return len(self.encoder)

    def get_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def bpe(self, token):
        word = tuple(token[:-1]) + (token[-1] + "</w>",)
        if token in self.cache:
            return self.cache[token]
        pairs = get_pairs(word)

        if not pairs:
            return token + "</w>"

        while True:
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    new_word.extend(word[i:])
                    break
                else:
                    new_word.extend(word[i:j])
                    i = j

                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = " ".join(word)
        if word == "\n  </w>":
            word = "\n</w>"
        self.cache[token] = word
        return word

    def _tokenize(self, text):
        """Tokenize a string."""
        split_tokens = []
        if self.fix_text is None:
            # Using BERT's BasicTokenizer as fallback
            text = self.nlp.tokenize(text)
            for token in text:
                split_tokens.extend(list(self.bpe(token).split(" ")))
        else:
            # Using SpaCy & ftfy (the original OpenAI GPT tokenization process)
            text = self.nlp(text_standardize(self.fix_text(text)))
            for token in text:
                split_tokens.extend(list(self.bpe(token.text.lower()).split(" ")))
        return split_tokens

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        return self.encoder.get(token, self.encoder.get(self.unk_token))

    def _convert_id_to_token(self, index):
        """Converts an id in a token (BPE) using the vocab."""
        return self.decoder.get(index, self.unk_token)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (string) in a single string."""
        out_string = "".join(tokens).replace("</w>", " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        merge_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
        )

        with open(vocab_file, "w", encoding="utf-8") as f:
            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")

        index = 0
        with open(merge_file, "w", encoding="utf-8") as writer:
            writer.write("#version: 0.2\n")
            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
                if index != token_index:
                    logger.warning(
                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
                        " Please check that the tokenizer is not corrupted!"
                    )
                    index = token_index
                writer.write(" ".join(bpe_tokens) + "\n")
                index += 1

        return vocab_file, merge_file


__all__ = ["OpenAIGPTTokenizer"]
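

# ---------------------------------------------------------------------------
# Minimal usage sketch, assuming the hub checkpoint "openai-gpt" is reachable
# and that the module is run inside the installed package, e.g. with
# `python -m transformers.models.openai.tokenization_openai`.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

    sample = "Byte-Pair Encoding lowercases everything!"
    tokens = tokenizer.tokenize(sample)  # sub-word strings; word-final pieces end in "</w>"
    ids = tokenizer.convert_tokens_to_ids(tokens)

    print(tokens)
    print(ids)
    print(tokenizer.convert_tokens_to_string(tokens))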