o
    
sh                     @   s  d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
mZmZmZ ddlmZmZmZ ddlmZmZmZmZ dd	lmZ eeZdddZdedefddZdd Z G dd dZ!G dd de!Z"dedefddZ#G dd dZ$G dd de$Z%G dd de$Z&G d d! d!e$Z'G d"d# d#e$Z(G d$d% d%e$Z)G d&d' d'e$Z*G d(d) d)e$Z+G d*d+ d+e$Z,G d,d- d-e$Z-G d.d/ d/e$Z.G d0d1 d1e$Z/G d2d3 d3e$Z0G d4d5 d5e0Z1G d6d7 d7e0Z2G d8d9 d9e0Z3G d:d; d;e0Z4G d<d= d=e0Z5G d>d? d?e0Z6G d@dA dAe0Z7G dBdC dCe0Z8G dDdE dEe0Z9G dFdG dGe0Z:G dHdI dIe0Z;G dJdK dKe0Z<G dLdM dMe0Z=G dNdO dOe0Z>G dPdQ dQe0Z?G dRdS dSe0Z@G dTdU dUe$ZAG dVdW dWe0ZBG dXdY dYe$ZCG dZd[ d[e$ZDG d\d] d]e$ZEG d^d_ d_e0ZFG d`da dae0ZGG dbdc dce0ZHG ddde dee$ZIG dfdg dge0ZJG dhdi die0ZKdjdk ZLG dldm dmZMi dne1doe-dpe2dqe%dreBdseEdte3dueCdve*dwe%dxe/dye4dze%d{e%d|e%d}e%d~e%i de1de'de*de+de%de%de-de9de-de-de%deIde5de6de(de%de-i de7de)de>de,de%de;de<de%de-de.de8de%de?de@deAde9de:e&eFeHeHeGeHdZNdde	fddZOdS )z
Utilities to convert slow tokenizers in their fast tokenizers counterparts.

All the conversions are grouped here to gather SentencePiece dependencies outside of the fast tokenizers files and
allow to make our dependency on SentencePiece optional.
    N)Optional)version)
AddedTokenRegex	Tokenizerdecodersnormalizerspre_tokenizers
processors)BPEUnigram	WordPiece   )is_protobuf_availableis_sentencepiece_availableloggingrequires_backends)PROTOBUF_IMPORT_ERROR c                 C   sj   t  rddlm} |S t r.dd l}t|jjtdk r&ddl	m} |S ddl	m
} |S tt| )Nr   )sentencepiece_model_pb2z4.0.0)sentencepiece_model_pb2_new)r   sentencepiecer   r   google.protobufr   parseprotobuf__version__transformers.utilsr   ImportErrorr   format)error_messager   google r!   a/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/transformers/convert_slow_tokenizer.pyimport_protobuf#   s   r#   add_prefix_spacereturnc                 C   s$   | rd}t |ddsd}|S d}|S )NalwayslegacyTfirstnever)getattr)r$   original_tokenizerprepend_schemer!   r!   r"   _get_prepend_scheme4   s   r-   c           
         s   |d u}|r
t |n }g }| D ]<\}}g }tdt|D ]}|d | ||d  }}	| v r>|	 v r>|||	|f qt| fddd}|| qt|dd |d}dd |D }|S )	Nr   c                        | d   | d  fS Nr   r   r!   xvocabr!   r"   <lambda>I       z!generate_merges.<locals>.<lambda>keyc                 S   s   | d t | d t | d fS )N   r   r   )lenvalr!   r!   r"   r4   L   s    r7   reversec                 S   s   g | ]
}|d  |d fqS r   r   r!   .0r;   r!   r!   r"   
<listcomp>M       z#generate_merges.<locals>.<listcomp>)dictitemsranger9   appendsortedextend)
r3   vocab_scoresr=   mergesmergepiece_scorelocalindexpiece_lpiece_rr!   r2   r"   generate_merges>   s   rQ   c                   @   sB   e Zd ZdZdefddZd	deeeef e	e f fddZ
dS )
SentencePieceExtractorzl
    Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
    modelc                 C   s.   t | d ddlm} | | _| j| d S )Nr   r   )SentencePieceProcessor)r   r   rT   spLoad)selfrS   rT   r!   r!   r"   __init__V   s   
zSentencePieceExtractor.__init__Nr%   c                    s2   | j   fddt  D }t||}||fS )
        By default will return vocab and merges with respect to their order, by sending `vocab_scores` we're going to
        order the merges with respect to the piece scores instead.
        c                       i | ]}  ||qS r!   id_to_piecer@   rN   rU   r!   r"   
<dictcomp>c   r5   z2SentencePieceExtractor.extract.<locals>.<dictcomp>)rU   rE   GetPieceSizerQ   rW   rI   r3   rJ   r!   r^   r"   extract]   s   
zSentencePieceExtractor.extractN)__name__
__module____qualname____doc__strrX   tuplerC   intlistrb   r!   r!   r!   r"   rR   Q   s    (rR   c                   @   s0   e Zd Zddeeeef ee f fddZdS )GemmaSentencePieceExtractorNr%   c                    sH   | j   fddt  D }d|vr|d|d< t||}||fS )rY   c                    rZ   r!   r[   r]   r^   r!   r"   r_   q   r5   z7GemmaSentencePieceExtractor.extract.<locals>.<dictcomp>	<0x09>)rU   rE   r`   getrQ   ra   r!   r^   r"   rb   k   s   
z#GemmaSentencePieceExtractor.extractrc   )	rd   re   rf   ri   rC   rh   rj   rk   rb   r!   r!   r!   r"   rl   j   s    (rl   piecec                 C   s&   t | dk p| d dkp| d   S )Nr8   ,)r9   isdigit)rp   r!   r!   r"   check_number_comma{   s   &ru   c                   @   s"   e Zd Zdd ZdefddZdS )	Converterc                 C   s
   || _ d S rc   )r+   )rW   r+   r!   r!   r"   rX      s   
zConverter.__init__r%   c                 C   s   t  rc   )NotImplementedErrorrW   r!   r!   r"   	converted   s   zConverter.convertedN)rd   re   rf   rX   r   ry   r!   r!   r!   r"   rv      s    rv   c                   @      e Zd ZdefddZdS )BertConverterr%   c           
      C      | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )N	unk_tokenFbasic_tokenizerT
clean_texthandle_chinese_charsstrip_accents	lowercase:0 $A:0 :0:0 $B:1 :1singlepairspecial_tokens##prefixr+   r3   r   r   rh   r~   hasattrr   tokenize_chinese_charsr   do_lower_caser   BertNormalizer
normalizerr	   BertPreTokenizerpre_tokenizer	cls_token	sep_tokencls_token_idsep_token_idr
   TemplateProcessingpost_processorr   decoder
rW   r3   	tokenizerr   r   r   clssepr   r   r!   r!   r"   ry      :   



zBertConverter.convertedNrd   re   rf   r   ry   r!   r!   r!   r"   r{          r{   c                   @   rz   )SplinterConverterr%   c              
   C   sZ  | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}t| j j}d}	| j j}
| j j}| j j}| j d}| j jdkrx| d| d	|	 d	| d
| d
}n| d| d
| d	|	 d	| d
}tj| d| d|||
f||f||f|	|fgd|_tjdd|_|S )Nr}   Fr   Tr   .rightr    r   r   r   r   r   r   )r+   r3   r   r   rh   r~   r   r   r   r   r   r   r   r   r	   r   r   r   r   question_tokenr   r   question_token_idconvert_tokens_to_idspadding_sider
   r   r   r   r   )rW   r3   r   r   r   r   r   r   questiondotr   r   r   dot_token_idr   r!   r!   r"   ry      sL   



$"
zSplinterConverter.convertedNr   r!   r!   r!   r"   r      r   r   c                   @   rz   )FunnelConverterr%   c           
      C   r|   )Nr}   Fr   Tr   z:2 $A:0 r   r   r   r   r   r   r   r   r!   r!   r"   ry      r   zFunnelConverter.convertedNr   r!   r!   r!   r"   r      r   r   c                   @   rz   )MPNetConverterr%   c           
   
   C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	| d
||f||	fgd|_tjdd|_|S )Nr}   Fr   Tr   r   r   z:0 r   r   r   r   r   r   r   r!   r!   r"   ry     s:   



zMPNetConverter.convertedNr   r!   r!   r!   r"   r     r   r   c                   @   rz   )OpenAIGPTConverterr%   c              	   C   s   | j j}t| j j }| j j}tt||d t|ddd}|	t|d ur/|
t|g tjdd|_t |_tjdd|_|S )N</w>F)r3   rJ   dropoutr~   end_of_word_suffixfuse_unkT)r   suffix)r+   encoderrk   	bpe_rankskeysr~   r   r   rh   token_to_idadd_special_tokensr   r   r   r	   r   r   r   
BPEDecoderr   rW   r3   rJ   r~   r   r!   r!   r"   ry   /  s&   
zOpenAIGPTConverter.convertedNr   r!   r!   r!   r"   r   .  r   r   c                	   @   B   e Zd Z	ddeeeef  deeeeef   de	fddZ
dS )GPT2ConverterNr3   rJ   r%   c              	   C   s   |s| j j}|st| j j}tt||d dddd}t| j dd}tj|d|_	t
 |_t| j ddrP| j j}| j j}tj| d| d||fgd	|_|S tjdd
|_|S )Nr   Fr3   rJ   r   continuing_subword_prefixr   r   r$   r$   add_bos_tokenz:0 $A:0z:0 $A:0 $B:1r   trim_offsets)r+   r   rk   r   r   r   r*   r	   	ByteLevelr   r   r   	bos_tokenbos_token_idr
   r   r   )rW   r3   rJ   r   r$   bosr   r!   r!   r"   ry   J  s:   
zGPT2Converter.convertedNNrd   re   rf   r   rC   rh   rj   rk   ri   r   ry   r!   r!   r!   r"   r   I      r   c                   @   rz   )HerbertConverterr%   c                 C   s   d}d}| j j}t| j j }||d d v r|dd  }tt||d | j j|d}tj	ddd|_
t |_tj|d|_tj| j j| j jf| j j| j jfd	|_|S )
Nz	#version:r   r   r   )r   r~   r   F)r   r   r   )r   r   )r+   r   rk   r   r   r   r   r~   r   r   r   r	   r   r   r   r   r   r
   BertProcessingr   r   r   r   r   )rW   tokenizer_info_strtoken_suffixr3   rJ   r   r!   r!   r"   ry   r  s.   

zHerbertConverter.convertedNr   r!   r!   r!   r"   r   q  r   r   c                	   @   r   )Qwen2ConverterNr3   rJ   r%   c                 C   s   |s| j j}|st| j j }tt||d d ddddd}t |_	t
t
jtddddt
jt| j ddddg|_t |_tjdd	|_|S )
Nr   F)r3   rJ   r   r~   r   r   r   byte_fallbackzn(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+isolatedbehaviorinvertr$   r$   	use_regexr   )r+   r   rk   r   r   r   r   r   NFCr   r	   SequenceSplitr   r   r*   r   r   r   r
   r   )rW   r3   rJ   r   r!   r!   r"   ry     sD   

zQwen2Converter.convertedr   r   r!   r!   r!   r"   r     r   r   c                   @   rz   )RobertaConverterr%   c              	   C   sv   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tj|j|jf|j|jf|j	dd|_|S )Nr   Fr   r   Tr   r   r$   r   )r+   r   rk   r   r   r   r   r	   r   r$   r   r   r   r
   RobertaProcessingr   r   r   r   r   rW   otr3   rJ   r   r!   r!   r"   ry     s,   


zRobertaConverter.convertedNr   r!   r!   r!   r"   r     r   r   c                   @   rz   )RoFormerConverterr%   c           
      C   s   ddl m} | jj}tt|t| jjd}d}d}t| jdr*| jj	j
}| jj	j}tjdd||d|_tj|||_t| jj}t| jj}| jj}| jj}	tj| d| d	| d| d
| d||f||	fgd|_tjdd|_|S )Nr   )JiebaPreTokenizerr}   Fr   Tr   r   r   r   r   r   r   r   )"models.roformer.tokenization_utilsr   r+   r3   r   r   rh   r~   r   r   r   r   r   r   r   r	   PreTokenizercustomr   r   r   r   r   r
   r   r   r   r   )
rW   r   r3   r   r   r   r   r   r   r   r!   r!   r"   ry     s8   

zRoFormerConverter.convertedNr   r!   r!   r!   r"   r     r   r   c                   @   rz   )DebertaConverterr%   c              	   C   s~   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjddd| j dfd| j dfgd	|_|S )
Nr   Fr   r   [CLS]:0 $A:0 [SEP]:0![CLS]:0 $A:0 [SEP]:0 $B:1 [SEP]:1[CLS][SEP]r   )r+   r   rk   r   r   r   r   r	   r   r$   r   r   r   r
   r   r   r   r   r!   r!   r"   ry     s.   
	zDebertaConverter.convertedNr   r!   r!   r!   r"   r     r   r   c                       sn   e Zd ZdZeZi Z fddZdd Zdd Z	dd	 Z
d
d Zdd Zdd Zdd ZdefddZ  ZS )SpmConverterFc                    s   t | d t j|  t }| }t| jjd}||	  W d    n1 s+w   Y  || _
| j
jjrB| jsDtd d S d S d S )Nr   rba  The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option which is not implemented in the fast tokenizers. In practice this means that the fast version of the tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these unknown tokens into a sequence of byte tokens matching the original piece of text.)r   superrX   r#   
ModelProtoopenr+   
vocab_fileParseFromStringreadprototrainer_specr   handle_byte_fallbackwarningswarn)rW   args	model_pb2mf	__class__r!   r"   rX   &  s   
zSpmConverter.__init__c                 C      dd |j D S )Nc                 S      g | ]}|j |jfqS r!   rp   scorer@   rp   r!   r!   r"   rA   <  r5   z&SpmConverter.vocab.<locals>.<listcomp>piecesrW   r   r!   r!   r"   r3   ;     zSpmConverter.vocabc                 C   s   |j jS rc   )r   unk_idr  r!   r!   r"   r
  >     zSpmConverter.unk_idc           	   	      s   |j j} |}|dkrtt| | jd}n-|dkrD  jj	
|\}}dd t|D }tt|||j jd jd d}ntd fd	d
t|jD }|dd
 t|dd dD  |S )Nr   r
  r   r8   c                 S   s   i | ]	\}\}}||qS r!   r!   )r@   iwordr  r!   r!   r"   r_   P      z*SpmConverter.tokenizer.<locals>.<dictcomp>T)r~   r   r   r   z]You're trying to run a `Unigram` model but you're file was trained with a different algorithmc                    8   g | ]\}}|j d v r||j|j dkp|j jv fqS )      r  typerp   r   r@   idprx   r!   r"   rA   e  
    
z*SpmConverter.tokenizer.<locals>.<listcomp>c                 S   s    g | ]\}}}t |d |dqS )F
normalizedspecialr   r@   r  tokenr  r!   r!   r"   rA   k  s    c                 S      | d S Nr   r!   r0   r!   r!   r"   r4   m      z(SpmConverter.tokenizer.<locals>.<lambda>r6   )r   
model_typer3   r   r   r
  r   SpmExtractorr+   r   rb   	enumerater   	unk_piece	Exceptionr  
add_tokensrG   )	rW   r   r#  rI   r   _rJ   	bpe_vocabspm_added_tokensr!   rx   r"   r   A  sF   

zSpmConverter.tokenizerc                 C   sJ   |j j}tjdddttddg}|st|S tt|g| S )NFT)leftr    {2,}   ▁)normalizer_specprecompiled_charsmapr   StripReplacer   r   PrecompiledrW   r   r0  _normalizersr!   r!   r"   r   s  s   
zSpmConverter.normalizerc                 C      t || j}tj||dS Nreplacementr,   )r-   r+   r	   	MetaspacerW   r9  r$   r,   r!   r!   r"   r   ~     zSpmConverter.pre_tokenizerc                 C      d S rc   r!   rx   r!   r!   r"   r        zSpmConverter.post_processorc                 C   r6  r7  )r-   r+   r   r:  r;  r!   r!   r"   r     r<  zSpmConverter.decoderr%   c                 C   s   |  | j}| | j}|d ur||_d}d}t| jdr!| jj}| ||}|d ur.||_| |||_|  }|r>||_|S )Nr.  Tr$   )	r   r   r   r   r+   r$   r   r   r   )rW   r   r   r9  r$   r   r   r!   r!   r"   ry     s    zSpmConverter.converted)rd   re   rf   r   rR   r$  r   rX   r3   r
  r   r   r   r   r   r   ry   __classcell__r!   r!   r   r"   r   !  s    2r   c                   @   $   e Zd Zdd Zdd Zdd ZdS )AlbertConverterc                 C   r  )Nc                 S   2   g | ]}t |jr|j|jfn|j|jd  fqS d   ru   rp   r  r  r!   r!   r"   rA         $z)AlbertConverter.vocab.<locals>.<listcomp>r  r  r!   r!   r"   r3        zAlbertConverter.vocabc                 C      t ddt ddg}| jjs|t   |t   | jjr)|t   |j	j
}|r7|t | |t tdd t |S Nz``"z''r-  r   r   r2  r+   keep_accentsrF   NFKDStripAccentsr   	Lowercaser/  r0  r3  r   r   rW   r   list_normalizersr0  r!   r!   r"   r        


zAlbertConverter.normalizerc                 C   ,   t jddd| jdfd| jdfgdS Nr   r   r   r   r   r
   r   r+   r   rx   r!   r!   r"   r        zAlbertConverter.post_processorNrd   re   rf   r3   r   r   r!   r!   r!   r"   rA        rA  c                   @      e Zd Zdd Zdd ZdS )BarthezConverterc                 C      d}|S Nr  r!   rW   r   r
  r!   r!   r"   r
       zBarthezConverter.unk_idc                 C   rS  Nz<s> $A </s>z<s> $A </s> </s> $B </s><s></s>r   rU  rx   r!   r!   r"   r     rV  zBarthezConverter.post_processorN)rd   re   rf   r
  r   r!   r!   r!   r"   rZ    s    rZ  c                   @   r@  )CamembertConverterc                 C   2   g d}|dd |j dd  D 7 }|dg7 }|S )N))z
<s>NOTUSED        <pad>rd  )z</s>NOTUSEDrd  z<unk>rd  )z<unk>NOTUSEDic                 S   r  r!   r  r  r!   r!   r"   rA     r5   z,CamembertConverter.vocab.<locals>.<listcomp>r   z<mask>rd  r  rW   r   r3   r!   r!   r"   r3     s   
zCamembertConverter.vocabc                 C      dS r\  r!   r  r!   r!   r"   r
       zCamembertConverter.unk_idc                 C   rS  r_  rU  rx   r!   r!   r"   r     rV  z!CamembertConverter.post_processorNrd   re   rf   r3   r
  r   r!   r!   r!   r"   rb    s    rb  c                   @   r@  )DebertaV2Converterc                 C   sH   g }| j jr|tjdd t|| j }|tj||d t|S )Nr   )r   r8  )r+   split_by_punctrF   r	   Punctuationr-   r:  r   )rW   r9  r$   list_pretokenizersr,   r!   r!   r"   r     s   
z DebertaV2Converter.pre_tokenizerc                 C   sd   g }| j jr|t  |t  |jj}|r"|t| |t	t
dd t|S )Nr-  r   )r+   r   rF   r   rO  r1  r/  r0  r3  r2  r   r   rP  r!   r!   r"   r     s   
zDebertaV2Converter.normalizerc                 C   rS  rT  rU  rx   r!   r!   r"   r   
  rV  z!DebertaV2Converter.post_processorN)rd   re   rf   r   r   r   r!   r!   r!   r"   rm    s    rm  c                   @   r@  )MBartConverterc                 C   >   g d}|dd |j dd  D 7 }|g d7 }|dg7 }|S )Nr`  rd  re  ra  rd  rg  c                 S   r  r!   r  r  r!   r!   r"   rA     r5   z(MBartConverter.vocab.<locals>.<listcomp>r  )ar_ARrd  cs_CZrd  de_DErd  en_XXrd  es_XXrd  et_EErd  fi_FIrd  fr_XXrd  gu_INrd  hi_INrd  it_ITrd  ja_XXrd  kk_KZrd  ko_KRrd  lt_LTrd  lv_LVrd  my_MMrd  ne_NPrd  nl_XXrd  ro_ROrd  ru_RUrd  si_LKrd  tr_TRrd  vi_VNrd  zh_CNrd  rh  r  ri  r!   r!   r"   r3     s
   
zMBartConverter.vocabc                 C   rj  r\  r!   r  r!   r!   r"   r
  <  r>  zMBartConverter.unk_idc                 C   rS  )Nz$A </s> en_XXz$A $B </s> en_XXr}  ra  r   rU  rx   r!   r!   r"   r   ?  rV  zMBartConverter.post_processorNrl  r!   r!   r!   r"   rq    s    &rq  c                   @   r@  )MBart50Converterc                 C   rr  )Nrs  c                 S   r  r!   r  r  r!   r!   r"   rA   R  r5   z*MBart50Converter.vocab.<locals>.<listcomp>r  )4rv  rx  rz  r|  r~  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  )af_ZArd  )az_AZrd  )bn_INrd  )fa_IRrd  )he_ILrd  )hr_HRrd  )id_IDrd  )ka_GErd  )km_KHrd  )mk_MKrd  )ml_INrd  )mn_MNrd  )mr_INrd  )pl_PLrd  )ps_AFrd  )pt_XXrd  )sv_SErd  )sw_KErd  )ta_INrd  )te_INrd  )th_THrd  )tl_XXrd  )uk_UArd  )ur_PKrd  )xh_ZArd  )gl_ESrd  )sl_SIrd  rh  r  ri  r!   r!   r"   r3   K  s
   
zMBart50Converter.vocabc                 C   rj  r\  r!   r  r!   r!   r"   r
  W  r>  zMBart50Converter.unk_idc                 C   rS  )Nzen_XX $A </s>zen_XX $A $B </s>r}  ra  r   rU  rx   r!   r!   r"   r   Z  rV  zMBart50Converter.post_processorNrl  r!   r!   r!   r"   r  J  s    r  c                   @   r@  )NllbConverterc                 C   (   g d}|dd |j dd  D 7 }|S )Nrs  c                 S   r  r!   r  r  r!   r!   r"   rA   m  r5   z'NllbConverter.vocab.<locals>.<listcomp>r  r  ri  r!   r!   r"   r3   f     zNllbConverter.vocabc                 C   rj  r\  r!   r  r!   r!   r"   r
  p  r>  zNllbConverter.unk_idc                 C   rS  )Nzeng_Latn $A </s>zeng_Latn $A $B </s>eng_Latnra  r   rU  rx   r!   r!   r"   r   s  rV  zNllbConverter.post_processorNrl  r!   r!   r!   r"   r  e      
r  c                   @   r@  )SeamlessM4TConverterc                 C   r  )N)re  rg  rt  ru  c                 S   r  r!   r  r  r!   r!   r"   rA     r5   z.SeamlessM4TConverter.vocab.<locals>.<listcomp>r  r  ri  r!   r!   r"   r3     r  zSeamlessM4TConverter.vocabc                 C   s   | j jS rc   )r+   unk_token_idr  r!   r!   r"   r
    r  zSeamlessM4TConverter.unk_idc                 C   rS  )Nz__eng__ $A </s>z__eng__ $A $B </s>__eng__ra  r   rU  rx   r!   r!   r"   r     rV  z#SeamlessM4TConverter.post_processorNrl  r!   r!   r!   r"   r  ~  r  r  c                   @   r@  )XLMRobertaConverterc                 C   rc  )Nrs  c                 S   r  r!   r  r  r!   r!   r"   rA     r5   z-XLMRobertaConverter.vocab.<locals>.<listcomp>r  rh  r  ri  r!   r!   r"   r3     s   
zXLMRobertaConverter.vocabc                 C   r[  r\  r!   r]  r!   r!   r"   r
    r^  zXLMRobertaConverter.unk_idc                 C   rS  r_  rU  rx   r!   r!   r"   r     rV  z"XLMRobertaConverter.post_processorNrl  r!   r!   r!   r"   r        r  c                   @   r@  )XLNetConverterc                 C   r  )Nc                 S   rB  rC  rE  r  r!   r!   r"   rA     rF  z(XLNetConverter.vocab.<locals>.<listcomp>r  r  r!   r!   r"   r3     rG  zXLNetConverter.vocabc                 C   rH  rI  rK  rP  r!   r!   r"   r     rR  zXLNetConverter.normalizerc                 C   rS  )Nz$A:0 <sep>:0 <cls>:2z!$A:0 <sep>:0 $B:1 <sep>:1 <cls>:2z<sep>z<cls>r   rU  rx   r!   r!   r"   r     rV  zXLNetConverter.post_processorNrW  r!   r!   r!   r"   r    rX  r  c                   @      e Zd ZdS )ReformerConverterNrd   re   rf   r!   r!   r!   r"   r        r  c                   @   rY  )RemBertConverterc                 C   s   t ddt ddt tddg}| jjs%|t   |t   | jjr0|t 	  |j
j}|r>|t | t |S rI  )r   r2  r   r+   rL  rF   rM  rN  r   rO  r/  r0  r3  r   rP  r!   r!   r"   r     s   


zRemBertConverter.normalizerc                 C   rS  rT  rU  rx   r!   r!   r"   r     rV  zRemBertConverter.post_processorN)rd   re   rf   r   r   r!   r!   r!   r"   r    s    r  c                   @   r  )BertGenerationConverterNr  r!   r!   r!   r"   r    r  r  c                   @   s,   e Zd Zdd Zdd Zdd Zdd Zd	S )
PegasusConverterc                 C   s   | j jdf| j jdfg}| j jd ur|| j jdfg7 }| j jd ur2| j j| j jk r2|| j jdfg7 }|dd td| j jD 7 }|dd |jdd  D 7 }|S )Nrd  c                 S      g | ]
}d | ddfqS )z<unk_>g      Yr!   r@   r  r!   r!   r"   rA     rB   z*PegasusConverter.vocab.<locals>.<listcomp>r8   c                 S   r  r!   r  r  r!   r!   r"   rA     r5   )	r+   	pad_token	eos_tokenmask_token_sent
mask_tokenmask_token_idoffsetrE   r  ri  r!   r!   r"   r3      s   

zPegasusConverter.vocabc                 C   s   |j j| jj S rc   )r   r
  r+   r  r  r!   r!   r"   r
    r	  zPegasusConverter.unk_idc                 C   s(   t || j}tt tj||dgS r7  )r-   r+   r	   r   WhitespaceSplitr:  r;  r!   r!   r"   r     s   zPegasusConverter.pre_tokenizerc                 C   s0   | j j}|| j jfg}tjd|gdd|g|dS )N$A$Br   )r+   r  eos_token_idr
   r   )rW   eosr   r!   r!   r"   r     s   
zPegasusConverter.post_processorN)rd   re   rf   r3   r
  r   r   r!   r!   r!   r"   r    s
    	r  c                   @   rY  )T5Converterc                 C   s:   | j j}dd |jD }|dd t|d ddD 7 }|S )Nc                 S   r  r!   r  r  r!   r!   r"   rA   *  r5   z%T5Converter.vocab.<locals>.<listcomp>c                 S   r  )z
<extra_id_r  rd  r!   r  r!   r!   r"   rA   +  rB   r   rq   )r+   
_extra_idsr  rE   )rW   r   num_extra_idsr3   r!   r!   r"   r3   (  s   zT5Converter.vocabc                 C   &   t jddgg dd| jdfgdS Nr  ra  )r  ra  r  ra  r   rU  rx   r!   r!   r"   r   .     zT5Converter.post_processorN)rd   re   rf   r3   r   r!   r!   r!   r"   r  '  s    r  c                   @      e Zd Zdd ZdS )UdopConverterc                 C   r  r  rU  rx   r!   r!   r"   r   9  r  zUdopConverter.post_processorNrd   re   rf   r   r!   r!   r!   r"   r  8      r  c                   @   rz   )WhisperConverterr%   c           	   	   C   s   | j j}t| j j }tt||d dddd}tj| j j	d|_
t |_| j j}| j |}| j j}| j j}ddd |D }tj| d| d	| d
| d||fgt||d|_|S )Nr   Fr   r   r   c                 S   s   g | ]}| d qS )r   r!   r@   r  r!   r!   r"   rA   Z  s    z.WhisperConverter.converted.<locals>.<listcomp>z $A:0 r   z $A:0 $B:1 r   r   )r+   r   rk   r   r   r   r   r	   r   r$   r   r   r   prefix_tokensconvert_ids_to_tokensr  r  joinr
   r   zipr   )	rW   r3   rJ   r   prefix_token_idsprefixesr  r  prefix_templater!   r!   r"   ry   D  s8   
	zWhisperConverter.convertedNr   r!   r!   r!   r"   r  C  r   r  c                   @   r  )BigBirdConverterc                 C   rS  rT  rU  rx   r!   r!   r"   r   h  rV  zBigBirdConverter.post_processorNr  r!   r!   r!   r"   r  g  r  r  c                   @   rz   )CLIPConverterr%   c              
   C   s   | j j}t| j j }| j j}tt||d dddt|d}t	
t	 t	tddt	 g|_t
tjtddd	d
tjddg|_t |_tj| j j| j jf| j j| j jfddd|_|S )Nr   r   Fr3   rJ   r   r   r   r   r~   z\s+r   z9's|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+removedTr   r   r   )r+   r   rk   r   r   r~   r   r   rh   r   r   r   r2  r   rO  r   r	   r   r   r   r   r   r
   r   r  r  r   r   r   r   r!   r!   r"   ry   t  sD   


zCLIPConverter.convertedNr   r!   r!   r!   r"   r  s  r   r  c                   @   rz   )LayoutLMv2Converterr%   c           
      C   s   | j j}tt|t| j jd}d}d}d}t| j dr+| j jj}| j jj	}| j jj
}tjd|||d|_t |_t| j j}t| j j}| j j}| j j}	tj| d| d| d| d| d	||f||	fgd
|_tjdd|_|S )Nr}   FTr   r   r   r   r   r   r   r   r   r   r   r!   r!   r"   ry     r   zLayoutLMv2Converter.convertedNr   r!   r!   r!   r"   r    r   r  c                   @   rz   )BlenderbotConverterr%   c              	   C   st   | j }|j}t|j }tt||d dddd}tj|j	d|_
t |_tjd|j d|j|jfgd|_|S )Nr   Fr   r   z$A:0 r   )r   r   )r+   r   rk   r   r   r   r   r	   r   r$   r   r   r   r
   r   r  r  r   r   r!   r!   r"   ry     s*   

zBlenderbotConverter.convertedNr   r!   r!   r!   r"   r    r   r  c                   @   r@  )XGLMConverterc                 C   s4   g d}|dd |j dd  D 7 }|g d7 }|S )Nrs  c                 S   r  r!   r  r  r!   r!   r"   rA     r5   z'XGLMConverter.vocab.<locals>.<listcomp>r  ))z<madeupword0>rd  )z<madeupword1>rd  )z<madeupword2>rd  )z<madeupword3>rd  )z<madeupword4>rd  )z<madeupword5>rd  )z<madeupword6>rd  r  ri  r!   r!   r"   r3     s   zXGLMConverter.vocabc                 C   r[  r\  r!   r]  r!   r!   r"   r
    r^  zXGLMConverter.unk_idc                 C   rS  )Nz</s> $Az</s> $A </s> </s> $Br`  ra  r   rU  rx   r!   r!   r"   r     rV  zXGLMConverter.post_processorNrl  r!   r!   r!   r"   r    r  r  c                   @   sF   e Zd ZdZeZddhZ	 dd Zdd Zdd	 Z	d
d Z
dd ZdS )GemmaConverterTz<start_of_turn>z<end_of_turn>c                 C      t ddS Nr   r.  )r   r2  r  r!   r!   r"   r        zGemmaConverter.normalizerc                 C   s|   | j jdf| j jdf| j jdfg}|dd |jdd  D 7 }tdd |D s<tdd t|D d }|d ur<d||< |S )	Nrd  c                 S   r  r!   r  r  r!   r!   r"   rA     r5   z(GemmaConverter.vocab.<locals>.<listcomp>r  c                 s   s    | ]	}|d  dkV  qdS )r   rm   Nr!   )r@   r1   r!   r!   r"   	<genexpr>  s    z'GemmaConverter.vocab.<locals>.<genexpr>c                 s   s$    | ]\}}|d  dkr|V  qdS )r   rn   Nr!   )r@   r  r1   r!   r!   r"   r    s   " )rm   rd  )r+   r  r  r   r  anynextr%  )rW   r   r3   override_indexr!   r!   r"   r3     s   


zGemmaConverter.vocabc                 C   r  )Nr   merged_with_previous)r	   r   rW   r9  r$   r!   r!   r"   r      r  zGemmaConverter.pre_tokenizerc                 C   r[  r\  r!   r]  r!   r!   r"   r
  #  r^  zGemmaConverter.unk_idc                 C   s    t t ddt  t  gS )Nr.  r   )r   r   r2  ByteFallbackFuser  r!   r!   r"   r   '  s   
zGemmaConverter.decoderN)rd   re   rf   r   rl   r$  r   r   r3   r   r
  r   r!   r!   r!   r"   r    s    
r  c                   @   s@   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dS )LlamaConverterTc                 C   sN   | j ddf| j ddf| j ddfg}|dd |jdd  D 7 }|S )Nr   rd  r   r8   c                 S   r  r!   r  r  r!   r!   r"   rA   :  r5   z(LlamaConverter.vocab.<locals>.<listcomp>r  )r+   r  r  ri  r!   r!   r"   r3   4  s   zLlamaConverter.vocabc                 C   r[  r!  r!   r]  r!   r!   r"   r
  =  r^  zLlamaConverter.unk_idc                 C   <   t ddt  t  g}|r|t jdddg7 }t |S Nr.  r   r   )contentr,  r   r2  r  r	  r1  r   rW   r9  r$   sequencer!   r!   r"   r   A     

zLlamaConverter.decoderc                 C   sT   t | jddr(g }t | jddr|tjddg7 }|tjdddg7 }t|S d S )Nr'   Tr$   r.  )prependr   )patternr  )r*   r+   r   Prependr2  r   )rW   r   r  r!   r!   r"   r   K  s   
zLlamaConverter.normalizerc                 C   s.   t | jddst|| j}tj||ddS d S )Nr'   TFr9  r,   split)r*   r+   r-   r	   r:  r;  r!   r!   r"   r   T  s   zLlamaConverter.pre_tokenizerc                 C   r=  rc   r!   rx   r!   r!   r"   r   Z  rk  zLlamaConverter.post_processorN)
rd   re   rf   r   r3   r
  r   r   r   r   r!   r!   r!   r"   r
  1  s    	
	r
  c                   @   rz   )MarkupLMConverterr%   c           	   
   C   s   | j }|j}t|j }tt||d ddd| j jd}tj	|j
d|_t	 |_t| j j}t| j j}| j j}| j j}tj| d| | d| d| ||f||fgd|_|S )Nr   Fr  r   z $A z $B r   )r+   r   rk   r   r   r   r   r~   r	   r   r$   r   r   r   rh   r   r   r   r   r
   r   r   )	rW   r   r3   rJ   r   r   r   r   r   r!   r!   r"   ry   `  s8   
	zMarkupLMConverter.convertedNr   r!   r!   r!   r"   r  _  r   r  c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )MoshiConverterTNc                 K   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S Nr   r   	r   rv   rX   r#   r   r   r   r   r   )rW   r   model_max_lengthkwargsr   r   r   r!   r!   r"   rX     s   

zMoshiConverter.__init__c                 C   s:   |j j}tddg}|st|S tt|g| S r   )r/  r0  r   r2  r   r3  r4  r!   r!   r"   r     s   

zMoshiConverter.normalizerc                 C   r  r  r  r  r!   r!   r"   r     r  zMoshiConverter.decoderc                 C   s   d}t j||ddS )Nr(   Fr  )r	   r:  r;  r!   r!   r"   r     s   zMoshiConverter.pre_tokenizerrc   )rd   re   rf   r   rX   r   r   r   r!   r!   r!   r"   r    s    


r  c                   @   sR   e Zd ZdZdddZdd Zdd Zd	d
 Zdd Zdd Z	dd Z
dd ZdS )HeliumConverterTNc                 G   sf   t | d t| | t }| }t|d}||  W d    n1 s)w   Y  || _d S r  r  )rW   r   r   r   r   r   r!   r!   r"   rX     s   

zHeliumConverter.__init__c                    s     |}tt| | jd} fddt|jD }|dd t|dd dD  |t	dd	d	d
g |j
ddd |S )Nr  c                    r  r  r  r  rx   r!   r"   rA     r  z-HeliumConverter.tokenizer.<locals>.<listcomp>c                 S   s"   g | ]\}}}t |d |ddqS )FT)r  r  single_wordr  r  r!   r!   r"   rA     s    c                 S   r   r!  r!   r0   r!   r!   r"   r4     r"  z+HeliumConverter.tokenizer.<locals>.<lambda>r6   
Fr  rf  r  )r  pad_id)r3   r   r   r
  r   r%  r  r(  rG   r   enable_padding)rW   r   rI   r   r+  r!   rx   r"   r     s&   

zHeliumConverter.tokenizerc                 C   sB   g }|j D ]}|jdkr|d|jfg7 }q||j|jfg7 }q|S )Nz<0x0A>r  )r  rp   r  )rW   r   r3   rp   r!   r!   r"   r3     s   

zHeliumConverter.vocabc                 C   r[  r!  r!   r]  r!   r!   r"   r
    r^  zHeliumConverter.unk_idc                 C   s8   t ddt  t  g}|t jdddg7 }t |S r  r  r  r!   r!   r"   r     s   

zHeliumConverter.decoderc                 C   s   t t dt ddgS r   )r   r   r  r2  r  r!   r!   r"   r     s   zHeliumConverter.normalizerc                 C   s   t t ddgS )Nr  
contiguous)r	   r   r   r  r!   r!   r"   r     s   zHeliumConverter.pre_tokenizerc                 C   s   t jddgg ddgdS )Nr`  r  )r`  r  r`  r  )r`  r   r   )r
   r   rx   r!   r!   r"   r     s   zHeliumConverter.post_processorrc   )rd   re   rf   r   rX   r   r3   r
  r   r   r   r   r!   r!   r!   r"   r    s    
		r  c                  C   s   t ttdtdd t ttdtdd  t ttdtdd  } | dd }d	}td
D ]}|| vrI| | |d
|  |d7 }q3dd |D }tt| |S )a8  
    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
    characters the bpe code barfs on.

    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
    tables between utf-8 bytes and unicode strings.
    !~r      ¡   ¬   ®   ÿNr      c                 S   s   g | ]}t |qS r!   )chr)r@   nr!   r!   r"   rA     s    z$bytes_to_unicode.<locals>.<listcomp>)rk   rE   ordrF   rC   r  )bscsr+  br!   r!   r"   bytes_to_unicode  s   L
r0  c                       sN   e Zd ZdZ				d fdd	Zdefdd	Zd
d ZdefddZ	  Z
S )TikTokenConverterz'
    A general tiktoken converter.
    Ns(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+Fc                    s@   t  j|  || _|| _|| _t|tr| | _d S || _d S rc   )	r   rX   r   r  r$   
isinstancerC   r   additional_special_tokens)rW   r   r  r$   r4  r   r  r   r!   r"   rX   %  s   	zTikTokenConverter.__init__tiktoken_urlc                    s   zddl m} W n ty   tdw || t fddg }i }  D ]P\}}|||< t|dkr:q)g }tdt|D ]%}|d | ||d  }	}
|	 v rh|
 v rh|	|
  v rh||	|
|f qCt	| fddd	d
}|
| q)t	|dd d	d
}fdd|D }||fS )Nr   )load_tiktoken_bpezY`tiktoken` is required to read a `tiktoken` file. Install it with `pip install tiktoken`.c                    s   d  fdd| dD S )Nr   c                    s   g | ]} t | qS r!   )r,  )r@   charbyte_encoderr!   r"   rA   D  r5   zdTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_string.<locals>.<listcomp>zlatin-1)r  decode)r/  r8  r!   r"   token_bytes_to_stringC  s   zPTikTokenConverter.extract_vocab_merges_from_model.<locals>.token_bytes_to_stringr   c                    r.   r/   r!   r0   )r   r!   r"   r4   Q  r5   zCTikTokenConverter.extract_vocab_merges_from_model.<locals>.<lambda>Fr<   c                 S   r   )Nr8   r!   r:   r!   r!   r"   r4   S  r"  c                    s$   g | ]} |d   |d fqS r>   r!   r?   )r;  r!   r"   rA   T  s   $ zETikTokenConverter.extract_vocab_merges_from_model.<locals>.<listcomp>)tiktoken.loadr6  r'  
ValueErrorr0  rD   r9   rE   rF   rG   rH   )rW   r5  r6  rJ   r3   r  rankrM   rN   rO   rP   r!   )r   r9  r;  r"   extract_vocab_merges_from_model8  s6   z1TikTokenConverter.extract_vocab_merges_from_modelc                 C   s:   |  | j\}}tt||dd}t|jdrd|j_|S )NF)r   ignore_mergesT)r?  r   r   r   r   rS   r@  )rW   rI   rJ   r   r!   r!   r"   r   W  s
   zTikTokenConverter.tokenizerr%   c                 C   sh   |   }ttjt| jdddtj| jddg|_t	 |_
|dd | jD  tjdd|_|S )Nr   Fr   r   c                 S   s   g | ]	}t |d ddqS )FTr  r  r  r!   r!   r"   rA   i  r  z/TikTokenConverter.converted.<locals>.<listcomp>r   )r   r	   r   r   r   r  r   r$   r   r   r   r   r4  r
   r   )rW   r   r!   r!   r"   ry   ^  s   
zTikTokenConverter.converted)Nr2  FN)rd   re   rf   rg   rX   rh   r?  r   r   ry   r?  r!   r!   r   r"   r1     s    r1  AlbertTokenizerBartTokenizerBarthezTokenizerBertTokenizerBigBirdTokenizerBlenderbotTokenizerCamembertTokenizerCLIPTokenizerCodeGenTokenizerConvBertTokenizerDebertaTokenizerDebertaV2TokenizerDistilBertTokenizerDPRReaderTokenizerDPRQuestionEncoderTokenizerDPRContextEncoderTokenizerElectraTokenizerFNetTokenizerFunnelTokenizerGPT2TokenizerHerbertTokenizerLayoutLMTokenizerLayoutLMv2TokenizerLayoutLMv3TokenizerLayoutXLMTokenizerLongformerTokenizerLEDTokenizerLxmertTokenizerMarkupLMTokenizerMBartTokenizerMBart50TokenizerMPNetTokenizerMobileBertTokenizerMvpTokenizerNllbTokenizerOpenAIGPTTokenizerPegasusTokenizerQwen2TokenizerRealmTokenizerReformerTokenizerRemBertTokenizerRetriBertTokenizerRobertaTokenizerRoFormerTokenizerSeamlessM4TTokenizerSqueezeBertTokenizerT5TokenizerUdopTokenizerWhisperTokenizerXLMRobertaTokenizerXLNetTokenizer)SplinterTokenizerXGLMTokenizerLlamaTokenizerCodeLlamaTokenizerGemmaTokenizerPhi3TokenizerFc                 C   sn   | j j}|tv r|st| }||  S ztd t| j| jd W S  t	y6   t
dtt  w )a  
    Utilities to convert a slow tokenizer instance in a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenization_utils_base.PreTrainedTokenizer`]):
            Instance of a slow tokenizer to convert in the backend tokenizer for
            [`~tokenization_utils_base.PreTrainedTokenizerFast`].
       from_tiktoken (bool, optional): Whether to use the `tiktoken` library to convert the tokenizer instead of sentencepiece.
            Defaults to False.

    Return:
        A instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenization_utils_base.PreTrainedTokenizerFast`]
    zConverting from Tiktoken)r   r4  zConverting from SentencePiece and Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast converters: )r   rd   SLOW_TO_FAST_CONVERTERSry   loggerinfor1  r   r4  r'  r=  rk   r   )transformer_tokenizerfrom_tiktokentokenizer_class_nameconverter_classr!   r!   r"   convert_slow_tokenizer  s&   

r  )r   )F)Prg   r   typingr   	packagingr   
tokenizersr   r   r   r   r   r	   r
   tokenizers.modelsr   r   r   utilsr   r   r   r   utils.import_utilsr   
get_loggerrd   r{  r#   boolrh   r-   rQ   rR   rl   ru   rv   r{   r   r   r   r   r   r   r   r   r   r   r   rA  rZ  rb  rm  rq  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r
  r  r  r  r0  r1  rz  r  r!   r!   r!   r"   <module>   sR  $


'2''(.' %!5% ($+'4.&)ZQ	
 !"#$%&'()*+,-./01234=