
import json
import os
import warnings
from pathlib import Path
from shutil import copyfile
from typing import Any, Optional, Union

import sentencepiece

from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

VOCAB_FILES_NAMES = {
    "source_spm": "source.spm",
    "target_spm": "target.spm",
    "vocab": "vocab.json",
    "target_vocab_file": "target_vocab.json",
    "tokenizer_config_file": "tokenizer_config.json",
}

SPIECE_UNDERLINE = "▁"


@requires(backends=("sentencepiece",))
class MarianTokenizer(PreTrainedTokenizer):
    r"""
    Construct a Marian tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        source_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the source language.
        target_spm (`str`):
            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm extension) that
            contains the vocabulary for the target language.
        source_lang (`str`, *optional*):
            A string representing the source language.
        target_lang (`str`, *optional*):
            A string representing the target language.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        model_max_length (`int`, *optional*, defaults to 512):
            The maximum sentence length the model accepts.
        additional_special_tokens (`list[str]`, *optional*, defaults to `["<eop>", "<eod>"]`):
            Additional special tokens used by the tokenizer.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
                using the forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Examples:

    ```python
    >>> from transformers import MarianForCausalLM, MarianTokenizer

    >>> model = MarianForCausalLM.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> tokenizer = MarianTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
    >>> src_texts = ["I am a small frog.", "Tom asked his teacher for advice."]
    >>> tgt_texts = ["Ich bin ein kleiner Frosch.", "Tom bat seinen Lehrer um Rat."]  # optional
    >>> inputs = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)

    >>> outputs = model(**inputs)  # should work
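
    >>> # A hedged sketch (skipped as a doctest): `sp_model_kwargs` is forwarded to
    >>> # `SentencePieceProcessor`, e.g. to enable subword regularization, in which
    >>> # case sampled segmentations differ from call to call.
    >>> sampling_tokenizer = MarianTokenizer.from_pretrained(
    ...     "Helsinki-NLP/opus-mt-en-de",
    ...     sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
    ... )  # doctest: +SKIP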
    ```
    """

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        source_spm,
        target_spm,
        vocab,
        target_vocab_file=None,
        source_lang=None,
        target_lang=None,
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
        model_max_length=512,
        sp_model_kwargs: Optional[dict[str, Any]] = None,
        separate_vocabs=False,
        **kwargs,
    ) -> None:
        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
        assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"

        self.separate_vocabs = separate_vocabs
        self.encoder = load_json(vocab)
        if str(unk_token) not in self.encoder:
            raise KeyError("<unk> token must be in the vocab")
        assert str(pad_token) in self.encoder

        if separate_vocabs:
            self.target_encoder = load_json(target_vocab_file)
            self.decoder = {v: k for k, v in self.target_encoder.items()}
            self.supported_language_codes = []
        else:
            self.decoder = {v: k for k, v in self.encoder.items()}
            self.supported_language_codes: list = [k for k in self.encoder if k.startswith(">>") and k.endswith("<<")]

        self.source_lang = source_lang
        self.target_lang = target_lang
        self.spm_files = [source_spm, target_spm]

        # load SentencePiece models for pre-processing
        self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
        self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

        self._setup_normalizer()

        super().__init__(
            source_lang=source_lang,
            target_lang=target_lang,
            unk_token=unk_token,
            eos_token=eos_token,
            pad_token=pad_token,
            model_max_length=model_max_length,
            sp_model_kwargs=self.sp_model_kwargs,
            target_vocab_file=target_vocab_file,
            separate_vocabs=separate_vocabs,
            **kwargs,
        )

    def _setup_normalizer(self):
        try:
            from sacremoses import MosesPunctNormalizer

            self.punc_normalizer = MosesPunctNormalizer(self.source_lang).normalize
        except (ImportError, FileNotFoundError):
            warnings.warn("Recommended: pip install sacremoses.")
            self.punc_normalizer = lambda x: x

    def normalize(self, x: str) -> str:
        """Cover moses empty string edge case. They return empty list for '' input!"""
        return self.punc_normalizer(x) if x else ""

    def _convert_token_to_id(self, token):
        return self.current_encoder.get(token, self.current_encoder[self.unk_token])

    def remove_language_code(self, text: str):
        """Remove language codes like >>fr<< before sentencepiece"""
        code = []
        if text.startswith(">>") and (end_loc := text.find("<<")) != -1:
            code.append(text[: end_loc + 2])
            text = text[end_loc + 2 :]
        return code, text

    def _tokenize(self, text: str) -> list[str]:
        code, text = self.remove_language_code(text)
        pieces = self.current_spm.encode(text, out_type=str)
        return code + pieces

    def _convert_id_to_token(self, index: int) -> str:
        """Converts an index (integer) in a token (str) using the decoder."""
        return self.decoder.get(index, self.unk_token)

    def batch_decode(self, sequences, **kwargs):
        r"""
        Convert a list of lists of token ids into a list of strings by calling decode.

        Args:
            sequences (`Union[list[int], list[list[int]], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `list[str]`: The list of decoded sentences.
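
        Example (a hedged sketch, skipped as a doctest, reusing the tokenizer and texts from the class docstring;
        target-side ids such as `labels` are decoded with the target SentencePiece model by default):

        ```python
        >>> batch = tokenizer(src_texts, text_target=tgt_texts, return_tensors="pt", padding=True)
        >>> tokenizer.batch_decode(batch["labels"], skip_special_tokens=True)  # doctest: +SKIP
        ['Ich bin ein kleiner Frosch.', 'Tom bat seinen Lehrer um Rat.']
        ```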
        """
        return super().batch_decode(sequences, **kwargs)

    def decode(self, token_ids, **kwargs):
        r"""
        Converts a sequence of ids into a string, using the tokenizer and vocabulary with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, list[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
            use_source_tokenizer (`bool`, *optional*, defaults to `False`):
                Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence
                problems).
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model specific decode method.

        Returns:
            `str`: The decoded sentence.
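
        Example (a hedged sketch, skipped as a doctest; `use_source_tokenizer=True` decodes with the source-side
        SentencePiece model):

        ```python
        >>> ids = tokenizer("I am a small frog.")["input_ids"]
        >>> tokenizer.decode(ids, skip_special_tokens=True, use_source_tokenizer=True)  # doctest: +SKIP
        'I am a small frog.'
        ```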
        """
        return super().decode(token_ids, **kwargs)

    def convert_tokens_to_string(self, tokens: list[str]) -> str:
        """Uses source spm if _decode_use_source_tokenizer is True, and target spm otherwise"""
        sp_model = self.spm_source if self._decode_use_source_tokenizer else self.spm_target
        current_sub_tokens = []
        out_string = ""
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                out_string += sp_model.decode_pieces(current_sub_tokens) + token + " "
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
        out_string += sp_model.decode_pieces(current_sub_tokens)
        out_string = out_string.replace(SPIECE_UNDERLINE, " ")
        return out_string.strip()

    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
        """Build model inputs from a sequence by appending eos_token_id."""
        if token_ids_1 is None:
            return token_ids_0 + [self.eos_token_id]
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return token_ids_0 + token_ids_1 + [self.eos_token_id]

    def _switch_to_input_mode(self):
        self.current_spm = self.spm_source
        self.current_encoder = self.encoder

    def _switch_to_target_mode(self):
        self.current_spm = self.spm_target
        if self.separate_vocabs:
            self.current_encoder = self.target_encoder
    @property
    def vocab_size(self) -> int:
        return len(self.encoder)

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        saved_files = []

        if self.separate_vocabs:
            out_src_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"],
            )
            out_tgt_vocab_file = os.path.join(
                save_directory,
                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["target_vocab_file"],
            )
            save_json(self.encoder, out_src_vocab_file)
            save_json(self.target_encoder, out_tgt_vocab_file)
            saved_files.append(out_src_vocab_file)
            saved_files.append(out_tgt_vocab_file)
        else:
            out_vocab_file = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab"]
            )
            save_json(self.encoder, out_vocab_file)
            saved_files.append(out_vocab_file)

        for spm_save_filename, spm_orig_path, spm_model in zip(
            [VOCAB_FILES_NAMES["source_spm"], VOCAB_FILES_NAMES["target_spm"]],
            self.spm_files,
            [self.spm_source, self.spm_target],
        ):
            spm_save_path = os.path.join(
                save_directory, (filename_prefix + "-" if filename_prefix else "") + spm_save_filename
            )
            if os.path.abspath(spm_orig_path) != os.path.abspath(spm_save_path) and os.path.isfile(spm_orig_path):
                copyfile(spm_orig_path, spm_save_path)
                saved_files.append(spm_save_path)
            elif not os.path.isfile(spm_orig_path):
                with open(spm_save_path, "wb") as fi:
                    content_spiece_model = spm_model.serialized_model_proto()
                    fi.write(content_spiece_model)
                saved_files.append(spm_save_path)

        return tuple(saved_files)

    def get_vocab(self) -> dict:
        return self.get_src_vocab()

    def get_src_vocab(self):
        return dict(self.encoder, **self.added_tokens_encoder)

    def get_tgt_vocab(self):
        return dict(self.target_encoder, **self.added_tokens_decoder)

    def __getstate__(self) -> dict:
        state = self.__dict__.copy()
        # SentencePiece processors and the normalizer are not reliably picklable; they are reloaded in __setstate__
        state.update(
            dict.fromkeys(["spm_source", "spm_target", "current_spm", "punc_normalizer", "target_vocab_file"])
        )
        return state

    def __setstate__(self, d: dict) -> None:
        self.__dict__ = d

        # for backward compatibility
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
        self.current_spm = self.spm_source
        self._setup_normalizer()

    def num_special_tokens_to_add(self, *args, **kwargs):
        """Just EOS"""
        return 1

    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
        if already_has_special_tokens:
            return self._special_token_mask(token_ids_0)
        elif token_ids_1 is None:
            return self._special_token_mask(token_ids_0) + [1]
        else:
            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]


def load_spm(path: str, sp_model_kwargs: dict[str, Any]) -> sentencepiece.SentencePieceProcessor:
    spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs)
    spm.Load(path)
    return spm


def save_json(data, path: str) -> None:
    with open(path, "w") as f:
        json.dump(data, f, indent=2)


def load_json(path: str) -> Union[dict, list]:
    with open(path, "r") as f:
        return json.load(f)


__all__ = ["MarianTokenizer"]