
import os
from shutil import copyfile
from typing import Any, Optional

import sentencepiece as spm

from ...tokenization_utils import AddedToken, BatchEncoding, PreTrainedTokenizer
from ...utils import logging
from ...utils.import_utils import requires


logger = logging.get_logger(__name__)

SPIECE_UNDERLINE = "▁"

VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

FAIRSEQ_LANGUAGE_CODES = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", "tr_TR", "vi_VN", "zh_CN"]  # fmt: skip


@requires(backends=("sentencepiece",))
class MBartTokenizer(PreTrainedTokenizer):
    """
    Construct an MBART tokenizer.

    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
    [SentencePiece](https://github.com/google/sentencepiece).

    The tokenization method is `<tokens> <eos> <language code>` for source language documents, and `<language code>
    <tokens> <eos>` for target language documents.

    Examples:

    ```python
    >>> from transformers import MBartTokenizer

    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
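    >>> # Illustration (values are checkpoint-specific): the encoder `input_ids` end
    >>> # with `</s>` then the `en_XX` code, and the labels with `</s>` then `ro_RO`.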
    ```"""

    vocab_files_names = VOCAB_FILES_NAMES
    model_input_names = ["input_ids", "attention_mask"]

    prefix_tokens: list[int] = []
    suffix_tokens: list[int] = []

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        tokenizer_file=None,
        src_lang=None,
        tgt_lang=None,
        sp_model_kwargs: Optional[dict[str, Any]] = None,
        additional_special_tokens=None,
        **kwargs,
    ):
        # Mask token behaves like a normal word, i.e. it includes the space before it
        mask_token = (
            AddedToken(mask_token, lstrip=True, normalized=False) if isinstance(mask_token, str) else mask_token
        )

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

        # Original fairseq vocab and spm vocab must be "aligned":
        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'

        # Mimic fairseq token-to-id alignment for the first 4 tokens
        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}

        # The first "real" token has position 4 in the fairseq vocab and position 3 in the spm vocab
        self.fairseq_offset = 1

        self.sp_model_size = len(self.sp_model)
        self.lang_code_to_id = {
            code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(FAIRSEQ_LANGUAGE_CODES)
        }
        self.id_to_lang_code = {v: k for k, v in self.lang_code_to_id.items()}
        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset

        self.fairseq_tokens_to_ids.update(self.lang_code_to_id)
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
        _additional_special_tokens = list(self.lang_code_to_id.keys())

        if additional_special_tokens is not None:
            # Only add those special tokens if they are not already there.
            _additional_special_tokens.extend(
                [t for t in additional_special_tokens if t not in _additional_special_tokens]
            )

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            tokenizer_file=tokenizer_file,
            src_lang=src_lang,
            tgt_lang=tgt_lang,
            additional_special_tokens=_additional_special_tokens,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self._src_lang = src_lang if src_lang is not None else "en_XX"
        self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
        self.tgt_lang = tgt_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility with pickles that predate sp_model_kwargs
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
    @property
    def vocab_size(self):
        # Plus 1 for the <mask> token, which sits at the very end of the vocab
        return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1

    @property
    def src_lang(self) -> str:
        return self._src_lang

    @src_lang.setter
    def src_lang(self, new_src_lang: str) -> None:
        self._src_lang = new_src_lang
        self.set_src_lang_special_tokens(self._src_lang)

    def get_special_tokens_mask(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
    ) -> list[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        prefix_ones = [1] * len(self.prefix_tokens)
        suffix_ones = [1] * len(self.suffix_tokens)
        if token_ids_1 is None:
            # e.g. with suffix [</s>, src_lang_code]: [5, 6, 7] -> [0, 0, 0, 1, 1]
            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. An MBART sequence has the following format, where `X` represents the sequence:

        - `input_ids` (for encoder) `X [eos, src_lang_code]`
        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`

        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
        separator.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        if token_ids_1 is None:
            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
        # We don't expect to process pairs, but leave the pair logic for API consistency
        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
    def create_token_type_ids_from_sequences(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. mBART does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`list[int]`):
                List of IDs.
            token_ids_1 (`list[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `list[int]`: List of zeros.
        """
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    def _build_translation_inputs(
        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
    ):
        """Used by the translation pipeline to prepare inputs for the generate function."""
        if src_lang is None or tgt_lang is None:
            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
        self.src_lang = src_lang
        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
        inputs["forced_bos_token_id"] = tgt_lang_id
        return inputs
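    # How the translation pipeline is expected to consume the method above (a
    # sketch, not code from this file; `model` is assumed to be an MBart model):
    #
    #   inputs = tokenizer._build_translation_inputs(
    #       "UN Chief", return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO"
    #   )
    #   model.generate(**inputs)  # decoding starts from the forced ro_RO id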
    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> list[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        spm_id = self.sp_model.PieceToId(token)

        # Need to return unknown token if the SP model returned 0
        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.fairseq_ids_to_tokens:
            return self.fairseq_ids_to_tokens[index]
        return self.sp_model.IdToPiece(index - self.fairseq_offset)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            # The loaded model has no file on disk; serialize the in-memory proto instead
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        return (out_vocab_file,)

    def prepare_seq2seq_batch(
        self,
        src_texts: list[str],
        src_lang: str = "en_XX",
        tgt_texts: Optional[list[str]] = None,
        tgt_lang: str = "ro_RO",
        **kwargs,
    ) -> BatchEncoding:
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)

    def _switch_to_input_mode(self):
        return self.set_src_lang_special_tokens(self.src_lang)

    def _switch_to_target_mode(self):
        return self.set_tgt_lang_special_tokens(self.tgt_lang)

    def set_src_lang_special_tokens(self, src_lang) -> None:
        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[src_lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]

    def set_tgt_lang_special_tokens(self, lang: str) -> None:
        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
        self.cur_lang_code = self.lang_code_to_id[lang]
        self.prefix_tokens = []
        self.suffix_tokens = [self.eos_token_id, self.cur_lang_code]
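    # Note: calling the tokenizer with `text_target=...` routes the target text
    # through _switch_to_target_mode, so labels are built with the
    # [</s>, tgt_lang_code] suffix, after which _switch_to_input_mode restores
    # the source-language setting for subsequent calls.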


__all__ = ["MBartTokenizer"]
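# Minimal usage sketch (assumes the facebook/mbart-large-en-ro checkpoint; the
# ids shown reflect that checkpoint's layout and will differ elsewhere):
#
#   tok = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
#   tok("UN Chief").input_ids        # ends with [2, 250004] -> </s>, en_XX
#   tok.src_lang = "ro_RO"           # property setter swaps the suffix tokens
#   tok("Şeful ONU").input_ids       # now ends with [2, 250020] -> </s>, ro_RO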