
    Chd                        S SK Jr  S SKrS SKrS SKrS SKrS SKrS SKJr  S SK	J
r
Jr  SSKJrJr  \R                  " \5      r " S S\5      rg)	    )annotationsN)Iterable)NLTK_IMPORT_ERRORis_nltk_available   )ENGLISH_STOP_WORDSWordTokenizerc                  z    \ rS rSrSr/ \SSS4         SS jjrS rSS jrSS	 jr	SS
 jr
\SS j5       rSrg)PhraseTokenizer   ar  Tokenizes the text with respect to existent phrases in the vocab.

This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example,
in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... => [New_York, is, the])
F_   c                    [        5       (       d3  [        [        R                  " U R                  R
                  5      5      e[        U5      U l        X0l        X@l	        XPl
        U R                  U5        g N)r   ImportErrorr   format	__class____name__set
stop_wordsdo_lower_casengram_separatormax_ngram_length	set_vocab)selfvocabr   r   r   r   s         n/var/www/html/shao/venv/lib/python3.13/site-packages/sentence_transformers/models/tokenizer/PhraseTokenizer.py__init__PhraseTokenizer.__init__   sW     !""/66t~~7N7NOPPj/*. 0u    c                    U R                   $ r   )r   )r   s    r   	get_vocabPhraseTokenizer.get_vocab)   s    zzr    c                   Xl         [        R                  " [        U5       VVs/ sH  u  p#X24PM
     snn5      U l        [        5       U l        [        5       U l        U H  nU R                  c  M  U R                  U;   d  M$  UR                  U R                  5      S-   nU R                  U R                  -   U;  d  Ma  X@R                  ::  d  Mr  U R                  R                  U5        U R                  R                  U5        M     [        U5      S:  aN  [        R                  SU R                   35        [        R                  S[        U R                  5       35        g g s  snnf )Nr   r   z(PhraseTokenizer - Phrase ngram lengths: zPhraseTokenizer - Num phrases: )r   collectionsOrderedDict	enumerateword2idxr   ngram_lookupngram_lengthsr   countr   addlenloggerinfo)r   r   idxwordngram_counts        r   r   PhraseTokenizer.set_vocab,   s%   
#//iX]N^0_N^$N^0_`  E UD##/D4H4HD4P"jj)=)=>B''$*>*>>dJ{^s^sOs%%))$/&&**;7  u:>KKB4CUCUBVWXKK9#d>O>O:P9QRS  1`s   E*
c                   SSK Jn  U" USS9n[        U R                  SS9 H  nSnU[	        U5      U-
  ::  d  M  U R
                  R                  XFXe-    5      nXpR                  ;   a  U/XFXe-   & O3UR                  5       U R                  ;   a  UR                  5       /XFXe-   & US-  nU[	        U5      U-
  ::  a  M  M     / nU GH  n	XR                  ;   a  M  XR                  ;   a   UR                  U R                  U	   5        MD  U	R                  5       n	XR                  ;   a  Me  XR                  ;   a   UR                  U R                  U	   5        M  U	R                  [        R                  5      n	XR                  ;   a  M  [	        U	5      S:  d  M  XR                  ;   d  M  UR                  U R                  U	   5        GM     U$ )Nr   )word_tokenizeT)preserve_line)reverser   )nltkr5   sortedr*   r-   r   joinr)   lowerr   r(   appendstripstringpunctuation)
r   textkwargsr5   tokens	ngram_lenr0   ngramtokens_filteredtokens
             r   tokenizePhraseTokenizer.tokenize?   s   &t48   2 2DAICVy00,,11&s2OP---5:GF1[[]d&7&775:[[]OF1q Vy00 B E'--'&&t}}U';<KKME'--'&&t}}U';<KK 2 23E'UaE]]$:&&t}}U';<' * r    c           	     j   [        [        R                  R                  US5      S5       n[        R
                  " [        U R                  R                  5       5      [        U R                  5      U R                  U R                  U R                  S.U5        S S S 5        g ! , (       d  f       g = f)Nphrasetokenizer_config.jsonw)r   r   r   r   r   )openospathr:   jsondumplistr(   keysr   r   r   r   )r   output_pathfOuts      r   savePhraseTokenizer.saveh   s~    "'',,{,IJCPTXII!$--"4"4"67"&t"7%)%7%7'+';';(,(=(= 	 QPPs   A0B$$
B2c                    [        [        R                  R                  U S5      5       n[        R
                  " U5      nS S S 5        [        S0 WD6$ ! , (       d  f       N= f)NrJ    )rL   rM   rN   r:   rO   loadr   )
input_pathfInconfigs      r   rY   PhraseTokenizer.loadu   sI    "'',,z+HIJcYYs^F K ((( KJs   A
A")r   r   r*   r)   r   r   r   r(   N)
r   Iterable[str]r   r^   r   boolr   strr   int)r   r^   )r@   r`   returnz	list[int])rS   r`   )rZ   r`   )r   
__module____qualname____firstlineno____doc__r   r   r"   r   rG   rU   staticmethodrY   __static_attributes__rX   r    r   r   r      sw      "$6#" ! " 	
  "T&'R ) )r    r   )
__future__r   r%   rO   loggingrM   r>   collections.abcr   transformers.utils.import_utilsr   r   r	   r   	getLoggerr   r.   r   rX   r    r   <module>rn      s>    "    	  $ P <			8	$i)m i)r    