from __future__ import annotations

import collections
import json
import os
import string
from collections.abc import Iterable

from .WordTokenizer import ENGLISH_STOP_WORDS, WordTokenizer


class WhitespaceTokenizer(WordTokenizer):
    """
    Simple and fast white-space tokenizer. Splits a sentence on white space.
    Punctuation is stripped from tokens.
    """

    def __init__(
        self,
        vocab: Iterable[str] = [],
        stop_words: Iterable[str] = ENGLISH_STOP_WORDS,
        do_lower_case: bool = False,
    ):
        self.stop_words = set(stop_words)
        self.do_lower_case = do_lower_case
        self.set_vocab(vocab)

    def get_vocab(self):
        return self.vocab

    def set_vocab(self, vocab: Iterable[str]):
        self.vocab = vocab
        self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)])

    def tokenize(self, text: str, **kwargs) -> list[int]:
        if self.do_lower_case:
            text = text.lower()

        tokens = text.split()

        tokens_filtered = []
        for token in tokens:
            # Pass 1: match the raw token against stop words and the vocabulary.
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            # Pass 2: retry with leading/trailing punctuation stripped.
            token = token.strip(string.punctuation)
            if token in self.stop_words:
                continue
            elif len(token) > 0 and token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

            # Pass 3: retry the lowercased token; tokens that still do not
            # match the vocabulary are silently dropped.
            token = token.lower()
            if token in self.stop_words:
                continue
            elif token in self.word2idx:
                tokens_filtered.append(self.word2idx[token])
                continue

        return tokens_filtered

    def save(self, output_path: str):
        with open(os.path.join(output_path, "whitespacetokenizer_config.json"), "w") as fOut:
            json.dump(
                {
                    "vocab": list(self.word2idx.keys()),
                    "stop_words": list(self.stop_words),
                    "do_lower_case": self.do_lower_case,
                },
                fOut,
            )

    @staticmethod
    def load(input_path: str):
        with open(os.path.join(input_path, "whitespacetokenizer_config.json")) as fIn:
            config = json.load(fIn)

        return WhitespaceTokenizer(**config)
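

# A minimal usage sketch, not part of the original module: the vocabulary,
# example sentence, and temporary directory below are made up for
# illustration. It exercises tokenize() plus a save()/load() round trip.
if __name__ == "__main__":
    import tempfile

    tokenizer = WhitespaceTokenizer(vocab=["hello", "world", "sentence"], do_lower_case=True)

    # "Hello" is matched after lowercasing, "world!" after punctuation
    # stripping; stop words such as "the" are dropped.
    print(tokenizer.tokenize("Hello world! This is the sentence"))  # -> [0, 1, 2]

    # Persist the config and restore an equivalent tokenizer from disk.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tokenizer.save(tmp_dir)
        restored = WhitespaceTokenizer.load(tmp_dir)
        assert restored.tokenize("Hello world!") == tokenizer.tokenize("Hello world!")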