o
    tBh                     @   s>  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlmZ d dlZd dlmZ d dlmZ d dlm Z  d dl!m"Z"m#Z#m$Z$m%Z% d dl&m'Z' d dl(m)Z) d dl*Z*d dl+m,Z, dZ-dZ.e-e. Z/dd Z0dd Z1dd  Z2d!d" Z3d#d$ Z4d%d& Z5ej67d'eefd(d) Z8d*d+ Z9d,d- Z:d.d/ Z;d0d1 Z<d2d3 Z=d4d5 Z>d6d7 Z?d8d9 Z@d:d; ZAd<d= ZBd>d? ZCd@dA ZDej6EdBej67dCdDdEgdFdG ZFdHdI ZGdJdK ZHdLdM ZIdNdO ZJdPdQ ZKdRdS ZLdTdU ZMdVdW ZNe#dXdY ZOej6EdBej67dCdDdEgdZd[ ZPej67d'eefd\d] ZQej6EdBej67dCdDdEgd^d_ ZRd`da ZSdbdc ZTej67dddedfieUdgfdhdiieUdjfdedkieUdlfdhdmieUdnfdodfdpeUdgfdidqdpeUdjfdrdmieUdsfdrdtieVduffdvdw ZWej6EdBej67dCdDdEgdxdy ZXe#dzd{ ZYej67d'eefd|d} ZZd~d Z[dd Z\dd Z]e#dd Z^dd Z_dd Z`ej67dejaejbejcgdd Zdej6EdBej67dCdDdEgdd Zeej6EdBej67dCdDdEgdd Zfdd Zgdd Zhdd Zidd Zjdd Zkdd Zle#dd Zmdd Zndd Zodd Zpej67d'eeefdd Zqej67dejrejsgdd Ztdd Zuej67dejvejsdfejwejsdfejrejrdfejsejsdfgdd Zxej67deddeddeddgdd Zydd Zze#dd Z{e%dd Z|e#ej67deeegdd Z}ej67deeegej67dde~dfdedfgddń Zej67deeejee#dƍgej67dddɄ ddɄ gej67dddgdd̈́ Zej67deeegddτ Zej67d'eeegej67dddgddddddddf	dddɄ dddddddf	dddɄ dddddddf	ddddɄ ddddɄ dddf	dddddddɄ dddf	dgdd Zej67deddddoddgfee-ffdd Zdd Zdd Ze#dd ZdS )    )MappingN)sparse)
strip_tagsstrip_accents_unicodestrip_accents_ascii)HashingVectorizer)CountVectorizer)TfidfTransformer)TfidfVectorizer)ENGLISH_STOP_WORDS)train_test_split)cross_val_score)GridSearchCV)Pipeline)	LinearSVC)clone)assert_array_almost_equal)assert_array_equal)IS_PYPY)assert_almost_equalfails_if_pypyassert_allclose_dense_sparseskip_if_32bit)defaultdict)partial)StringIO)zthe pizza pizza beer copyrightzthe pizza burger beer copyrightz!the the pizza beer beer copyrightzthe burger beer beer copyrightzthe coke burger coke copyrightzthe coke burger burger)zthe salad celeri copyrightz)the salad salad sparkling water copyrightzthe the celeri celeri copyrightzthe tomato tomato salad waterz the tomato salad water copyrightc                 C   s   t |  S N)r   uppers r"   /var/www/html/riverr-enterprise-integrations-main/venv/lib/python3.10/site-packages/sklearn/feature_extraction/tests/test_text.py	uppercase>      r$   c                 C   s   |  ddS )N   ée)replacer    r"   r"   r#   strip_eacuteB   r%   r)   c                 C      |   S r   splitr    r"   r"   r#   split_tokenizeF      r-   c                 C   s   dgS )Nthe_ultimate_featurer"   r    r"   r"   r#   lazy_analyzeJ   s   r0   c                  C   s   d} d}t | |ksJ d} d}t | |ksJ d} d}t | |ks$J d} d}t | |ks0J d	} d
}t | |ks<J d} d}t | |ksHJ d} d
}t | |ksTJ d S )N   àáâãäåçèéêëaaaaaaceeee   ìíîïñòóôõöùúûüýiiiinooooouuuuy   إu   ا   this is à testthis is a testu   öou   ̀́̂̃ u   ȫr   aexpectedr"   r"   r#   test_strip_accentsN   s*   r=   c                  C   sd   d} d}t | |ksJ d} d}t | |ksJ d} d}t | |ks$J d} d}t | |ks0J d S )	Nr1   r2   r3   r4   r5   r9   r6   r7   r   r:   r"   r"   r#   test_to_asciir   s   r>   
Vectorizerc                 C   s   | dd  }d}g d}|||ksJ d}g d}|||ks#J | dd  }td	}g d
}|||ks:J | td  }d}g d}|||ksOJ | tdd  }d}g d}|||kseJ d S )Nasciistrip_accents:   J'ai mangé du kangourou  ce midi, c'était pas très bon.)
aimangedu	kangouroucemidietaitpastresbonz0This is a test, really.

 I met Harry yesterday.)thisistestreallymetharry	yesterdayfile)input'This is a test with a file-like object!)rN   rO   rP   withrU   likeobjectpreprocessoru;   J'ai mangé du kangourou  ce midi,  c'était pas très bon.)
AIMANGEDU	KANGOUROUCEMIDIETAITPASTRESBON)	tokenizerrB   )
zj'airE   rF   rG   rH   zmidi,zc'etaitrK   rL   zbon.)build_analyzerr   r$   r-   )r?   watextr<   r"   r"   r#   test_word_analyzer_unigrams   s&   rk   c                  C   s2   t dddd } d}g d}| ||ksJ d S )Nwordunicode      analyzerrB   ngram_rangerC   )rD   rE   rF   rG   rH   rI   rJ   rK   rL   rM   zai mangezmange duzdu kangourouzkangourou cezce midiz
midi etaitz	etait paszpas tresztres bon)r
   rh   )ri   rj   r<   r"   r"   r#   'test_word_analyzer_unigrams_and_bigrams   s   rt   c                  C   s   d} |  d}tddd }tt || W d    n1 s#w   Y  tdddd }tt || W d    d S 1 sFw   Y  d S )	NrC   zutf-8rn   r@   )rs   encodingchar      )rr   rs   ru   )encoder
   rh   pytestraisesUnicodeDecodeError)rj   
text_bytesri   car"   r"   r#   test_unicode_decode_error   s   


"r   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J d
}g d}| |d d |ks=J g d}| |d	d  |ksMJ t dddd } td}g d}| |d d |ksjJ d S )Nrv   rm   rw   rq   u9   J'ai mangé du kangourou  ce midi, c'était pas très bon)zj'az'aizai zi mz ma   )zs tresz tres ztres bzres bozes bon1This 
	is a test, really.

 I met Harry yesterday)thihisis zs iz is)z yesteyesteresterdsterdaterdayrU   rV   rr   rs   rW   r
   rh   r   cngarj   r<   r"   r"   r#   test_char_ngram_analyzer   s.   r   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J t d
ddd } td}g d}| |d d |ksHJ d S )Nchar_wbrm   rw   rq   r   )z thr   r   r   z thir   )r   r   r   r   zerday r   rU   r   zA test with a file-like object!)z a z tetesestzst z tesry   r   r   r"   r"   r#   test_char_wb_ngram_analyzer  s$   r   c                  C   s   t dddd } d}g d}| |d d |ksJ g d}| |d	d  |ks+J t d
ddd }t|}||| |ksBJ d S )Nrl   rm   rw   rq   r   )zthis is testzis test reallyztest really metrx   )ztest really met harry yesterdayzthis is test really met harryz"is test really met harry yesterdayrU   r   r   )r   rj   r<   	cnga_filerU   r"   r"   r#   test_word_ngram_analyzer$  s"   r   c                  C   s   ddd} t |  }ttttttfD ]O}|| }t|d}|	t
 t|tr1|j| ks0J n	t |j|ks:J |t
}|jd t|ksJJ || }t|d}||}t||jd kscJ qd S )Nr   ro   pizzabeer
vocabulary)setkeysdictlistiterr   r   intr
   fitJUNK_FOOD_DOCS
isinstancer   vocabulary_	transformshapeleninverse_transform)vocabtermstypvvectXinvr"   r"   r#   &test_countvectorizer_custom_vocabulary;  s    






r   c                  C   sd   ddg} t dt| dfdt fg}|t}t|jd jt| ks%J |jd t	| ks0J d S )Nr   r   countr   tfidfro   )
r   r
   r   fit_transformALL_FOOD_DOCSr   named_stepsr   r   r   )what_we_likepiper   r"   r"   r#   /test_countvectorizer_custom_vocabulary_pipelineP  s   
r   c                  C   sX   ddd} d}t jt|d t| d}|dg W d    d S 1 s%w   Y  d S )Nr   r   z$Vocabulary contains repeated indicesmatchr   pasta_sizilianar{   r|   
ValueErrorr
   r   )r   msgr   r"   r"   r#   7test_countvectorizer_custom_vocabulary_repeated_indices]  s   

"r   c                  C   sT   ddd} t jtdd t| d}|dg W d    d S 1 s#w   Y  d S )Nro   rp   r   zdoesn't contain indexr   r   pasta_verdurar   r   r   r"   r"   r#   0test_countvectorizer_custom_vocabulary_gap_indexe  s
   

"r   c                  C   s   t  } | jdd |  tksJ | jdd tt |   W d    n1 s+w   Y  | jdd tt |   W d    n1 sJw   Y  g d}| j|d |  t|kscJ d S )Nenglish
stop_words_bad_str_stop__bad_unicode_stop_)someotherwords)r
   
set_paramsget_stop_wordsr   r{   r|   r   r   )cvstoplistr"   r"   r#   test_countvectorizer_stop_wordsl  s   

r   c                  C   s   t jtdd tg d} | dg W d    n1 sw   Y  t jtdd tddd}|g d W d    d S 1 sBw   Y  d S )	Nzempty vocabularyr   r   foo      ?r   )max_dfr   )zto be or not to bez
and me toozand so do your   )r   r   r"   r"   r#   %test_countvectorizer_empty_vocabulary{  s   
"r   c                  C   sF   t  } | td d }| tdd  }|jd |jd ks!J d S )Nr   ro   )r
   r   r   r   )r   X1X2r"   r"   r#   test_fit_countvectorizer_twice  s   r   zignore::FutureWarning:sklearn	get_namesget_feature_namesget_feature_names_outc                 C   sB   g d}d}t |d}|| g d}t||  }t|| dS )zCheck `get_feature_names()` when a custom token pattern is passed.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    z&This is the 1st document in my corpus.z This document is the 2nd sample.zAnd this is the 3rd one.zIs this the 4th document?z'[0-9]{1,3}(?:st|nd|rd|th)\s\b(\w{2,})\btoken_pattern)documentonesampleN)r
   r   getattrr   )r   corpusr   
vectorizerr<   feature_names_outr"   r"   r#   )test_countvectorizer_custom_token_pattern  s   

r   c                  C   sX   g d} d}d}t |d}tjt|d ||  W d   dS 1 s%w   Y  dS )zCheck that we raise an error if token pattern capture several groups.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/12971
    r   z)([0-9]{1,3}(?:st|nd|rd|th))\s\b(\w{2,})\bz,More than 1 capturing group in token patternr   r   Nr
   r{   r|   r   r   )r   r   err_msgr   r"   r"   r#   <test_countvectorizer_custom_token_pattern_with_several_group  s   
"r   c                  C   s   g d} d}t d| d}tjt|d ||  W d    n1 s#w   Y  t  tdt ||  W d    d S 1 sCw   Y  d S )N)SampleUpperCase
VocabularyzyUpper case characters found in vocabulary while 'lowercase' is True. These entries will not be matched with any documentsT)	lowercaser   r   error)	r
   r{   warnsUserWarningr   warningscatch_warningssimplefilterr   )r   messager   r"   r"   r#   'test_countvectorizer_uppercase_in_vocab  s   
"r   c                  C   sH   g dg dg dg} t ddd| }g d}||}t|| dS )	z0Check get_feature_names_out for TfidfTransformerro   ro   ro   ro   ro   r   ro   r   r   Tl2
smooth_idfnorm)r;   cbN)r   r   r   r   )r   trfeature_names_inr   r"   r"   r#   %test_tf_transformer_feature_names_out  s
   
r  c                  C   s   g dg dg dg} t ddd}||  }|dk s J t|d jd	d
g d g dg dg dg} t ddd}||  }|dk sMJ d S )Nr   r   r   Tr   r   r   rp   ro   axisr   r   r   )r   r   toarrayallr   sumr   r   r   r"   r"   r#   test_tf_idf_smoothing  s   r	  c                  C   s   g dg dg dg} t ddd}||  }|dk s J t|d jd	d
g d g dg dg dg} t ddd}d}tjt|d ||   W d    d S 1 sZw   Y  d S )Nr   r   r   Fr   r   r   rp   ro   r  r  zdivide by zeror   )	r   r   r  r  r   r  r{   r   RuntimeWarning)r   r   r   in_warning_messager"   r"   r#   test_tfidf_no_smoothing  s   "r  c                  C   s   dgdgdgg} t ddd d}||  }|d dksJ |d |d ks(J |d |d ks2J |d dk s:J |d dk sBJ d S )Nro   rp   rx   TF)sublinear_tfuse_idfr   r   )r   r   r  r  r"   r"   r#   test_sublinear_tf  s   r  c                  C   s  t td d } td g}ttd }tdd}|| }t|dr&| }|d|jd f dks3J t|jd	}||fD ]s}||}t|drM| }|j}|d|d
 f dks\J |d|d f dkshJ |d|d f dkstJ d|vszJ d|vsJ |d|d f dksJ |d|d f dksJ |d|d f dksJ |d|d f dksJ q=t	dd}	|	
|| }
t|	jt|jksJ |
j|t|jfksJ |	| }|jt|t|jfksJ t	ddd}|
|| }t|drJ t	dd}tt || W d    n	1 s w   Y  ttj|dddg|  t td d } tdd}|j|_||  }|jrPJ t|
| || }t|| td d	}tt ||  W d    n	1 s|w   Y  |jddd | }d}t|}||}||ksJ |jdd d tt |  W d    n	1 sw   Y  d |_tt |  W d    d S 1 sw   Y  d S )!Nro         ?r   tocsrr   r   rp   r   saladtomatowaterthe	copyrightcokeburgerr   l1r   F)r   r  idf_Tr  r  r   r@   )rB   r   rC   _gabbledegook_)rB   r\   _invalid_analyzer_type_)r   r   r   r
   r   hasattrr  r   r   r   r   r  r  r   r{   r|   r   r   npr  r   r   fixed_vocabulary_r   build_preprocessorr   rh   )
train_data	test_datan_trainv1counts_trainv2r   counts_testr   t1r   
tfidf_testt2tft3tvtfidf2tfidf_test2v3	processorrj   r<   resultr"   r"   r#   test_vectorizer  s~   













$r7  c                  C   s  d\} }}}t | |||d}|t |jj| ksJ |jj|ks#J |jj|ks+J |jj|ks3J d|_d|_d|_d|_|jj| ksGJ |jj|ksOJ |jj|ksWJ |jj|ks_J |t |jj|jksmJ |jj|jksvJ |jj|jksJ |jj|jksJ d S )N)r   FFF)r   r  r   r  r  T)r   r   r   _tfidfr   r  r   r  )r   r  r   r  r1  r"   r"   r#   test_tfidf_vectorizer_settersj  s,   

r9  c                  C   sv  t  } | t}|j}|jtt| jfksJ |j| jksJ t	|j
dks)J t	|j
dk s3J t|j
dks=J t|j
dk sGJ t|jd D ]}ttj|d j
dd qNt ddd} | t}|jtt| jfksuJ |j| jks}J |j}||ksJ |d| k sJ t	|j
dksJ t|j
dk sJ t|jd D ]}ttj|d j
dd qd S )	Nr  r   ro   rp   r   rn   r  )rs   r   )r	   r   r   nnzr   r   
n_featuresdtyper"  mindatamaxranger   linalgr   )r   r   	token_nnzi
ngrams_nnzr"   r"   r#   test_hashing_vectorizer  s.   

rE  c           	      C   sV  t dd}tt t||   W d    n1 sw   Y  |jr%J |t}|j\}}t	|j
|ks8J t||  }| dkrRt|tjsJJ |jtksQJ nt|tsYJ t	||ksaJ tg d| t|D ]\}}||j
|kszJ qlg d}t |d}t||  }tg d| |jsJ t|D ]\}}||j
|ksJ qd S )Nr  r  r   	r   r  celerir  r   r  	sparklingr  r  r   )r
   r{   r|   r   r   r#  r   r   r   r   r   r   r"  ndarrayr<  rZ   r   r   	enumerateget)	r   r   r   	n_samplesr;  feature_namesidxnamer   r"   r"   r#   test_feature_names  s>   





rP  c                 C   sJ   h d}h d}| ddd}| t t|j|ksJ |j|ks#J d S )N>   r   r   r  r  >   r  r  r  rG  r  r  rH  g333333?   )r   max_features)r   r   r   r   stop_words_)r?   expected_vocabularyexpected_stop_wordsr   r"   r"   r#   test_vectorizer_max_features  s   
rV  c           
      C   s   t dd}t dd}t d d}|tjdd}|tjdd}|tjdd}t||  }t||  }t||  }	d| ksDJ d| ksLJ d| ksTJ d|t| ks_J d|t| ksjJ d|	t| ksuJ d S )Nro   rR  rx   r   r     r  )r
   r   r   r  r   r?  r"  argmax)
r   cv_1cv_3cv_Nonecounts_1counts_3counts_None
features_1
features_3features_Noner"   r"   r#   "test_count_vectorizer_max_features  s   


rc  c                  C   s  g d} t ddd}||  d|j v sJ t|j dks#J t|jdks,J d|_||  d|j vs=J t|j d	ksHJ d|jv sOJ t|jd
ksXJ d|_||  d|j vsiJ t|j d	kstJ d|jv s{J t|jd
ksJ d S )Nabcdeaeatrv   r   rr   r   r;   ry   r   r  rQ  rp   ro   )r
   r   r   r   r   rS  r   r&  r   r"   r"   r#   test_vectorizer_max_df-  $   


rj  c                  C   s  g d} t ddd}||  d|j v sJ t|j dks#J t|jdks,J d|_||  d	|j vs=J t|j dksHJ d	|jv sOJ t|jd
ksXJ d|_||  d	|j vsiJ t|j dkstJ d	|jv s{J t|jdksJ d S )Nrd  rv   ro   )rr   min_dfr;   ry   r   rp   r   rQ  g?r   )r
   r   r   r   r   rS  rl  ri  r"   r"   r#   test_vectorizer_min_dfD  rk  rm  zparams, err_type, messager   g       @zmax_df == 2.0, must be <= 1.0.rl  g      ?zmin_df == 1.5, must be <= 1.0.zmax_df == -2, must be >= 0.izmin_df == -10, must be >= 0.rx   )rl  r   2   rR  z"max_features == -10, must be >= 0.g      @z2max_features must be an instance of int, not floatc                 C   s\   t j||d g d}tdi | ddi}|| W d    d S 1 s'w   Y  d S )Nr   rd  rr   rv   r"   )r{   r|   r
   r   )paramserr_typer   r&  r   r"   r"   r#   !test_vectorizer_params_validation[  s
   "rr  c                 C   s   ddg}t ddd}|| }tg dt||   tg dg dg| t ddd	d
}|| }tg dg dg| t ddd	tjd}||}|jtjksVJ d S )Naaabcabbderv   r   rh  )r;   r   r   dr'   )rx   ro   ro   r   r   )ro   rp   r   ro   ro   T)rr   r   binary)ro   ro   ro   r   r   )ro   ro   r   ro   ro   )rr   r   rv  r<  )r
   r   r  r   r   r"  float32r<  )r   r&  r   r   X_sparser"   r"   r#   test_count_binary_occurrencest  s   
ry  c                  C   s   ddg} t ddd d}|| }t|dd jdksJ t|dd	 jd	ks,J |jtjks4J t ddd
d d}|| }t|jdksKJ |jtjksSJ t ddd
d tjd}|| }|jtjksjJ d S )Nrs  rt  Frv   )alternate_signrr   r   r   ro   rx   rp   T)rr   rz  rv  r   )rr   rz  rv  r   r<  )r	   r   r"  r?  r>  r<  float64)r&  r   r   r"   r"   r#   test_hashed_binary_occurrences  s"   


r|  c                 C   s  t }|  }||}||}t|tsJ | }t||D ]\}}tt	||}tt	|}t
|| qt|sBJ |jdksIJ | }	||	}
t||
D ]\}}t
t|t| qW| }||}t||D ]\}}t
t|t| qud S )Ncsr)r   r   r   r   r   rh   zipr"  sortuniquer   r   issparseformatr  tocsc)r?   r>  r   transformed_datainversed_dataanalyzedocinversed_termsr   transformed_data2inversed_data2terms2transformed_data3inversed_data3terms3r"   r"   r#   !test_vectorizer_inverse_transform  s*   



r  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
d}t||ddd}|||	|}	t
|	| |jdksNJ |jjd }
|
jdks[J d S )Nr  ro   g?r   	test_sizerandom_stater   svcro   ro   rn   hingesquared_hinge)vect__ngram_range	svc__lossrx   )n_jobsr   r   )r   NOTJUNK_FOOD_DOCSr   r   r   r
   r   r   r   predictr   best_score_best_estimator_r   rs   r>  targetr%  r&  target_traintarget_testpipeline
parametersgrid_searchpredbest_vectorizerr"   r"   r#   -test_count_vectorizer_pipeline_grid_selection  s   
r  c                  C   s   t t } dgtt  dgtt  }t| |ddd\}}}}tdt fdt fg}dd	gd
dd}t||dd}|||	|}	t
|	| |jdksNJ |jjd }
|
jdks[J |
jdksbJ |
jrgJ d S )Nr  ro   g?r   r  r   r  r  rn   )r  r   r  )r  
vect__normr  )r  r   r   )r   r  r   r   r   r   r   r   r   r  r   r  r  r   rs   r   r#  r  r"   r"   r#   'test_vectorizer_pipeline_grid_selection  s$   
r  c                  C   s^   t t } dgtt  dgtt  }tdt fdt fg}t|| |dd}t|g d d S )Nr  ro   r   r  rx   )r   r  )r   r  r   r   r   r   r   r   )r>  r  r  	cv_scoresr"   r"   r#   )test_vectorizer_pipeline_cross_validation  s
   r  c                  C   sx   d} t  }|| g}|jdksJ td dd}|| g}|jdks%J |j|jks-J tt|j	t|j	 d S )Nu   Машинное обучение — обширный подраздел искусственного интеллекта, изучающий методы построения алгоритмов, способных обучаться.)ro      F)r   rz  )ro   i   )
r
   r   r   r	   r   r:  r   r"  r  r>  )r   r   	X_countedX_hashedr"   r"   r#   test_vectorizer_unicode  s   r  c                  C   sF   ddg} t | d}|t}|t}t| |  |js!J d S )Nr   rG  r   )r   r   r   r   r   r  r#  )r   r   X_1X_2r"   r"   r#   +test_tfidf_vectorizer_with_fixed_vocabulary6  s   


r  c                  C   s   t  t ddt ddt ddt ttdttdttdtttd	tt ttdt tg} | D ]2}t	|}t
|}t||jksJJ | | ksTJ tr\t|t r\q5t|t|t q5d S )
Nr  r  T)rv  rn   rs   r[   )rr   rA   )r	   r
   r   r0   r   r   r)   r   pickledumpsloadstype	__class__
get_paramsr   r   r   r   )	instancesorigr!   copyr"   r"   r#   test_pickling_vectorizer@  s2   


r  factoryc                 C   sB   t  }| |}d}tt|}||}||}||ksJ dS )z_Tokenizers cannot be pickled
    https://github.com/scikit-learn/scikit-learn/issues/12833
    rC   N)r
   r  r  r  )r  vecfunctionrj   roundtripped_functionr<   r6  r"   r"   r#   test_pickling_built_processors^  s   r  c                 C   s   t jd}t g d}tddD ]0}t|j|ddd}t|d}t	t
|}|t |t tt||  t||   qd S Nr   rF  d   r   F)sizer(   r   )r"  randomRandomStatearrayr@  r   choicer
   r  r  r  r   r   r   r   )r   rngvocab_wordsx	vocab_setr   unpickled_cvr"   r"   r#   -test_countvectorizer_vocab_sets_when_picklingt  s   


r  c           	      C   s   t jd}t g d}tddD ]?}t }|j|ddd}tddD ]}|||| < q$t|d}t	t
|}|t |t tt||  t||   qd S r  )r"  r  r  r  r@  r   r  r
   r  r  r  r   r   r   r   )	r   r  r  r  
vocab_dictr   yr   r  r"   r"   r#   .test_countvectorizer_vocab_dicts_when_pickling  s   


r  c                  C   s   t  tttdtttdtf} | D ])}|t }d |_|t }t	|d |t }t
|| t
|| qd S )Nr[   rA   rS  )r   r   r   r
   r   r)   r   r  rS  delattrr   )fitted_vectorizersr   vect_transformstop_None_transformstop_del_transformr"   r"   r#   test_stop_words_removal  s   


r  c                  C   s`   t  t} t | }t|}t|}t||j	ksJ t
||  ||   d S r   )r
   r   r   r   r   r  r  r  r  r  r   r  )r   r  r!   r  r"   r"   r#   test_pickling_transformer  s   

"r  c                  C   sH   t  t} t | }t }|j|_t||  ||   d S r   )	r
   r   r   r   r   r  r   r   r  )r   r  r  r"   r"   r#   test_transformer_idf_setter  s
   "r  c                  C   s   t dd} | t t | jdd}| j|_t|t | t  t | jdd}d}tj	t
|d | j|_W d    d S 1 sDw   Y  d S )NTr  r   r  Fz+`idf_` cannot be set when `user_idf=False`.r   )r   r   r   r   r  r   r   r  r{   r|   r   )r  r  r   r"   r"   r#   test_tfidf_vectorizer_setter  s   


"r  c                  C   sv   t dd} | t t | jdd}t| j}dg|d  }tt t	|d| W d    d S 1 s4w   Y  d S )NTr  r  r   ro   r  )
r   r   r   r   r   r  r{   r|   r   setattr)r   r  expected_idf_leninvalid_idfr"   r"   r#   %test_tfidfvectorizer_invalid_idf_attr  s   


"r  c                  C   sL   g d} t | d}tt |g  W d    d S 1 sw   Y  d S )N)r;   r   r   r;   r;   r   r   r   r"   r"   r#   test_non_unique_vocab  s
   
"r  c                  C   sJ   d} t }dd }tj|| d |  W d    d S 1 sw   Y  d S )Nz?np.nan is an invalid document, expected byte or unicode string.c                  S   s   t  } | dtjdg d S )Nhello worldhello hello)r	   r   r"  nan)hvr"   r"   r#   func  s   z0test_hashingvectorizer_nan_in_docs.<locals>.funcr   )r   r{   r|   )r   	exceptionr  r"   r"   r#   "test_hashingvectorizer_nan_in_docs  s   "r  c                  C   sd   t ddd d} | jsJ | ddg }t| g d | ddg }t| g d d S )NTF)rv  r  r   r  r  )ro   ro   ro   r   )r   rv  r   r  r   ravelr   )r   r   r   r"   r"   r#   test_tfidfvectorizer_binary  s   
r  c                  C   s(   t dd} | t t| j| jj d S )NTr  )r   r   r   r   r  r8  )r   r"   r"   r#   test_tfidfvectorizer_export_idf  s   

r  c                  C   s<   t dgd} t| }| t |t |j| jksJ d S )Nr  r   )r   r   r   r   r   )
vect_vocabvect_vocab_cloner"   r"   r#   test_vectorizer_vocab_clone  s
   

r  c                 C   s   d}|  }t jt|d |d W d    n1 sw   Y  t jt|d |d W d    n1 s8w   Y  |ddg t jt|d |d W d    d S 1 s\w   Y  d S )NzBIterable over raw text documents expected, string object received.r   zhello world!	some textzsome other text)r{   r|   r   r   r   r   )r?   r   r  r"   r"   r#   &test_vectorizer_string_object_as_input  s   "r  X_dtypec                 C   s2   t jdd| dd}t |}|j|jksJ d S N
   i N  *   )r<  r  )r   randr   r   r<  )r  r   X_transr"   r"   r#   test_tfidf_transformer_type0  s   r  c                  C   s^   t jddtjdd} t | }t | }t |}t |}t|| |j	|j	ks-J d S r  )
r   r  r"  r{  
csc_matrix
csr_matrixr   r   r   r  )r   X_cscX_csrX_trans_cscX_trans_csrr"   r"   r#   test_tfidf_transformer_sparse7  s   


r  z0vectorizer_dtype, output_dtype, warning_expectedTFc                 C   s   t g d}t| d}d}|r-tjt|d ||}W d    n1 s'w   Y  nt  t	dt ||}W d    n1 sGw   Y  |j
|ksSJ d S )N)numpyscipysklearnr<  z'dtype' should be used.r   r   )r"  r  r   r{   r   r   r   r   r   r   r<  )vectorizer_dtypeoutput_dtypewarning_expectedr   r   warning_msg_matchX_idfr"   r"   r#   test_tfidf_vectorizer_typeB  s   


r	  r  )rp   ro   r  c                 C   s   | j }td| d}t| trtrtjdd tjt	|d | 
dg W d    n1 s1w   Y  tjt	|d | dg W d    n1 sNw   Y  t| trxtjt	|d | dg W d    d S 1 sqw   Y  d S d S )NzInvalid value for ngram_range=z/ lower boundary larger than the upper boundary.*HashingVectorizer is not supported on PyPy)reasonr   zgood news everyone)rs   reescaper   r	   r   r{   xfailr|   r   r   r   r   )r  invalid_ranger   r"   r"   r#   $test_vectorizers_invalid_ngram_rangeZ  s"   

"r  c                 C   s&   |   }|  }|  }| |||S r   )r   build_tokenizerr$  _check_stop_words_consistency)	estimatorr   tokenize
preprocessr"   r"   r#   r  x  s   r  c               	   C   s   d} d|  }t  t t fD ]1}|jg dd tjt|d |dg W d    n1 s0w   Y  |`t	|du s?J qt
  t
dt |dg W d    n1 s[w   Y  t	|d u shJ |jg d	d tjt|d |dg W d    d S 1 sw   Y  d S )
Nz\['and', 'll', 've'\]z}Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens %s not in stop_words.)you'veyouyou'llANDr   r   r  Fr   )r  r  r  blahr  )r
   r   r	   r   r{   r   r   r   _stop_words_idr  r   r   r   )lstrr   r  r"   r"   r#   'test_vectorizer_stop_words_inconsistent  s*   
"r  c                  C   s`   t jdtjd} tj}| j|| _| j|| _dddd}t | |}||jj	ks.J dS )z
    Check that CountVectorizer._sort_features preserves the dtype of its sparse
    feature matrix.

    This test is skipped on 32bit platforms, see:
        https://github.com/scikit-learn/scikit-learn/pull/11295
    for more details.
    )r   r   r  r   ro   rp   )zscikit-learnrO   zgreat!N)
r   r  r"  int64indicesastypeindptrr
   _sort_featuresr<  )r   INDICES_DTYPEr   Xsr"   r"   r#   7test_countvectorizer_sort_features_64bit_sparse_indices  s   r%  	Estimatorc                 C   s   ddig}|  }t |du sJ | dd dgd}t |dks!J t |d u s)J || G d	d
 d
| }|dgd}t |dksDJ | dd dgd}t |du sUJ d S )Nrj   r  Tc                 S      | d S Nrj   r"   r  r"   r"   r#   <lambda>      z?test_stop_word_validation_custom_preprocessor.<locals>.<lambda>and)r\   r   r   c                   @   s   e Zd Zdd ZdS )zFtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimatorc                 S   s   dd S )Nc                 S   r'  r(  r"   r)  r"   r"   r#   r*    r+  zktest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessor.<locals>.<lambda>r"   )selfr"   r"   r#   r$    r.   zYtest_stop_word_validation_custom_preprocessor.<locals>.CustomEstimator.build_preprocessorN)__name__
__module____qualname__r$  r"   r"   r"   r#   CustomEstimator  s    r1  r   c                 S   s   t d| S )Nz\w{1,})r  compilefindallr  r"   r"   r#   r*    s    )rg   r   )r  r   )r&  r>  r  r1  r"   r"   r#   -test_stop_word_validation_custom_preprocessor  s   


r5  zinput_type, err_type, err_msgfilenamer9   rU   z$'str' object has no attribute 'read'c                 C   sh   t | trtrtd dg}tj||d | dd |d| W d    d S 1 s-w   Y  d S )Nr
  "this is text, not file or filenamer   c                 S   r*   r   r+   r)  r"   r"   r#   r*    r+  z.test_callable_analyzer_error.<locals>.<lambda>rr   rV   )
issubclassr	   r   r{   r  r|   r   )r&  
input_typerq  r   r>  r"   r"   r#   test_callable_analyzer_error  s   
"r;  )marksrr   c                 C   s
   t | dS )Nr)openr4  r"   r"   r#   r*    s   
 r*  c                 C   r*   r   )readr4  r"   r"   r#   r*    r+  r:  c                 C   sL   dg}t ttf | ||d| W d    d S 1 sw   Y  d S )Nr7  r8  )r{   r|   FileNotFoundErrorAttributeErrorr   )r&  rr   r:  r>  r"   r"   r#   &test_callable_analyzer_change_behavior  s   "rB  c                 C   s|   dd }t |trtrtd | d}|d tjtdd ||dd		|g W d    d S 1 s7w   Y  d S )
Nc                 S   s   t d)Ntesting)	Exceptionr4  r"   r"   r#   rr     r.   z6test_callable_analyzer_reraise_error.<locals>.analyzerr
  zfile.txtzsample content
rC  r   rU   r8  )
r9  r	   r   r{   r  joinwriter|   rD  r   )tmpdirr&  rr   fr"   r"   r#   $test_callable_analyzer_reraise_error  s   


"rI  zjstop_words, tokenizer, preprocessor, ngram_range, token_pattern,analyzer, unused_name, ovrd_name, ovrd_msgr  r  r  rv   z'stop_words'
'analyzer'	!= 'word'c                 C   r*   r   r+   r    r"   r"   r#   r*    r+  z'tokenizer'c                 C   r*   r   r+   r    r"   r"   r#   r*  &  r+  \w+rl   'token_pattern'zis not Nonec                 C   r*   r   r   r    r"   r"   r#   r*  2  r+  c                 C   r*   r   rN  r    r"   r"   r#   r*  5  r+  z'preprocessor'zis callablern   c                 C   r*   r   rN  r    r"   r"   r#   r*  @  r+  z'ngram_range')	NNNr  rL  rv   rM  rJ  rK  c
                 C   sl   t }
|  }|j||||||d d|||	f }tjt|d ||
 W d    d S 1 s/w   Y  d S )N)r   rg   r\   rs   r   rr   z-The parameter %s will not be used since %s %sr   )r   r   r{   r   r   r   )r?   r   rg   r\   rs   r   rr   unused_name	ovrd_nameovrd_msgr%  r   r   r"   r"   r#   test_unused_parameters_warn  s$   X"rR  zVectorizer, Xro   rp   )r   bar)r   bazc                 C   s0   |  }t |dr
J || t |drJ d S )Nn_features_in_)r!  r   )r?   r   r   r"   r"   r#   test_n_features_ins  s   	
rV  c                  C   s:   t dd} | ddgj}| ddgj}||ksJ d S )Nro   rW  helloworld)r
   r   r   )r  vocab1vocab2r"   r"   r#   )test_tie_breaking_sample_order_invariance  s   
r[  c                  C   sP   t ddt} d}tjt|d |   W d    d S 1 s!w   Y  d S )Nr  r  z&get_feature_names is deprecated in 1.0r   )r
   r   r   r{   r   FutureWarningr   )r   r   r"   r"   r#   !test_get_feature_names_deprecated  s
   
"r]  c                  C   s.   t ddd} | dgj}|d dksJ d S )Ni@B )rp   rx   )r;  rs   z22pcs efuturer   )r	   r   r  )hashingr  r"   r"   r#   2test_nonnegative_hashing_vectorizer_result_indices  s   r_  )collections.abcr   r  r{   r   r  r   sklearn.feature_extraction.textr   r   r   r	   r
   r   r   r   sklearn.model_selectionr   r   r   sklearn.pipeliner   sklearn.svmr   sklearn.baser   r   r"  numpy.testingr   r   sklearn.utilsr   sklearn.utils._testingr   r   r   r   collectionsr   	functoolsr   r  ior   r   r  r   r$   r)   r-   r0   r=   r>   markparametrizerk   rt   r   r   r   r   r   r   r   r   r   r   r   filterwarningsr   r   r   r  r	  r  r  r7  r9  rE  rP  rV  rc  rj  rm  r   	TypeErrorrr  ry  r|  r  r  r  r  r  r  r  rh   r$  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  rw  r{  r  r  int32r  r	  r  r  r  r%  r5  r@  rA  r;  paramrB  rI  rR  rV  r[  r]  r_  r"   r"   r"   r#   <module>   s   	$
=

g

'K





$'




	





	







H!

