o
    tBh                    @   s  d dl Z d dlZd dlmZ d dlZd dlmZ d dlm	Z	 d dlm
Z
 d dlmZ d dlmZ d dlmZ d d	lmZ d
d Zejdddgdd Zdd Zejdddgdd Zejdejejejgejdejejejgdd Zejdejejejgdd Zejdejdddgdd  Zejdejdddgd!d" Zd#d$ Zd%d& Z ejjd'g d(g d)ge!g d*g d+gej!g d,g d-ge"d.ej!g d/d0ej#d1gge"d.ej!g d/d0e$d2d1gge"d.ej!g d3g d4ge"d.ej!g d5d0ej#dgge"d.ej!g d5d0e$d2dgge"d.gg d6d7d8d9 Z%ejdddgejd:d;d<gejd=dd>gd?d@ Z&ejd:d;d<gejdAdBdCgdDdCgdBdCggg dEg dFg dEgfdGd0gdHd0gdIdJgdHd0ggg dKg dLg dMgfgdNdO Z'dPdQ Z(ejdejdddgejd=g dRejdSg dRdTdU Z)ejdVdWdXgejd'dDdBge!dYdZggd[d\ Z*ejdVdWdXgd]d^ Z+ejjd_d`dCgdadCggd`dagdCggej,fe!dDdBgdbdBggdDdbgdBggej-fej!dcd1gddd1gge"d.dcddgd1ggej,fe!dcd1gddd1ggdcddgd1ggej.fe!dDdBgej#dBggdDej#gdBggej/fej!dcej#gdej#gge"d.dcdgej#ggej,fej!dce$d2gde$d2gge"d.dcdge$d2ggej,fgg ded7dfdg Z0ejdddgejjdhej!d0dJgge"d.j1ej!d0digge"d.j1g djgej,fej!dDdBggdkd.j1ej!dDdlggdkd.j1g dmgej2fej!d0dJgge"d.j1ej!d0digge"d.j1e!g djgej,fej!dd0gge"d.j1ej!ddJgge"d.j1g dnge"fej!d0dJgge"d.j1ej!d0ej#gge"d.j1g doge"fej!d0dgge"d.j1ej!d0ej#gge"d.j1g dpge"fej!d0ej#gge"d.j1ej!d0dgge"d.j1d0ej#dqgge"fgg drd7dsdt Z3dudv Z4dwdx Z5dydz Z6ejdejdddgejjd{d>d|d}gfd~g dfg dddgfgg dd7dd Z7dd Z8ejjd'g d)g d(ge!g dg dgej!g d-g d,ge"d.gg dd7dd Z9ejjdhej!d0dJgge"d.j1ej!d0digge"d.j1g djgej,fej!dDdBggdkd.j1ej!dDdlggdkd.j1g dmgej2fej!d0dJgge"d.j1ej!d0digge"d.j1e!g djgej,fgg dd7dd Z:dd Z;dd Z<ejde$e=gdd Z>ejdddie?dfddie?dfddde?dfddDde@dfddie@dfgdd ZAdd ZBdd ZCdd ZDdd ZEdd ZFdd ZGejdej#de$d2gdd ZHejddgdggd=didfg d)g d(g dgd=ejIdJe"d.idfg d)g d(g dgd=g didfgdd ZJejd=d`dbgg dgdd ZKejjdd<d;gddgd7ejjd=d>g dâgd>dgd7ddƄ ZLejdeegddɄ ZMejdejdddBiddiddidBddМdlddМgejddg dԢggddք ZNejdejd=d~d>dJggdd؄ ZOejd=d0gdiggddڄ ZPejdejdddbiddiddiddiddidbddМdlddМgdd ZQejd=d>dJggdd ZRejd=d0gdiggdd ZSdd ZTejddbdDdМddligdd ZUdd ZVdd ZWdd ZXejddd ZYdd ZZejdddDdgdd Z[ejddBdbdgdd Z\ejdddidfddidfddidfgdd Z]d d Z^ejdg dejdg ddd Z_ejdejdddgejdej#dgdd	 Z`d
d Zaejdddgejdddgdd Zbejdddgdd Zcejdddgdd Zdejdddgdd Zedd Zfejdej#dgdd Zgejdddgejdej#dgdd Zhejjdhej!d0ej#gge"d.j1ej!d0dJgge"d.j1ej!d0ej#dige"d.gej,fej!d0ej#gge"d.j1ej!d0dJgge"d.j1ej!d0ej#dige"d.gej,fej!dej#ggejd.j1ej!dYggejd.j1e!ddZej#ggejfgg dd7d d! Ziejd"e!d#ej#dYggj1e!d$ej#d#ggj1e!dZggfe!g d%gj1e!g d&gj1e!ej#ggfej!d'ej#dJgge"d.j1e!d#ej#d$ggj1ej!digge"d.fej!g d(ge"d.j1e!g d)gj1ej!ej#gge"d.fgd*d+ Zjd,d- Zkd.d/ Zlejd0d1ddggej!d1ddggd2d.ej!d1ddggd3d.gejd4dcddggej!dcddggd2d.ej!dcddggd3d.gd5d6 Zmd7d8 Znd9d: Zod;d< Zpejd=d<d;gd>d? ZqdS (@      Nsparse)NotFittedError)assert_array_equal)assert_allclose)_convert_container)is_scalar_nan)OneHotEncoder)OrdinalEncoderc                  C   s   t g dg dg} t }tdd}|| }|| }|jdks$J |jdks+J t|s2J t|r9J t| g dg dg t| | d S )N         r   r   r   Fr   r      )              ?r   r   r   )r   r   r   r   r   )	nparrayr	   fit_transformshaper   issparser   toarray)X
enc_sparse	enc_denseX_trans_sparseX_trans_dense r   /var/www/html/riverr-enterprise-integrations-main/venv/lib/python3.10/site-packages/sklearn/preprocessing/tests/test_encoders.py!test_one_hot_encoder_sparse_dense   s   


r!   handle_unknownignoreinfrequent_if_existc                 C   s  t g dg dg dg}t g dg}tdd}|| tjtdd || W d    n1 s7w   Y  t| d}|| | }t	||
 t g d	g t|| td
d}tjtdd || W d    d S 1 s|w   Y  d S )N)r   r   r   )r   r   r   )r   r   r   )   r   r   errorr"   Found unknown categoriesmatch)r   r   r   r   r   r   r   42zhandle_unknown should be one of)r   r   r	   fitpytestraises
ValueError	transformcopyr   r   r   r"   r   X2oh	X2_passedr   r   r    #test_one_hot_encoder_handle_unknown(   s&   





"r6   c                  C   sb   t dgdgg} tddgd}d}tjt|d ||  W d    d S 1 s*w   Y  d S )Nab
categorieszqThis OneHotEncoder instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.r)   )r   r   r	   r-   r.   r   r0   r   encmsgr   r   r    test_one_hot_encoder_not_fittedE   s   "r>   c                 C   sx   t g dd}t ddgd}t| d}|| | }t|| t g dg dg t|| d S )N)11111111223334444)r   55555r@   r'   )r   r   r   r   r   r   r   r   )	r   r   reshaper	   r,   r1   r   r0   r   r2   r   r   r    +test_one_hot_encoder_handle_unknown_stringsQ   s   

rG   output_dtypeinput_dtypec                 C   s   t jddgg| dj}t jddgddgg|d}td|d}t|| | t||| | td|dd}t||| t|||| d S )Nr   r   dtypeauto)r:   rK   F)r:   rK   r   )	r   asarrayTr	   r   r   r   r,   r0   )rI   rH   r   
X_expectedr4   r   r   r    test_one_hot_encoder_dtyped   s   rP   c                 C   s   t d}|ddgddgd}tjg dg dg| d	}t| d	}t|| | t|	|
| | t| d
d}t||| t|	|
|| d S )Npandasr7   r8   r   r   ABr   r   r   r   r   r   r   r   rJ   F)rK   r   )r-   importorskip	DataFramer   r   r	   r   r   r   r,   r0   )rH   pdX_dfrO   r4   r   r   r    !test_one_hot_encoder_dtype_pandass   s   

r[   zignore::FutureWarning:sklearn	get_namesget_feature_namesget_feature_names_outc                 C   s   t  }g dg dg dg dg}|| t||  }| dkr(t|tjs(J tg d| |g d}t|| g d}tg d| tj	t
d	d
 t|| ddg W d    d S 1 sbw   Y  d S )N)Maler   girlr   r   )Female)   r`   r   
   )r_   3   boy   r   )r_   [   r`         r]   )	x0_Femalex0_Malex1_1x1_41x1_51x1_91x2_boyx2_girlx3_1x3_2x3_12x3_21x4_3x4_10x4_30)onetwothreefourfive)
one_Femaleone_Maletwo_1two_41two_51two_91	three_boy
three_girlfour_1four_2four_12four_21five_3five_10five_30z!input_features should have lengthr)   ry   rz   )r	   r,   getattr
isinstancer   ndarrayr   r]   r-   r.   r/   )r\   r<   r   feature_namesfeature_names2r   r   r    "test_one_hot_encoder_feature_names   s.   
"r   c                 C   sd   t  }tjddggtdj}|| t||  }tddg| t|| dgd}tdd	g| d S )
Nu   c❤t1dat2rJ   u	   x0_c❤t1x0_dat2u   n👍me)input_featuresu   n👍me_c❤t1u   n👍me_dat2)r	   r   r   objectrN   r,   r   r   )r\   r<   r   r   r   r   r    *test_one_hot_encoder_feature_names_unicode   s   
r   c                  C   s   t ddggj} t }|jg dgd | d g dgks"J ||  jdks.J |jg dgd ||  jdksCJ d S )	Nr   r   )r   r   r   r   r9   r:   )r   r%   )r   r   r   r   r%   r   )	r   r   rN   r	   
set_params
get_paramsr   r   r   )r   r4   r   r   r    test_one_hot_encoder_set_params   s   r   c                 C   sN   t dd}|| }t ddd}|| }t| | t|s#J | S )NrL   r9   Fr:   r   )r	   r   r   r   r   isspmatrix_csr)r   r<   Xtr1Xtr2r   r   r    check_categorical_onehot   s   


r   r   defr   7   abcr   r   )rc   r   r   )r   r   r   )r8   rS   cat)r7   rT   r   rJ   )r8   r   r   r7   r   nan)Nr   r   )r7   r   r   )Nr   N)mixednumericr   z	mixed-nanzmixed-float-nanz
mixed-Nonezmixed-None-nanzmixed-None-float-nan)idsc                 C   s   t t| d d dgf }t|ddgddgg t t| d d ddgf }t|g dg dg tdd| }t| g dg dg d S )	Nr   r   )r   r   r   r   r   r   r   r   rL   r9   )r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r	   r   r   )r   Xtrr   r   r    test_one_hot_encoder   s   r   sparse_FTdropfirstc                 C   s  g dg dg dg}t ||d}||}tj|td}t||| ddgddgd	dgg}t |d
|d}||}t|}t||| |d u rg dg dg dg}t || ddgddgg dgd}||}tj|td}d |d< t||| ddgddgd	dgg}t |ddgddgg| d}||}tj|td}d |d< d |d d df< t||| tg dg dg}td}t	j
t|d || W d    d S 1 sw   Y  d S )Nr   r   )r   r   r   r   r   rJ   r   r   r   r   rL   )r   r:   r   r   r   )6   r   8   )r   r"   r:   )r   r   r   r   )r   r:   r"   r   r   r   r   r   r   )Shape of the passed X data is not correctr)   )r	   r   r   r   r   r   inverse_transformreescaper-   r.   r/   )r"   r   r   r   r<   X_trexpr=   r   r   r    test_one_hot_encoder_inverse  sF   





"r   z
X, X_transr   r   r   r   r   r   r   ry   rz   r{   r8   r   r   r   r   r   )r   r   r   r   r   )r   r   r   r   r   c                 C   s`   t |d| }d}|rt|d}tjt|d || W d   dS 1 s)w   Y  dS )zCheck that `inverse_transform` raise an error with unknown samples, no
    dropped feature, and `handle_unknow="error`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/14934
    r   zqSamples \[(\d )*\d\] can not be inverted when drop=None and handle_unknown='error' because they contain all zerosr   r)   N)r	   r,   r   r-   r.   r/   r   )r   X_transr   r<   r=   r   r   r    ?test_one_hot_encoder_inverse_transform_raise_error_with_unknown>  s   
"r   c                  C   sJ   t jddgddgddggtd} tddd	}|| }t|||  d S )
Nr_   r   ra   r   r   rJ   	if_binaryFr   r   )r   r   r   r	   r   r   r   )r   oher   r   r   r    &test_one_hot_encoder_inverse_if_binary^  s    
r   )r   r   N
reset_dropc                 C   s   t jddgddgddggtd}t|dd}|| ||}t||  }|j|d	 t|	|| t
||| tt||  | d S )
Nr_   r   ra   r   r   rJ   Fr   r   )r   r   r   r	   r,   r0   r   r   r   r   r   )r\   r   r   r   r   r   r   r   r   r    test_one_hot_encoder_drop_resetg  s    

r   methodr,   r         @      @c                 C   sL   t  }d}tjt|d t|||  W d    d S 1 sw   Y  d S )N'Expected 2D array, got 1D array insteadr)   )r	   r-   r.   r/   r   )r   r   r4   r=   r   r   r    test_X_is_not_1Dw  s
   "r   c                 C   sd   t d}|g d}t }d}t jt|d t|| | W d    d S 1 s+w   Y  d S )NrQ   )   r   r%   r   r   r)   )r-   rW   Seriesr	   r.   r/   r   )r   rY   r   r4   r=   r   r   r    test_X_is_not_1D_pandas  s   
"r   zX, cat_exp, cat_dtyper   r   r   rS   rT   )r   r   r   stringzmissing-floatzmissing-np.nan-objectzmissing-float-nan-objectc                 C   s   | | d d d fD ]Q}t dd}|| t|jtsJ t|j|D ]6\}}| }t|d rHt|d s9J |d d |d d ksGJ n| |ksPJ t	|j
|sYJ q#q	d S )NrC   rL   r9   )r	   r,   r   categories_listziptolistr   r   
issubdtyperK   )r   cat_exp	cat_dtypeXir<   resr   res_listr   r   r    test_one_hot_encoder_categories  s   #

r   zX, X2, cats, cat_dtypedr7   r8   cint64r%   r   r   r   )Nr7   z)r7   r8   r   )r7   Nr   r   )r   r   zobject-stringzobject-string-nonezobject-string-nanzobject-None-and-nanzobject-nan-and-Nonec                 C   s  t |d}tg dg dg}t||  | t|jd t|d ks)J |jd 	 t|d ks8J |jd j
|ksBJ t |d}tjtdd || W d    n1 s^w   Y  t ||d}tg dg dg}t||| | d S )	Nr9   r   r   r   r   r   r   r   r(   r)   r:   r"   )r   r   r   )r	   r   r   r   r   r   r   r:   r   r   rK   r-   r.   r/   r,   r0   )r   r3   catsr   r"   r<   r   r   r   r    )test_one_hot_encoder_specified_categories  s   
:
r   c                  C   sd  t jddggtdj} tg dgd}t g dg dg}t|| |  | t|	|  | |j
d  g dksBJ t |j
d jt jsOJ t d	d
ggj} tg dgd}d}tjt|d |	|  W d    n1 syw   Y  t d	d
t jggj} td	t jd
ggd}tjt|d |	|  W d    d S 1 sw   Y  d S )Nr7   r8   rJ   )r8   r7   r   r9   r   r   r   r   r   )r   r   r   z%Unsorted categories are not supportedr)   )r   r   r   rN   r	   r   r,   r0   r   r   r   r   r   rK   object_r-   r.   r/   r   )r   r<   r   r=   r   r   r    (test_one_hot_encoder_unsorted_categories  s$   "r   c                  C   s   t jddgddggtdj} tg dg dgd}t g d	g d
g}t||  | |jd 	 g dks;J t 
|jd jt jsHJ |jd 	 g dksUJ t 
|jd jt jsbJ d S )Nr7   r8   r   r   rJ   r   )r   r   r   r9   )r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   rN   r	   r   r   r   r   r   r   rK   r   r   r<   r   r   r   r    7test_one_hot_encoder_specified_categories_mixed_columns#  s   r   c                  C   sD   t d} | ddgddgd}t|}t|g dg dg d S )	NrQ   r7   r8   r   r   rR   rU   rV   )r-   rW   rX   r   r   )rY   rZ   r   r   r   r    test_one_hot_encoder_pandas0  s   
r   zdrop, expected_namesx0_cx2_br   )r   x1_2r   )r   r   r8   x0_bx2_a)r   binarymanualc                 C   sV   g dg dg}t |d}|| t||  }| dkr$t|tjs$J t|| d S )N)r   r   r7   )r8   r   r8   r   r]   )r	   r,   r   r   r   r   r   )r\   r   expected_namesr   r   r   r   r   r    'test_one_hot_encoder_feature_names_drop:  s   

r   c                  C   s   ddgddgddgg} t g dg dg dg}t d d	g}td
dd}|| }t|j| t|| ddgddgddgg} t ddgddgddgg}t d	d g}td
dd}|| }t|j| t|| d S )Nrc   yes   nori   )r   r   r   r   rE   )r   r   r   r   r   r   Fr   truer7   falser   r   )r   r   r	   r   r   	drop_idx_r   )r   expectedexpected_drop_idxr   resultr   r   r    *test_one_hot_encoder_drop_equals_if_binaryP  s    


r   )rc   r   r   )r   r   r   )r   r   r   c                 C   sT   t  }tjg dg dgdd}t|| |d t dd}t|| | d S )Nr   r   r   r   r   r   r   rJ   float64)r
   r   r   r   r   astyper   r   r   r    test_ordinal_encoderh  s
   

r  )r   r   zobject-string-catc                 C   s   t |d}tdgdgg}t|| | t|jd t|d ks%J |jd  t|d ks4J |jd j	|ks>J t |d}t
jtdd || W d    d S 1 s[w   Y  d S )Nr9   r   r   r   r(   r)   )r
   r   r   r   r   r   r:   r   r   rK   r-   r.   r/   r,   )r   r3   r   r   r<   r   r   r   r    )test_ordinal_encoder_specified_categoriesy  s   

"r  c                  C   s   g dg dg} t  }|| }tj| td}t||| tg dg dg}td}t	j
t|d || W d    d S 1 sGw   Y  d S )Nr   r   rJ   )r   r   r   r   rU   r   r)   )r
   r   r   r   r   r   r   r   r   r-   r.   r/   )r   r<   r   r   r=   r   r   r    test_ordinal_encoder_inverse  s   

"r  c                  C   s   t ddd} tjddgddgdd	ggtd
}tjddgddgddggtd
}| | | |}tjddgddgddggdd
}t|| | |}tjdd gd dgddggtd
}t|| d S )Nuse_encoded_valuer"   unknown_valuer7   xr8   yr   r   rJ   xyblar   r   r   r   )r
   r   r   r   r,   r0   r   r   )r<   X_fitr   X_trans_encr   X_trans_invinv_expr   r   r    +test_ordinal_encoder_handle_unknowns_string  s     

 

 r  rK   c                 C   s   t ddd}tjddgddgdd	gg| d
}tjddgddgddgg| d
}|| ||}tjddgddgddggdd
}t|| ||}tjdd gd dgddggtd
}t|| d S )Nr  r  r      r      r   	   rJ   rf      r   r   )r
   r   r   r,   r0   r   r   r   )rK   r<   r  r   r  r   r  r  r   r   r    ,test_ordinal_encoder_handle_unknowns_numeric  s     

 

 r  zparams, err_type, err_msgr  zbunknown_value should be an integer or np.nan when handle_unknown is 'use_encoded_value', got None.r  r  zTunknown_value should only be set when handle_unknown is 'use_encoded_value', got -2.r  r  zaunknown_value should be an integer or np.nan when handle_unknown is 'use_encoded_value', got bla.zhThe used value for unknown_value (1) is one of the values already used for encoding the seen categories.zKhandle_unknown should be either 'error' or 'use_encoded_value', got ignore.c                 C   sf   t jddgddggtd}tdi | }tj||d || W d    d S 1 s,w   Y  d S )Nr7   r	  r8   r
  rJ   r)   r   )r   r   r   r
   r-   r.   r,   )paramserr_typeerr_msgr   encoderr   r   r    *test_ordinal_encoder_handle_unknowns_raise  s
   %"r  c                  C   s`   t dtjd} tdgdgdgg}| | | dgdgdgg}t|dgdgtjgg d S )Nr  r  r   r   r   r%   r   )r
   r   r   r   r,   r0   r   )r<   r  r   r   r   r    (test_ordinal_encoder_handle_unknowns_nan  s
   
r  c                  C   sd   t dtjtd} tdgdgdgg}tjtdd | | W d    d S 1 s+w   Y  d S )Nr  )r"   r  rK   r   r   r   z'dtype parameter should be a float dtyper)   )	r
   r   r   intr   r-   r.   r/   r,   )r<   r  r   r   r    8test_ordinal_encoder_handle_unknowns_nan_non_float_dtype  s   "r  c                  C   sj   t jg dgtdj} g d}t|d}d}tjt|d ||  W d    d S 1 s.w   Y  d S )N)LowMediumHighr!  r   rJ   )r   r!  r"  r9   z*Shape mismatch: if categories is an array,r)   )	r   r   r   rN   r
   r-   r.   r/   r,   )r   r   r<   r=   r   r   r    +test_ordinal_encoder_raise_categories_shape  s   
"r#  c                     sx  t ddtjg dg dgdd} tjddgd	d
ggddtjddgd	d
ggddtddgddggtddgddggtjddgd	dggddfD ]!   t fddtdD scJ t  |  qLddgd	d
gg   tfddtdD sJ t  |  ddgd	dgg   tfddtdD sJ t  |  d S )NrL   r9   )r   r   r   r   )r   r   r   r   r   rJ   r   r   r   r%   r   r7   r8   r   r      a   b   c   dr   c                    s   g | ]}j | j jkqS r   r   rK   .0ir   r<   r   r    
<listcomp>*  s    z'test_encoder_dtypes.<locals>.<listcomp>c                    s"   g | ]}t  j| jt jqS r   )r   r   r   rK   integerr)  r<   r   r    r-  /  s   " c                       g | ]
} j | jd kqS )r   r(  r)  r/  r   r    r-  4      )	r	   r   r   r,   allranger   r0   r   )r   r   r,  r    test_encoder_dtypes  s&   

 

r4  c                     s  t d} tddtjg dg dgdd}| jdd	gd
dgddgddd}| tfddtd	D s<J t	
| | | dd	gddgddgd}|d j|d j|d jg | t fddtd
D sxJ t	
| | d S )NrQ   rL   r9   )r   r   r   r   r   r   )r   r   r   r   r   r   r   rJ   r   r   r   r%   r   r   )rS   rT   Cr   c                    r0  )r   r(  r)  r/  r   r    r-  D  r1  z.test_encoder_dtypes_pandas.<locals>.<listcomp>r7   r8   r   r   rS   rT   r5  c                    s    g | ]}j | j | kqS r   r(  r)  X_typer<   r   r    r-  J  s     )r-   rW   r	   r   r   rX   r,   r2  r3  r   r0   r   rK   )rY   r   r   r   r6  r    test_encoder_dtypes_pandas8  s   

"

 r8  c                  C   s*   t  } ddgddgg}tj| j| d S )Nr_   r   ra   r   )r	   r   testingassert_no_warningsr   )r<   r   r   r   r    test_one_hot_encoder_warningN  s   r;  missing_valuec           	      C   sn  dddd| g}t |d}g dg ddddd| gg}|| }g dg d	g d
g}t|| |j|u s8J dd t|j|jD }||}t	j
|td}t|d rt|d d |d d  t|d skJ t|d ssJ t|d d d df |d d d df  t|dd df |dd df  t|d sJ t|d sJ d S t|| t|| d S )Nr   rf   r   r   r   )r   rf   r   r   r7   )r   rf   r   r   r7   )r   r   r   r   r   )r   r   r   r   r   r   c                 S   s   g | ]\}}|| qS r   r   )r*  r   featurer   r   r    r-  b  s    z4test_one_hot_encoder_drop_manual.<locals>.<listcomp>rJ   rC   )rC   rC   )r	   r   r   r   r   r   r   r   r   r   r   r   r   )	r<  cats_to_dropr<   r   transr   dropped_catsX_inv_transX_arrayr   r   r     test_one_hot_encoder_drop_manualT  s2   


*"
rC  zX_fit, params, err_msgr_   ra   secondz Wrong input for parameter `drop`r   r   ;   )ghir   rF  z&The following categories were supposedc                 C   sL   t di |}tjt|d ||  W d    d S 1 sw   Y  d S )Nr)   r   r	   r-   r.   r/   r,   )r  r  r  r<   r   r   r    #test_one_hot_encoder_invalid_paramsy  s   "rI  )r   r   rb   r7   c                 C   s^   t | d}d}tjt|d |g dg dg dg W d    d S 1 s(w   Y  d S )Nr   z-`drop` should have length equal to the numberr)   r   r   rE  rH  )r   r<   r  r   r   r    test_invalid_drop_length  s
   
"rJ  densityr   denser7   r   r8   r   c                 C   s   t | d}t | |d}g dg dg}|| || t|j|j |dkr/t|jd nt||j|jD ]\}}}|t| |ksFJ q7t|jtj	sPJ |jj
tksXJ d S )Nr   r   )r   r   r7   rM  r   r   )r	   r,   r   r   r   r   r  r   r   r   rK   r   )rK  r   ohe_baseohe_testr   drop_catdrop_idxcat_listr   r   r    test_categories  s   



rS  Encoderc                 C   s   d|    d v sJ d S )NcategoricalX_types)	_get_tags)rT  r   r   r    "test_encoders_has_categorical_tags  s   rX  kwargsmax_categoriesmin_frequency   g(\?r   )rZ  r[  rf   r:   rL   )r7   r8   r   r   c           
      C   s  t dgd dgd  dgd  dgd  gj}td|d	d
d| |}t|jg dg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t|| dd dgdgd  D }|	|}t|| |
 }	tddg|	 | }	tddg|	 dS )zpTest that different parameters for combine 'a', 'c', and 'd' into
    the infrequent category works as expected.r7   r   r8   r   r   rc   r   r   r$   F)r:   r"   r   )r7   r   r   er   r   c                 S      g | ]}|gqS r   r   r*  colr   r   r    r-        z2test_ohe_infrequent_two_levels.<locals>.<listcomp>infrequent_sklearnr%   r   x0_infrequent_sklearnNr   r   r   rN   r	   r,   r   infrequent_categories_r0   r   r   r]   r^   )
rY  r:   X_trainr   X_testr   r   expected_invX_invr   r   r   r    test_ohe_infrequent_two_levels  s,   2(



rj  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}t|jdg t dgdgg}||}tdgdgg| |	 }tdg| |
 }tdg| ||}tdgdgg| dS )z3Test two levels and dropping the frequent category.r7   r   r8   r   r   rc   r   r   r$   Fr   r"   r   rZ  r   r   r   rc  rb  N)r   r   rN   r	   r,   r   r   r0   r   r]   r^   r   )r   rf  r   rg  r   r   	X_inverser   r   r    ,test_ohe_infrequent_two_levels_drop_frequent  s    2

rm  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W d   dS 1 sAw   Y  dS )z_Test two levels and dropping any infrequent category removes the
    whole infrequent category.r7   r   r8   r   r   rc   r   r   r$   Fr   rk  Unable to drop category r   ( from feature 0 because it is infrequentr)   Nr   r   rN   r	   r-   r.   r/   r,   r   rf  r   r=   r   r   r    5test_ohe_infrequent_two_levels_drop_infrequent_errors  s   2"rr  r  gQ?g{Gz?r  c           	      C   s  t dgd dgd  dgd  dgd  gj}tdd	d
d| |}t|jddgg dgdgdgdgdgg}t g dg dg dg dg dg}||}t|| dgdgdgdgdgg}|	|}t|| |
 }tg d| | }tg d| dS )zkTest that different parameters for combing 'a', and 'd' into
    the infrequent category works as expected.r7   r   r8   r   r   rc   r   r   r$   Fr"   r   r]  r   r   r   r   r   rb  )r   r   rc  Nr   rd  )	rY  rf  r   rg  r   r   rh  ri  r   r   r   r     test_ohe_infrequent_three_levels  s2   2(



ru  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d|}t dgdgdgg}tddgddgddgg|| |jdd| d}tj	t
|d |dgdgg}W d   n1 sfw   Y  tddgddgg| dS )z5Test three levels and dropping the frequent category.r7   r   r8   r   r   rc   r   r   r$   Frk  r   r   r#   r'   r(   r)   r]  N)r   r   rN   r	   r,   r   r0   r   r-   warnsUserWarning)r   rf  r   rg  r=   r   r   r   r    .test_ohe_infrequent_three_levels_drop_frequent:  s   2"rx  c                 C   s   t dgd dgd  dgd  dgd  gj}td	d
d| d}d| d d}tjt|d || W d   dS 1 sAw   Y  dS )z7Test three levels and dropping the infrequent category.r7   r   r8   r   r   rc   r   r   r$   Frk  rn  r   ro  r)   Nrp  rq  r   r   r    7test_ohe_infrequent_three_levels_drop_infrequent_errorsO  s   2"ry  c                  C   s   t dgd dgd  dgd  dgd  gj} td	d
dd| }t|jddgg dgdgdgdgg}t g dg dg dg dg}||}t|| dgg}d}t	j
t|d || W d   dS 1 sow   Y  dS )zmTest that different parameters for combining 'a', and 'd' into
    the infrequent category works as expected.r7   r   r8   r   r   rc   r   r   r&   F)r"   r   rZ  r   rt  r   badz.Found unknown categories \['bad'\] in column 0r)   N)r   r   rN   r	   r,   r   re  r0   r   r-   r.   r/   )rf  r   rg  r   r   r=   r   r   r    (test_ohe_infrequent_handle_unknown_error\  s   2"

"r{  c                 C   s   t jdgd dgd  gtdj}tdg dgddd	| |}dgd
gdgdgdgg}t ddgddgddgddgddgg}||}t|| dddgg}dgdgg}|D ]}|j|d| tdgdgg|| qZdS )zG'a' is the only frequent category, all other categories are infrequent.r7   r   r]  ri   rJ   r   r   r7   r8   Fr$   r:   r   r"   r8   r   r   r   r   r   r   r   Nr   )	r   r   r   rN   r	   r,   r0   r   r   )rY  rf  r   rg  r   r   dropsr   r   r   r    5test_ohe_infrequent_two_levels_user_cats_one_frequentt  s(   "(

r  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jg dg dgdgdgdgdgg}t ddgddgddgddgddgg}||}t	|| dd dgdgd  D }|
|}t|| dS )zFTest that the order of the categories provided by a user is respected.r7   r   r8   r   r   rc   r   r   rJ   r|  Fr$   r   r:   r   r"   rZ  )r   r   r7   r]  r   r   c                 S   r^  r   r   r_  r   r   r    r-    ra  z<test_ohe_infrequent_two_levels_user_cats.<locals>.<listcomp>rb  r%   Nr   r   r   rN   r	   r,   r   re  r0   r   r   rf  r   rg  r   r   rh  ri  r   r   r    (test_ohe_infrequent_two_levels_user_cats  s*   *(


r  c                  C   s   t jdgd dgd  dgd  dgd  gtd	j} tg d
gdddd| }t|jddgg dgdgdgdgdgg}t g dg dg dg dg dg}||}t	|| dgdgdgdgdgg}|
|}t|| dS )zTest that the order of the categories provided by a user is respected.
    In this case 'c' is encoded as the first category and 'b' is encoded
    as the second one.r7   r   r8   r   r   rc   r   r   rJ   )r   r   r8   r7   Fr$   r  r]  r   rt  r   rb  Nr  r  r   r   r    *test_ohe_infrequent_three_levels_user_cats  s4   *(


r  c                  C   sb   t jg dg df } tdddd}||  ddgddgg}||}t|g d	g d
g dS )zaTest infrequent categories where feature 0 has infrequent categories,
    and feature 1 does not.	r   r   r   r   r   r   r   r   r   )	r   r   r   r   r   r   r   r   r   r   r   F)rZ  r   r   r   r   r   r   r   r   )r   r   r   r   N)r   c_r	   r,   r0   r   )r   r   rg  r   r   r   r    test_ohe_infrequent_mixed  s   

r  c            
   
   C   s  t jg dg dg df } tdddd}||  }t|jd d	d
g t|jd	 d	dg t|jd
 d dD ]}t|| }tg d| q:g dg dg dg dg dg dg dg dg dg	}t|| g dg dg}|	|}g dg dg}t||  |
|}t jg dg dgtd}	t|	| tdddd| }tjtdd  |	| W d   n1 sw   Y  g d!g d"g}|	|}g d#g dg}t||  |
|}t jg d$g d%gtd}	t|	| dS )&z?Test infrequent categories with feature matrix with 3 features.r  )	r   r   r   r   r   rc   r   r   r   )	r   r   r   r   r   r   r   r   r   rL   r   r$   r:   rZ  r"   r   r   r   rc   N)r]   r^   )x0_0x0_3rc  x1_0x1_5x1_infrequent_sklearnx2_0x2_1)r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   r   r   )r%   r   r   )r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   )r   rb  N)rb  r   NrJ   r&   r(   r)   )r   r   r   )r   rc   r   )r   r   r   r   r   r   r   r   )rb  rb  r   )r   rb  r   )r   r  r	   r   r   r   re  r   r   r0   r   r   r   r,   r-   r.   r/   )
r   r   r   r\   r   r   rg  X_test_transri  rh  r   r   r    'test_ohe_infrequent_multiple_categories  sr   






r  c            	   
   C   s  t d} | jg dg ddddgd}tdd	d
d}|| }t|jd ddg t|jd g d g dg dg dg dg dg dg dg dg dg	}t|| | jddgddgdddgd}g dg dg}|	|}t||  |
|}tjddgddggtd}t|| | jddgddgdddgd}|	| }g dg dg}t|| |
|}tjddgddggtd}t|| dS )zHTest infrequent categories with a pandas dataframe with multiple dtypes.rQ   )	r7   fr   r  r  r7   r   r8   r8   )	r   r   r   rc   rc   rf   r   r   r   )strr  r  r  columnsrL   r   r$   r  r   r7   r8   r   )r   r   rf   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   r   r   r     rf   rb  rJ   r   r   N)r-   rW   rX   r	   r   r   r   re  r   r0   r   r   r   r   )	rY   r   r   r   r   rg  r  ri  rh  r   r   r    .test_ohe_infrequent_multiple_categories_dtypes?  sV   
	
 


 

r  rh   )r[  rZ  c                 C   sp   t dgd dgd  dgd  dgd  gj}tdd	d
d| }|| |dgg}t|dgg dS ),All user provided categories are infrequent.r7   r   r8   r   r   rc   r   r   r$   Frs  r   Nr   )r   r   rN   r	   r,   r0   r   rY  rf  r   r   r   r   r    $test_ohe_infrequent_one_level_errors  s
   2
r  c                 C   sb   t jdgd gtdj}tdg dgddd| |}|dgdgg}t|d	gd	gg d
S )r  r]  r   rJ   r|  Fr$   r}  r7   r   Nr   )r   r   r   rN   r	   r,   r0   r   r  r   r   r    5test_ohe_infrequent_user_cats_unknown_training_errors  s   r  zkwargs, error_msgz%max_categories must be greater than 1rC   z)min_frequency must be an integer at leastg?c                 C   s   t dgd dgd  dgd  dgd  gj}tdd	d
i| }tjt|d || W d    d S 1 s:w   Y  d S )Nr7   r   r8   r   r   rc   r   r   r"   r$   r)   r   rp  )rY  	error_msgrf  r   r   r   r    ,test_ohe_infrequent_invalid_parameters_error  s
   2	"r  c                  C   sb   t jddggtdj} t | }d}tjt|d |	  W d    d S 1 s*w   Y  d S )Nr   dogrJ   z&get_feature_names is deprecated in 1.0r)   )
r   r   r   rN   r	   r,   r-   rv  FutureWarningr]   r;   r   r   r    1test_one_hot_encoder_get_feature_names_deprecated  s   
"r  zinput_dtype, category_dtype)OOOUUOUUUSSOSUSS
array_type)r   r   	dataframec           
      C   s   t jdgdgg| d}t jddg|dg}t|dd|}tdgdgdgdgg|| d}||}t ddgddgddgddgg}t|| t|d|}	|	|}t dgdgdgdgg}t|| d	S )
a"  Check that encoding work with object, unicode, and byte string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/15616
    https://github.com/scikit-learn/scikit-learn/issues/15726
    https://github.com/scikit-learn/scikit-learn/issues/19677
    r8   r7   rJ   Fr   r   r   r9   N)	r   r   r	   r,   r   r0   r   r
   r   )
rI   category_dtyper  r   r:   r   rg  r   r   oer   r   r    test_encoders_string_categories  s   
"

r  c                 C   sT   t jdd|d|ggtdj}tddd|}t||  }t|ddd	| g d S )
Nr7   r8   rJ   Fr#   r   r"   x0_ar   x0_)r   r   r   rN   r	   r,   r   r   )r\   r<  r   r   namesr   r   r    )test_ohe_missing_values_get_feature_names  s   r  c                  C   sr   t d} | jg dtjdddtjgtdddd	gd
}tg dg dg dg dg}t|}t|| d S )NrQ   )r  r   Nr   r   r   r%   rJ   )col1col2r  r  r  )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )r   r   r   r   r   r   r   )	r-   rW   rX   r   r   r   floatr   r   )rY   dfexpected_df_transr   r   r   r    %test_ohe_missing_value_support_pandas  s    
	r  pd_nan_typepd.NAznp.nanc              	   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}tg d	g d
g dg dg d
g}td|d}|	|}t
|| t|jdksMJ t|jd d d g d t|jd d sgJ d S )NrQ   r  r  r   r7   r8   categoryrJ   )r   r   r   r   )r   r   r   r   )r   r   r   r   r  Fr  r   r   rC   r   )r-   rW   NAr   r   rX   r   r   r	   r   r   lenr   r   isnan)r  r"   rY   pd_missing_valuer  r  r   df_transr   r   r    1test_ohe_missing_value_support_pandas_categorical  s(   



r  c                 C   s   ddgddgddgg}t dd| d}||}tg d	g d
g dg}t|| ddgg}tg d	g}d}tjt|d ||}W d   n1 sPw   Y  t|| |	|}t
|tjddggtd dS )zZCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during transform.r7   r   r8   r   r   r   Fr   r   r"   r   r   )r   r   r   r   r   tFound unknown categories in columns \[0, 1\] during transform. These unknown categories will be encoded as all zerosr)   NrJ   r	   r   r   r   r   r-   rv  rw  r0   r   r   r   r"   r   r   r   rO   rg  warn_msgri  r   r   r    /test_ohe_drop_first_handle_unknown_ignore_warns  s(   




r  c                 C   s   ddgddgddgg}t dd| d}||}tg d	g d
g dg}t|| ddgg}tg dg}d}tjt|d ||}W d   n1 sPw   Y  t|| |	|}t
|tjddggtd dS )zDCheck drop='if_binary' and handle_unknown='ignore' during transform.r7   r   r8   r   r   r   Fr  r  r   rU   r   r   )r   r   r   r   r  r)   NrJ   r  r  r   r   r    3test_ohe_drop_if_binary_handle_unknown_ignore_warns=  s(   




r  c                 C   s   ddgddgddgg}t dd| ddgddggd}|| d	dgg}tddgg}d
}tjt|d ||}W d   n1 sDw   Y  t|| dS )znCheck drop='first' and handle_unknown='ignore'/'infrequent_if_exist'
    during fit with categories passed in.r7   r   r8   r   r   r   F)r   r   r"   r:   r   zqFound unknown categories in columns \[0\] during transform. These unknown categories will be encoded as all zerosr)   N)	r	   r,   r   r   r-   rv  rw  r0   r   )r"   r   r   rg  rO   r  r   r   r   r    'test_ohe_drop_first_explicit_categories`  s    

r  c                  C   sn   t t jdddggj} tt jd}dt j }tjt|d |	|  W d   dS 1 s0w   Y  dS )zDTest ordinal encoder with nan passthrough fails when dtype=np.int32.r   r   rJ   zdThere are missing values in features \[0\]. For OrdinalEncoder to encode missing values with dtype: r)   N)
r   r   r   rN   r
   int32r-   r.   r/   r,   )r   r  r=   r   r   r    Btest_ordinal_encoder_passthrough_missing_values_float_errors_dtype{  s   "r  encoded_missing_valuec                 C   s   t jt jdddggt jdj}t| d|}t|jdks J t	|jd ddt jg |
|}t	|| gdgdgdgg ||}t	|| dS )	z.Test ordinal encoder with nan on float dtypes.r   r   rJ   r  r   r   r   N)r   r   r   r   rN   r
   r,   r  r   r   r0   r   )r  r   r  r   rl  r   r   r    5test_ordinal_encoder_passthrough_missing_values_float  s   

r  c              	   C   s   t d}| dkr|jntj}|d|jdd|ddgddi}t|d	|}t	|j
d
ks1J t|j
d dd g d t|j
d d sKJ ||}t|dgdg|gdgdgg ||}|jdkskJ t|dddf ddg t|dddf ddg t|d sJ dS )z0Check ordinal encoder is compatible with pandas.rQ   r  r  r   r7   r8   r  rJ   r  r   r   Nr   r   rC          @r   r   )r   r   r   r   )r-   rW   r  r   r   rX   r   r
   r,   r  r   r   r  r0   r   r   r   )r  r  rY   r  r  r  r  rl  r   r   r    =test_ordinal_encoder_missing_value_support_pandas_categorical  s"   


r  r  )zobject-None-missing-valuezobject-nan-missing_valueznumeric-missing-valuec                 C   s   t |d}tdgtjgg}t|| | |jd j|ks!J t |d}tj	t
dd || W d   dS 1 s>w   Y  dS )z.Test ordinal encoder for specified categories.r9   r   r   r(   r)   N)r
   r   r   r   r   r   r   rK   r-   r.   r/   r,   )r   r3   r   r   r  r   r   r   r    =test_ordinal_encoder_specified_categories_missing_passthrough  s   
&
"r  zX, expected_X_trans, X_testr   r   )r   r   r   )r   r  r   r   )r   r7   r8   )r  r   r   c                 C   s8   t ddd}|| }t|| t||dgg dS )z>Test the interaction between missing values and handle_unknownr  rC   r  g      N)r
   r   r   r0   )r   expected_X_transrg  r  r   r   r   r    /test_ordinal_encoder_handle_missing_and_unknown  s   

r  c                  C   s   t g dg dg} t| }t }d}tjt|d || W d   n1 s,w   Y  tjt|d |	| W d   n1 sHw   Y  |	| }t|}tjt|d |
| W d   dS 1 sow   Y  dS )zCheck that we raise proper error with sparse input in OrdinalEncoder.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19878
    r   r   z6A sparse matrix was passed, but dense data is requiredr)   N)r   r   r   
csr_matrixr
   r-   r.   	TypeErrorr,   r   r   )r   X_sparser  r  r   r   r   r   r    test_ordinal_encoder_sparse  s   


"r  c                  C   s   t g dddt jf } tg dgddd}||  tg dgdd}tjtd	d
 ||  W d   dS 1 s>w   Y  dS )zCheck OrdinalEncoder.fit works with unseen category when
    `handle_unknown="use_encoded_value"`.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    )r   r   r   r   r   r   N)rC   r   r   r  r  )r:   r"   r  r&   r   r(   r)   )r   r   newaxisr
   r,   r-   r.   r/   )r   r  r   r   r    -test_ordinal_encoder_fit_with_unseen_category*  s   
"r  rf  AAOUrg  c                 C   s4   t ddd}||  ||}t|ddgg dS )zChecks that `OrdinalEncoder` transforms string dtypes.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/19872
    r  ir  r   N)r
   r,   r0   r   )rf  rg  r<   r   r   r   r    1test_ordinal_encoder_handle_unknown_string_dtypes;  s   

r  c                  C   sb   t g ddd} t | }t|jt j| ddj |	| }t|dgdgdgdgg dS )	zCheck that `OrdinalEncoder` accepts Python integers that are potentially
    larger than 64 bits.
    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/20721
    )l   	HP
1& l   	H]viel   	 :?i}Ga l   IRK2e6krC   r   r   )axisr   r   N)
r   r   rF   r
   r,   r   r   sortrN   r0   )r   r  r   r   r   r    #test_ordinal_encoder_python_integerW  s   
r  c                  C   sH   t d} g d}| jg dg|d}t |}| }t|| dS )z-Check feature names out is same as the input.rQ   )r8   r   r7   r   r  N)r-   rW   rX   r
   r,   r^   r   )rY   r  r   r<   feature_names_outr   r   r    .test_ordinal_encoder_features_names_out_pandask  s   
r  c                  C   s   t jdgdgt jggtd} tdt jdd| }|| }t|dgdgdgg t jd	gt jggtd}||}t|t jgdgg d
S )zECheck interactions between encode_unknown and missing value encoding.r7   r8   rJ   r  )r"   r  r  r   r   r   N)r   r   r   r   r
   r,   r0   r   )r   r  r   rg  r  r   r   r    0test_ordinal_encoder_unknown_missing_interactionw  s   

r  with_pandasc                 C   s   t jddgddgdt jggtd}d}| r(td}|j|d	d
gd}|d }n|d }tdd}tjt	|d |
| W d   dS 1 sIw   Y  dS )zXCheck OrdinalEncoder errors when encoded_missing_value is used by
    an known category.r7   r  r8   r   r   rJ   zTencoded_missing_value \(1\) is already used to encode a known category in features: rQ   letterpetr  z	\['pet'\]z\[1\]r   r  r)   N)r   r   r   r   r-   rW   rX   r
   r.   r/   r,   )r  r   r  rY   r  r   r   r    0test_ordinal_encoder_encoded_missing_value_error  s   "


"r  )rr   numpyr   scipyr   r-   sklearn.exceptionsr   sklearn.utils._testingr   r   r   sklearn.utilsr   sklearn.preprocessingr	   r
   r!   markparametrizer6   r>   rG   r  float32r   rP   r[   filterwarningsr   r   r   r   r   r   r   r  r   r   r   r   r   r   r   r   r.  str_float_r   rN   r   r   r   r   r   r   r   r  r  r  r  r  r  r  r/   r  r  r  r#  r4  r8  r;  rC  rM   rI  rJ  rS  rX  rj  rm  rr  ru  rx  ry  r{  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r  r   r   r   r    <module>   s   




A


-*

	

 &&* 
!,8



	
	


#	
$







 

"


$

\A





	

#"		"
