o
    tBh!                     @   s  d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlZddlmZmZ ddlmZmZmZmZ ded	ed
efddZded	ed
ejfddZdeeef fddZ	ddedee dee de	e
 d
e
f
ddZdededeeef d
e
fddZ			dddZdS )     N)OrderedDict)	Generator)AnyDictListOptionalTuple   )ArffSparseDataTypeArffContainerType)_chunk_generatorcheck_pandas_supportget_chunk_n_rowsis_scalar_nan	arff_datainclude_columnsreturnc                 C   s   t  t  t  f}dd t|D }t| d | d | d D ] \}}}||v r=|d | |d | |d ||  q|S )a  
    obtains several columns from sparse arff representation. Additionally, the
    column indices are re-labelled, given the columns that are not included.
    (e.g., when including [1, 2, 3], the columns will be relabelled to
    [0, 1, 2])

    Parameters
    ----------
    arff_data : tuple
        A tuple of three lists of equal size; first list indicating the value,
        second the x coordinate and the third the y coordinate.

    include_columns : list
        A list of columns to include.

    Returns
    -------
    arff_data_new : tuple
        Subset of arff data with only the include columns indicated by the
        include_columns argument.
    c                 S      i | ]\}}||qS  r   .0	array_idx
column_idxr   r   t/var/www/html/riverr-enterprise-integrations-main/venv/lib/python3.10/site-packages/sklearn/datasets/_arff_parser.py
<dictcomp>+       
z)_split_sparse_columns.<locals>.<dictcomp>r      r	   )list	enumeratezipappend)r   r   arff_data_newreindexed_columnsvalrow_idxcol_idxr   r   r   _split_sparse_columns   s   "r&   c           	      C   s~   t | d d }|t|f}dd t|D }tj|tjd}t| d | d | d D ]\}}}||v r<||||| f< q+|S )Nr   c                 S   r   r   r   r   r   r   r   r   =   r   z)_sparse_data_to_array.<locals>.<dictcomp>dtyper   r	   )maxlenr   npemptyfloat64r   )	r   r   num_obsy_shaper"   yr#   r$   r%   r   r   r   _sparse_data_to_array6   s   "r1   featurec                 C   s^   | d dkrt S | d dkrdS | d dks| d dv rtjS | d dkr(tjS td	| )
z)Map feature to dtype for pandas DataFrame	data_typestringnominalcategorynumber_of_missing_values0)numericrealintegerzUnsupported feature: {})objectr+   r-   int64
ValueErrorformat)r2   r   r   r   _feature_to_dtypeH   s   r@   arffcol_slice_xcol_slice_yshapec                 C   s  | d }t |trG|du rtd|d dkrd}n|d |d  }tjtj|d|d}|j| }|dd|f }|dd|f }||fS t |t	rt
||}	t|d d }
|
t|f}tjj|	d |	d |	d	 ff|tjd
}| }t||}||fS td)af  
    converts the arff object into the appropriate matrix type (np.array or
    scipy.sparse.csr_matrix) based on the 'data part' (i.e., in the
    liac-arff dict, the object from the 'data' key)

    Parameters
    ----------
    arff : dict
        As obtained from liac-arff object.

    col_slice_x : list
        The column indices that are sliced from the original array to return
        as X data

    col_slice_y : list
        The column indices that are sliced from the original array to return
        as y data

    Returns
    -------
    X : np.array or scipy.sparse.csr_matrix
    y : np.array
    dataNz6shape must be provided when arr['data'] is a Generatorr   r   r-   )r(   countr	   )rD   r(   z(Unexpected Data Type obtained from arff.)
isinstancer   r>   r+   fromiter	itertoolschainfrom_iterablereshapetupler&   r)   r*   scipysparse
coo_matrixr-   tocsrr1   )rA   rB   rC   rD   r   rG   rE   Xr0   arff_data_Xr.   X_shaper   r   r   _convert_arff_dataZ   s6   




rV   columnsfeatures_dictc                    s  t d}t| d }t|}t| d tstdt| d }|j|g|d}|jdd	 }t
|}	 fdd	|D }
g }|||
  t| d |	D ]}||j||d|
  qL|j|dd
}|
D ]'}t|| }|dkrdd	 || D }|jj|}|| j|dd||< qd|fS )au  Convert the ARFF object into a pandas DataFrame.

    Parameters
    ----------
    arff : dict
        As obtained from liac-arff object.

    columns : list
        Columns from dataframe to return.

    features_dict : dict
        Maps feature name to feature info from openml.

    Returns
    -------
    result : tuple
        tuple with the resulting dataframe
    zfetch_openml with as_frame=True
attributesrE   zAarff['data'] must be a generator when converting to pd.DataFrame.rW   T)deepc                    s   g | ]}| v r|qS r   r   )r   colrZ   r   r   
<listcomp>   s    z0_convert_arff_data_dataframe.<locals>.<listcomp>)ignore_indexr6   c                 S   s    g | ]}|d urt |s|qS N)r   )r   catr   r   r   r]      s
    Fcopy)r   r   r   rH   r   r>   next	DataFramememory_usagesumr   r    r   concatr@   apitypesCategoricalDtypeastype)rA   rW   rX   pdrY   arff_columns	first_rowfirst_df	row_bytes	chunksizecolumns_to_keepdfsrE   dfcolumnr(   cats_without_missingr   rZ   r   _convert_arff_data_dataframe   s4   rw   c                    s  |dkr0d   }t | ||\}	|	  }
tdkr |	 netdkr-|	d  nXd nUd }	t| |||\}
 fdd| d D fdd	D }|sSnt|rgtfd
dtD nt|rotdj	d dkr|
dn	j	d dkrd |
|	fS )Npandasr	   r   r   c                    s,   i | ]\}}t |tr|  v r||qS r   )rH   r   )r   kv)data_columnstarget_columnsr   r   r      s    z%_liac_arff_parser.<locals>.<dictcomp>rY   c                    s   h | ]}| v qS r   r   )r   col_name)nominal_attributesr   r   	<setcomp>   s    z$_liac_arff_parser.<locals>.<setcomp>c              
      sJ   g | ]!\}}t t j |d ddd||d f jtddqS )Or'   Nr   Fra   )r+   takeasarraypoprk   int)r   ir}   )r~   r0   r   r   r]      s     z%_liac_arff_parser.<locals>.<listcomp>zAMix of nominal and non-nominal targets is not currently supported)rF   )rw   r*   rV   allr+   hstackr   anyr>   rD   rM   )arff_containeroutput_arrays_typerX   r{   r|   rB   rC   rD   rW   framerS   is_classificationr   )r{   r~   r|   r0   r   _liac_arff_parser   sF   


	r   r_   )NNN) rJ   collectionsr   collections.abcr   typingr   r   r   r   r   numpyr+   scipy.sparserO   externals._arffr
   r   utilsr   r   r   r   r&   ndarrayr1   strr@   r   rV   rw   r   r   r   r   r   <module>   s`    
$

=

A