o
    rhO                     @   s  d dl Zd dlT d dlZd dlZdd Zdd Zed fddZ	d1d
dZ
d2ddZeZd1ddZdd ZeZd3ddZdd Zdd ZeZd4ddZeZd4ddZG dd dZd5d d!ZG d"d# d#Zefd$d%Zd6d'd(ZG d)d* d*Zd+d, ZeZd-d. Ze Z!d7d/d0Z dS )8    N)*c                 C      t j| dd} | j\}}t j||fdd}t j||fdd}t }t||_t||_||_	||_
|  ||t|  |  ||fS )zPreturn k smallest values (and their indices) of the lines of a
    float32 arrayfloat32dtypeint64)npascontiguousarrayshapezerosfaissfloat_maxheap_array_tswig_ptridsvalnhkheapifyaddnreorderarrayr   mnIDha r   R/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/faiss/extra_wrappers.pykmin      


r   c                 C   r   )zOreturn k largest values (and their indices) of the lines of a
    float32 arrayr   r   r   )r   r	   r
   r   r   float_minheap_array_tr   r   r   r   r   r   r   r   r   r   r   r   kmax+   r    r"   c           	   
   C   s   t j| dd} t j|dd}| j\}}|j\}}||ksJ t j||fdd}|tkr<t||t| |t|t| |S |tkrK| |j |dd< |S t	||t| |t|||t| |S )zJcompute the whole pairwise distance matrix between two sets of
    vectorsr   r   N)
r   r	   r
   empty	METRIC_L2pairwise_L2sqrr   METRIC_INNER_PRODUCTTpairwise_extra_distances)	xqxbmetric
metric_argnqdnbd2disr   r   r   pairwise_distances=   s.   



r2   90  c                 C   $   t j| dd}tt||j| |S Nr   r   )r   r#   
float_randr   sizer   seedresr   r   r   randV      r;   c                 C   sD   t j| dd}|d u rtt||j| |S tt||j|| |S Nr   r   )r   r#   
int64_randr   r7   int64_rand_max)r   r9   vmaxr:   r   r   r   randint\   s   rA   c                 C   r4   r5   )r   r#   float_randnr   r7   r8   r   r   r   randnh   r<   rC   c                 C   sV   |  d} | jdkrt| jt| S | j\}}tj|dd}t||t| t| |S )z> compute a checksum for quick-and-dirty comparisons of arrays uint8   uint64r   )	viewndimbvec_checksumr7   r   r
   r   r   bvecs_checksum)ar   r.   csr   r   r   checksumn   s   


rM     c                 C   s(   t j| |fdd}t| |t|| |S r5   )r   r#   rand_smooth_vectors_cr   )r   r.   r9   r:   r   r   r   rand_smooth_vectorsz   s   rP   c              	   C   s   t j| dd} t j|dd}| jd }|jd |ksJ | jd |jd }}d}t|D ]}|t|t| | |t|| 7 }q-|S )z< size of intersection between each line of two result tablesr   r   r   rE   )r   r	   r
   rangeranklist_intersection_sizer   )I1I2r   k1k2ninterir   r   r   eval_intersection   s   
rY   c                 C   s    t | jd | jd t|  d S )NrE   r   )fvec_renorm_L2r
   r   xr   r   r   normalize_L2   s    r]   c                 C   s|   t j| dd} |du rt|  d }t j|d dd}t j| jdd}t| jt| 	d|t|t|| ||fS )a  Perform a bucket sort on a table of integers.

    Parameters
    ----------
    tab : array_like
        elements to sort, max value nbucket - 1
    nbucket : integer
        number of buckets, None if unknown
    nt : integer
        number of threads to use (0 = use unthreaded codepath)

    Returns
    -------
    lims : array_like
        cumulative sum of bucket sizes (size vmax + 1)
    perm : array_like
        perm[lims[i] : lims[i + 1]] contains the indices of bucket #i (size tab.size)
    r   r   NrE   rF   )
r   r	   intmaxr#   r7   bucket_sort_cr   r   rG   )tabnbucketntlimspermr   r   r   bucket_sort   s   rf   c                 C   sn   | j dks| j dksJ | j\}}|du rt|  d }tj|d dd}t||t| |t|| |S )a  Perform a bucket sort on a matrix, recording the original
    row of each element.

    Parameters
    ----------
    tab : array_like
        array of size (N, ncol) that contains the bucket ids, maximum
        value nbucket - 1.
        On output, it the elements are shuffled such that the flat array
        tab.ravel()[lims[i] : lims[i + 1]] contains the row numbers
        of each bucket entry.
    nbucket : integer
        number of buckets (the maximum value in tab should be nbucket - 1)
    nt : integer
        number of threads to use (0 = use unthreaded codepath)

    Returns
    -------
    lims : array_like
        cumulative sum of bucket sizes (size vmax + 1)
    int32r   NrE   r   )	r   r
   r^   r_   r   r#   matrix_bucket_sort_inplace_cr   r   )ra   rb   rc   nrowncolrd   r   r   r   matrix_bucket_sort_inplace   s   

rk   c                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )
ResultHeapz_Accumulate query results from a sliced dataset. The final result will
    be in self.D, self.I.Fc                 C   s~   t j||fdd| _t j||fdd| _||| _| _|r!t }nt }||_||_t	| j|_
t	| j|_|  || _dS )z
        nq: number of query vectors,
        k: number of results per query
        keep_max: keep the top-k maximum values instead of the minima
        r   r   r   N)r   r   r   r   r-   r   r!   r   r   r   r   r   r   heaps)selfr-   r   keep_maxrm   r   r   r   __init__   s   
zResultHeap.__init__c                 C   sd   |j \}}tj|dd}tj|dd}|j ||fksJ || jks#J | j|t|t|| dS )z
        Add results for all heaps
        D, I should be of size (nh, nres)
        D, I do not need to be in a particular order (heap or sorted)
        r   r   r   N)r
   r   r	   r-   rm   addn_with_idsr   )rn   r   r   r-   kdr   r   r   
add_result   s   
zResultHeap.add_resultc                 C   s   |j \}}|t|ksJ |jdkr|j |j ks%|jdkr#|j |fks%J tj|dd}tj|dd}tj|dd}|jdkrAdn|}| j|t||t|t|| dS )z
        Add results for a subset of heaps.
        D, I should hold resutls for all the subset
        as a special case, if I is 1D, then all ids are assumed to be the same
           rE   r   r   r   r   N)r
   lenrH   r   r	   rm   addn_query_subset_with_idsr   )rn   subsetr   r   nsubsetrr   	id_strider   r   r   add_result_subset  s   
zResultHeap.add_result_subsetc                 C   s   | j   d S N)rm   r   )rn   r   r   r   finalize     zResultHeap.finalizeNF)__name__
__module____qualname____doc__rp   rs   rz   r|   r   r   r   r   rl      s    
rl   Fc           	   	   C   s|   |j | j ksJ | j \}}}tj||f| jd}tj||f|jd}|r&tnt}||||t| t|t|t| ||fS )z
    Merge a set of sorted knn-results obtained from different shards in a dataset
    Dall and Iall are of size (nshard, nq, k) each D[i, j] should be sorted
    returns D, I of size (nq, k) as the merged result set
    r   )r
   r   r#   r   merge_knn_results_CMaxmerge_knn_results_CMinr   )	DallIallro   nshardr   r   DnewInewfuncr   r   r   merge_knn_results  s   r   c                   @   s$   e Zd Zdd Zdd Zdd ZdS )MapInt64ToInt64c                 C   sX   t t|| _|d| j ksJ d|| _tj|dfdd| _t| jt	| j d S )Nrt   zneed power of 2 capacityr   r   )
r^   r   log2log2_capacitycapacityr#   ra   r   hashtable_int64_to_int64_initr   )rn   r   r   r   r   rp   3  s
   zMapInt64ToInt64.__init__c                 C   s>   |j \}|j |fksJ t| jt| j|t|t| d S r{   )r
   r   hashtable_int64_to_int64_addr   r   ra   )rn   keysvalsr   r   r   r   add:  s   zMapInt64ToInt64.addc                 C   s>   |j \}tj|fdd}t| jt| j|t|t| |S r=   )r
   r   r#   r   hashtable_int64_to_int64_lookupr   r   ra   )rn   r   r   r   r   r   r   lookupA  s   zMapInt64ToInt64.lookupN)r   r   r   rp   r   r   r   r   r   r   r   1  s    r   c           
   
   C   s   t j| dd} t j|dd}| j\}}|j\}}||ksJ t j||fdd}t j||fdd}	|tkrKtt| t|||||t|	t| |	|fS |tkrftt| t|||||t|	t| |	|fS t	d)a  
    Compute the k nearest neighbors of a vector without constructing an index


    Parameters
    ----------
    xq : array_like
        Query vectors, shape (nq, d) where the dimension d is that same as xb
        `dtype` must be float32.
    xb : array_like
        Database vectors, shape (nb, d) where dimension d is the same as xq
        `dtype` must be float32.
    k : int
        Number of nearest neighbors.
    distance_type : MetricType, optional
        distance measure to use (either METRIC_L2 or METRIC_INNER_PRODUCT)

    Returns
    -------
    D : array_like
        Distances of the nearest neighbors, shape (nq, k)
    I : array_like
        Labels of the nearest neighbors, shape (nq, k)
    r   r   r   z'only L2 and INNER_PRODUCT are supported)
r   r	   r
   r#   r$   	knn_L2sqrr   r&   knn_inner_productNotImplementedError)
r)   r*   r   r+   r-   r.   r/   r0   r   r   r   r   r   knnM  s(   

r   hcc                 C   s   | j \}}|j \}}||ksJ tj||fdd}tj||fdd}	|dkrPt }
||
_||
_t|	|
_t||
_	t
|
t| t|||d ||	fS |dkrptt| t|||||t|t|	 ||	fS t)a  
    Compute the k nearest neighbors of a set of vectors without constructing an index.

    Parameters
    ----------
    xq : array_like
        Query vectors, shape (nq, d) where d is the number of bits / 8
        `dtype` must be uint8.
    xb : array_like
        Database vectors, shape (nb, d) where d is the number of bits / 8
        `dtype` must be uint8.
    k : int
        Number of nearest neighbors.
    variant : string
        Function variant to use, either "mc" (counter) or "hc" (heap)

    Returns
    -------
    D : array_like
        Distances of the nearest neighbors, shape (nq, k)
    I : array_like
        Labels of the nearest neighbors, shape (nq, k)
    rg   r   r   r   rE   mc)r
   r   r#   r   int_maxheap_array_tr   r   r   r   r   hammings_knn_hchammings_knn_mcr   )r)   r*   r   variantr-   r.   r/   r0   r   r   heapr   r   r   knn_hamming}  s.   

r   c                   @   s<   e Zd ZdZdd Zdd ZdddZdd	d
Zdd ZdS )Kmeansa  Object that performs k-means clustering and manages the centroids.
    The `Kmeans` class is essentially a wrapper around the C++ `Clustering` object.

    Parameters
    ----------
    d : int
       dimension of the vectors to cluster
    k : int
       number of clusters
    gpu: bool or int, optional
       False: don't use GPU
       True: use all GPUs
       number: use this many GPUs
    progressive_dim_steps:
        use a progressive dimension clustering (with that number of steps)

    Subsequent parameters are fields of the Clustring object. The most important are:

    niter: int, optional
       clustering iterations
    nredo: int, optional
       redo clustering this many times and keep best
    verbose: bool, optional
    spherical: bool, optional
       do we want normalized centroids?
    int_centroids: bool, optional
       round centroids coordinates to integer
    seed: int, optional
       seed for the random number generator

    c                 K   s   || _ | | d| _d|v rt | _nt | _| D ]$\}}|dkr3|dks,|dkr/t }|| _qt| j| t	| j|| q| 
  dS )zd: input dimension, k: nb of centroids. Additional
         parameters are passed on the ClusteringParameters object,
         including niter=25, verbose=False, spherical = False
        Fprogressive_dim_stepsgpuTN)r.   resetr   "ProgressiveDimClusteringParameterscpClusteringParametersitemsget_num_gpusgetattrsetattr	set_index)rn   r.   r   kwargsvr   r   r   rp     s   

zKmeans.__init__c                 C   sv   | j }| jjtkr)| jjrt|| _nt|| _| jr't	j
| j| jd| _d S d S | jr3t| jd}nt }|| _d S )N)ngpu)r.   r   	__class__r   	sphericalIndexFlatIPindexIndexFlatL2r   r   index_cpu_to_all_gpusGpuProgressiveDimIndexFactoryProgressiveDimIndexFactoryfac)rn   r.   r   r   r   r   r     s   

zKmeans.set_indexNc                 C   s(   |dur	t || _d| _d| _d| _dS )zg prepare k-means object to perform a new clustering, possibly
        with another number of centroids N)r^   r   	centroidsobjiteration_stats)rn   r   r   r   r   r     s
   

zKmeans.resetc           
         sT  t j|dd}|j\}}|| jksJ | jjtkrBt|| j| j}|dur9|j\}}||ks0J t	
| |j ||| j| n$|du sHJ |du sNJ | jjrTJ t|| j| j}||t|| j t	|j}	|	| j|| _|jfddt D t dd D | _d   fddD | _| jjd	kr| jd
 S dS )a   Perform k-means clustering.
        On output of the function call:

        - the centroids are in the centroids field of size (`k`, `d`).

        - the objective value at each iteration is in the array obj (size `niter`)

        - detailed optimization statistics are in the array iteration_stats.

        Parameters
        ----------
        x : array_like
            Training vectors, shape (n, d), `dtype` must be float32 and n should
            be larger than the number of clusters `k`.
        weights : array_like
            weight associated to each vector, shape `n`
        init_centroids : array_like
            initial set of centroids, shape (n, d)

        Returns
        -------
        final_obj: float
            final optimization objective

        r   r   Nc                    s   g | ]}  |qS r   )at).0rX   )statsr   r   
<listcomp>8  s    z Kmeans.train.<locals>.<listcomp>c                 S   s   g | ]}|j qS r   )r   )r   str   r   r   r   9  s    z,obj time time_search imbalance_factor nsplitc                    s   g | ]  fd dD qS )c                    s   i | ]}|t  |qS r   )r   )r   fieldr   r   r   
<dictcomp>=  s    z+Kmeans.train.<locals>.<listcomp>.<dictcomp>r   )r   )stat_fieldsr   r   r   <  s    r   r   g        )r   r	   r
   r.   r   r   r   
Clusteringr   r   copy_array_to_vectorravelr   trainr   r   ProgressiveDimClusteringr   r   vector_float_to_arrayreshaper   rQ   r7   r   r   split)
rn   r\   weightsinit_centroidsr   r.   clusncr0   r   r   )r   r   r   r     s2   


zKmeans.trainc                 C   sZ   t j|dd}| jd usJ d| j  | j| j | j|d\}}| | fS )Nr   r   zshould train before assigningrE   )r   r	   r   r   r   r   searchr   )rn   r\   r   r   r   r   r   assignB  s   
zKmeans.assignr{   )NN)	r   r   r   r   rp   r   r   r   r   r   r   r   r   r     s     

	<r   c                 C   s   t | tjjS r{   )
isinstancecollectionsabcSequencer[   r   r   r   is_sequenceO  r}   r   c                 C   s   | j \}}tj| dd} t|rCtj|dd}|j |fksJ t| d d }tj||fdd}t||t|t| t|| |S || d d }tj||fdd}t|||t| t|| |S )a>  
    Pack a set integers (i, j) where i=0:n and j=0:M into
    n bitstrings.
    Output is an uint8 array of size (n, code_size), where code_size is
    such that at most 7 bits per code are wasted.

    If nbit is an integer: all entries takes nbit bits.
    If nbit is an array: entry (i, j) takes nbit[j] bits.
    rg   r         rD   )	r
   r   r	   r   r^   sumr#   pack_bitstrings_cr   )rK   nbitr   M	code_sizebr   r   r   pack_bitstringsT  s   

r   c                 C   s   | j \}}|du r>tj|dd}t|}t| d d }||ks$J tj||fdd}t||t|t| |t| |S |}|| d d }||ksNJ tj||fdd}t|||t| |t| |S )a  
    Unpack a set integers (i, j) where i=0:n and j=0:M from
    n bitstrings (encoded as uint8s).
    Input is an uint8 array of size (n, code_size), where code_size is
    such that at most 7 bits per code are wasted.

    Two forms:
    - when called with (array, M, nbit): there are M entries of size
      nbit per row
    - when called with (array, nbits): element (i, j) is encoded in
      nbits[j] bits
    Nrg   r   r   r   )	r
   r   r	   ru   r^   r   r#   unpack_bitstrings_cr   )r   
M_or_nbitsr   r   r   r   min_code_sizerK   r   r   r   unpack_bitstringso  s(   


r   )r3   )r3   N)rN   )Nr   r~   )r   r{   )"numpyr   faiss.loaderr   collections.abcr   r   r"   r$   r2   r;   rA   lrandrC   rM   rP   rO   rY   r]   rf   r`   rk   rh   rl   r   r   r   r   r   r   r   r   r   r   r   r   r   r   <module>   s@   


	




'
@
08 