o
    rh>/                     @   s   d dl Z d dlZd dlZddlmZmZmZmZ ddl	m
Z
 G dd dZG dd deZd	D ]
Ze jer9 nq/d
ZG dd deZdd ZG dd deZG dd deZG dd deZG dd deZG dd deZdddZdS )    N   )
fvecs_read
ivecs_read
bvecs_mmap
fvecs_mmap)knnc                   @   s`   e Zd ZdZdd Zdd ZdddZd	d
 ZdddZdddZ	dddZ
dd Zdd ZdS )Datasetz+ Generic abstract class for a test dataset c                 C   s"   d| _ d| _d| _d| _d| _dS )z2 the constructor should set the following fields: L2Ndmetricnqnbntself r   T/var/www/html/alpaca_bot/venv/lib/python3.10/site-packages/faiss/contrib/datasets.py__init__   s
   
zDataset.__init__c                 C      t  )z' return the queries as a (nq, d) array NotImplementedErrorr   r   r   r   get_queries      zDataset.get_queriesNc                 C   r   )z' return the queries as a (nt, d) array r   r   maxtrainr   r   r   	get_train   r   zDataset.get_trainc                 C   r   )z' return the queries as a (nb, d) array r   r   r   r   r   get_database    r   zDataset.get_database   r   r   c           	      c   sb    |   }|\}}| j| | | j|d  | }}t|||D ]}||t|| | V  q dS )a7  returns an iterator on database vectors.
        bs is the number of vectors per batch
        split = (nsplit, rank) means the dataset is split in nsplit
        shards and we want shard number rank
        The default implementation just iterates over the full matrix
        returned by get_dataset.
        r   N)r   r   rangemin	r   bssplitxbnsplitranki0i1j0r   r   r   database_iterator$   s   "zDataset.database_iteratorc                 C   r   )z7 return the ground truth for k-nearest neighbor search r   r   kr   r   r   get_groundtruth2   r   zDataset.get_groundtruthc                 C   r   )z* return the ground truth for range search r   )r   threshr   r   r   get_groundtruth_range6   r   zDataset.get_groundtruth_rangec              
   C   s,   d| j  d| j d| j d| j d| j 
S )Nzdataset in dimension z, with metric z
, size: Q z B z T r   r   r   r   r   __str__:   s   zDataset.__str__c                 C   s   |   j| j| jfksJ | jdkr(| jdd}|jd| jfks(J d|jf |  j| j| jfks5J | jddj| jdfksCJ dS )z8 runs the previous and checks the sizes of the matrices r   {   )r   zshape=%s   )r.   N)	r   shaper   r   r   r   r   r   r/   )r   xtr   r   r   check_sizes>   s   
  zDataset.check_sizesNr   r    )__name__
__module____qualname____doc__r   r   r   r   r,   r/   r1   r2   r7   r   r   r   r   r      s    



r   c                   @   s>   e Zd ZdZdddZdd Zdd	d
Zdd ZdddZdS )SyntheticDatasetzOA dataset that is not completely random but still challenging to
    index
    r
   :  c                 C   s   t |  ||||f\| _| _| _| _d}|| | }tj|}	|	j	||fd}
t
|
|	||}
|
|	|d d  }
t|
}
|
d}
|| _|
d | | _|
|||  | _|
|| d  | _d S )N
   )size   g?float32)r   r   r   r   r   r   nprandomRandomStatenormaldotrandsinastyper   r6   r&   xq)r   r   r   r   r   r   seedd1nrsxr   r   r   r   M   s   


zSyntheticDataset.__init__c                 C      | j S r8   )rL   r   r   r   r   r   _      zSyntheticDataset.get_queriesNc                 C   s    |d ur|n| j }| jd | S r8   )r   r6   r   r   r   r   r   b   s   zSyntheticDataset.get_trainc                 C   rR   r8   )r&   r   r   r   r   r   f   rS   zSyntheticDataset.get_databased   c                 C   s.   t | j| j|| jdkrtjd S tjd S )Nr
   r   )r   rL   r&   r   faiss	METRIC_L2METRIC_INNER_PRODUCTr-   r   r   r   r/   i   s   
z SyntheticDataset.get_groundtruth)r
   r?   r8   )rT   	r:   r;   r<   r=   r   r   r   r   r/   r   r   r   r   r>   H   s    

r>   )z/datasets01/simsearch/041218/z7/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/zdata/c                   @   <   e Zd ZdZdd Zdd ZdddZd	d
 ZdddZdS )DatasetSIFT1M_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1M)
    c                 C   ,   t |  d\| _| _| _| _td | _d S )N)r   順 @B '  zsift1M/r   r   r   r   r   r   dataset_basedirbasedirr   r   r   r   r         
zDatasetSIFT1M.__init__c                 C      t | jd S )Nzsift_query.fvecsr   rb   r   r   r   r   r         zDatasetSIFT1M.get_queriesNc                 C   (   |d ur|n| j }t| jd d | S )Nzsift_learn.fvecsr   r   rb   r   r   r   r   r         zDatasetSIFT1M.get_trainc                 C   rd   )Nzsift_base.fvecsre   r   r   r   r   r      rf   zDatasetSIFT1M.get_databasec                 C   :   t | jd }|d ur|dksJ |d d d |f }|S )Nzsift_groundtruth.ivecsrT   r   rb   r   r.   gtr   r   r   r/      
   zDatasetSIFT1M.get_groundtruthr8   rX   r   r   r   r   rZ          
rZ   c                 C   s   t j| ddS )NrC   dtype)rD   ascontiguousarray)rQ   r   r   r   sanitize   rf   rs   c                   @   H   e Zd ZdZdddZdd Zddd	Zdd
dZdd ZdddZ	dS )DatasetBigANNz_
    The original dataset is available at: http://corpus-texmex.irisa.fr/
    (ANN_SIFT1B)
      c                 C   sN   t |  |dv sJ || _|d }dd|df\| _| _| _| _td | _d S )N)
r         r@      2   rT      i  rv   r^   r    r_   zbigann/)	r   r   nb_Mr   r   r   r   ra   rb   )r   r}   r   r   r   r   r      s   
zDatasetBigANN.__init__c                 C   s   t t| jd d d  S )Nzbigann_query.bvecs)rs   r   rb   r   r   r   r   r      s   zDatasetBigANN.get_queriesNc                 C   ,   |d ur|n| j }tt| jd d | S )Nzbigann_learn.bvecs)r   rs   r   rb   r   r   r   r   r         zDatasetBigANN.get_trainc                 C   s@   t | jd| j  }|d ur|dksJ |d d d |f }|S )Nzgnd/idx_%dM.ivecsrT   )r   rb   r}   rl   r   r   r   r/      s
   zDatasetBigANN.get_groundtruthc                 C   s.   | j dk s	J dtt| jd d | j S )NrT   dataset too large, use iteratorbigann_base.bvecs)r}   rs   r   rb   r   r   r   r   r   r         zDatasetBigANN.get_databaser   r    c           	      c   l    t | jd }|\}}| j| | | j|d  | }}t|||D ]}t||t|| | V  q#d S )Nr   r   )r   rb   r   r!   rs   r"   r#   r   r   r   r,         "zDatasetBigANN.database_iterator)rv   r8   r9   
r:   r;   r<   r=   r   r   r   r/   r   r,   r   r   r   r   ru      s    


ru   c                   @   rt   )DatasetDeep1Bzv
    See
    https://github.com/facebookresearch/faiss/tree/main/benchs#getting-deep1b
    on how to get the data
     ʚ;c                 C   sf   t |  dddddd}||v sJ dd|d	f\| _| _| _| _td
 | _d| j|| j f | _d S )N100k1M10M100M1B)r]   r^   i r|   r   `   i]r_   zdeep1b/z%sdeep%s_groundtruth.ivecs)	r   r   r   r   r   r   ra   rb   gt_fname)r   r   
nb_to_namer   r   r   r      s   

zDatasetDeep1B.__init__c                 C   s   t t| jd S )Nzdeep1B_queries.fvecs)rs   r   rb   r   r   r   r   r      s   zDatasetDeep1B.get_queriesNc                 C   r~   )Nzlearn.fvecs)r   rs   r   rb   r   r   r   r   r      r   zDatasetDeep1B.get_trainc                 C   s6   t | j}|d ur|dksJ |d d d |f }|S )NrT   )r   r   rl   r   r   r   r/      
   
zDatasetDeep1B.get_groundtruthc                 C   s.   | j dks	J dtt| jd d | j  S )Nr|   r   
base.fvecs)r   rs   r   rb   r   r   r   r   r      r   zDatasetDeep1B.get_databaser   r    c           	      c   r   )Nr   r   )r   rb   r   r!   rs   r"   r#   r   r   r   r,      r   zDatasetDeep1B.database_iterator)r   r8   r9   r   r   r   r   r   r      s    


r   c                   @   s4   e Zd ZdZdddZdd Zdd	 Zdd
dZdS )DatasetGlovezD
    Data from http://ann-benchmarks.com/glove-100-angular.hdf5
    NFc                 C   sh   dd l }|r
J d|std }||d| _d| _d\| _| _| jd jd | _| jd jd | _	d S )	Nr   znot implementedzglove/glove-100-angular.hdf5rIP)rT   r   traintest)
h5pyra   File
glove_h5pyr   r   r   r5   r   r   )r   locdownloadr   r   r   r   r      s   zDatasetGlove.__init__c                 C      t | jd }t| |S )Nr   rD   arrayr   rU   normalize_L2r   rL   r   r   r   r        
zDatasetGlove.get_queriesc                 C   r   )Nr   r   r   r&   r   r   r   r     r   zDatasetGlove.get_databasec                 C   s6   | j d }|d ur|dksJ |d d d |f }|S )N	neighborsrT   )r   rl   r   r   r   r/     r   zDatasetGlove.get_groundtruth)NFr8   r:   r;   r<   r=   r   r   r   r/   r   r   r   r   r      s    
r   c                   @   s2   e Zd ZdZdd Zdd Zdd Zdd	d
ZdS )DatasetMusic100zO
    get dataset from
    https://github.com/stanis-morozov/ip-nsw#dataset
    c                 C   s2   t |  d\| _| _| _| _d| _td | _d S )N)rT   r   r^   r_   r   z
music-100/)	r   r   r   r   r   r   r   ra   rb   r   r   r   r   r   #  s   
zDatasetMusic100.__init__c                 C   $   t j| jd dd}|dd}|S )Nzquery_music100.binrC   rp   r	   rT   rD   fromfilerb   reshaper   r   r   r   r   )     zDatasetMusic100.get_queriesc                 C   r   )Nzdatabase_music100.binrC   rp   r	   rT   r   r   r   r   r   r   .  r   zDatasetMusic100.get_databaseNc                 C   s<   t | jd }|d ur|dksJ |d d d |f }|S )Nzgt.npyrT   )rD   loadrb   rl   r   r   r   r/   3  s
   zDatasetMusic100.get_groundtruthr8   r   r   r   r   r   r     s    r   c                   @   rY   )DatasetGIST1Mr[   c                 C   r\   )N)i  r]   r^   r_   zgist1M/r`   r   r   r   r   r   @  rc   zDatasetGIST1M.__init__c                 C   rd   )Nzgist_query.fvecsre   r   r   r   r   r   E  rf   zDatasetGIST1M.get_queriesNc                 C   rg   )Nzgist_learn.fvecsrh   r   r   r   r   r   H  ri   zDatasetGIST1M.get_trainc                 C   rd   )Nzgist_base.fvecsre   r   r   r   r   r   L  rf   zDatasetGIST1M.get_databasec                 C   rj   )Nzgist_groundtruth.ivecsrT   rk   rl   r   r   r   r/   O  rn   zDatasetGIST1M.get_groundtruthr8   rX   r   r   r   r   r   :  ro   r   deep1MFc                 C   s   | dkrt  S | dkrt S | dr&| dkrdnt| dd }t|dS | d	re| d
d }|d dkrBdt|dd  }n|dkrId}n|d dkrZdt|dd  }nJ d| t|dS | dkrlt S | dkrut|dS td|  )z converts a string describing a dataset to a Dataset object
    Supports sift1M, bigann1M..bigann1B, deep1M..deep1B, music-100 and glove
    sift1Mgist1Mbigannbigann1Brv      r	   )r}   deeprB   NMr^   r   r   r.   Fzdid not recognize suffix )r   z	music-100glove)r   zunknown dataset )	rZ   r   
startswithintru   r   r   r   RuntimeError)datasetr   dbsizeszsufr   r   r   dataset_from_nameW  s,   




r   )r   F)osnumpyrD   rU   vecs_ior   r   r   r   exhaustive_searchr   r   r>   ra   pathexistsrZ   rs   ru   r   r   r   r   r   r   r   r   r   <module>   s(   ;/(0#