Skip to content

PEP-517 and PEP-518 support (add pyproject.toml) #274

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
hnswlib.egg-info/
build/
dist/
tmp/
python_bindings/tests/__pycache__/
*.pyd
hnswlib.cpython*.so
hnswlib.egg-info/
build/
dist/
tmp/
python_bindings/tests/__pycache__/
*.pyd
hnswlib.cpython*.so
var/
5 changes: 2 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ jobs:

install:
- |
pip install -r requirements.txt
python setup.py install
python -m pip install .

script:
- |
python setup.py test
python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ pypi: dist

dist:
-rm dist/*
python3 setup.py sdist
pip install build
python3 -m build --sdist

test:
python3 setup.py test
python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"

clean:
rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so

.PHONY: dist
.PHONY: dist
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,9 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat
You can install from sources:
```bash
apt-get install -y python-setuptools python-pip
pip3 install pybind11 numpy setuptools
python3 setup.py install
git clone https://github.com/nmslib/hnswlib.git
cd hnswlib
pip install .
```

or you can install via pip:
Expand Down
9 changes: 9 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[build-system]
requires = [
"setuptools>=42",
"wheel",
"numpy>=1.10.0",
"pybind11>=2.0",
]

build-backend = "setuptools.build_meta"
14 changes: 6 additions & 8 deletions python_bindings/tests/bindings_test.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
import unittest

import numpy as np

import hnswlib


class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
import hnswlib
import numpy as np

dim = 16
num_elements = 10000
Expand Down Expand Up @@ -41,7 +43,7 @@ def testRandomSelf(self):

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3)
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

# Serializing and deleting the index:
index_path = 'first_half.bin'
Expand All @@ -61,10 +63,6 @@ def testRandomSelf(self):
# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)

self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3)
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

os.remove(index_path)


if __name__ == "__main__":
unittest.main()
9 changes: 4 additions & 5 deletions python_bindings/tests/bindings_test_getdata.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import unittest

import numpy as np

import hnswlib


class RandomSelfTestCase(unittest.TestCase):
def testGettingItems(self):
print("\n**** Getting the data by label test ****\n")
import hnswlib
import numpy as np

dim = 16
num_elements = 10000
Expand Down Expand Up @@ -42,6 +44,3 @@ def testGettingItems(self):
# After adding them, all labels should be retrievable
returned_items = p.get_items(labels)
self.assertSequenceEqual(data.tolist(), returned_items)

if __name__ == "__main__":
unittest.main()
240 changes: 118 additions & 122 deletions python_bindings/tests/bindings_test_labels.py
Original file line number Diff line number Diff line change
@@ -1,131 +1,127 @@
import os
import unittest

import numpy as np

class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(16):
print("\n**** Index save-load test ****\n")
import hnswlib
import numpy as np

np.random.seed(idx)
dim = 16
num_elements = 10000

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initing index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
strongly affects the memory consumption

p.init_index(max_elements = num_elements, ef_construction = 100, M = 16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(100)

p.set_num_threads(4) # by default using all available cores

# We split the data in two batches:
data1 = data[:num_elements // 2]
data2 = data[num_elements // 2:]

print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)

items=p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data1-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4)

# Serializing and deleting the index.
# We need the part to check that serialization is working properly.

index_path = 'first_half.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
print("Saved. Deleting...")
del p
print("Deleted")

print("\n**** Mark delete test ****\n")
# Reiniting, loading the index
print("Reiniting")
p = hnswlib.Index(space='l2', dim=dim)

print("\nLoading index from '%s'\n" % index_path)
p.load_index(index_path)
p.set_ef(100)

print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
items=p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # deleting index.

# Checking that all labels are returned correctly:
sorted_labels=sorted(p.get_ids_list())
self.assertEqual(np.sum(~np.asarray(sorted_labels)==np.asarray(range(num_elements))),0)

# Delete data1
labels1, _ = p.knn_query(data1, k=1)

for l in labels1:
p.mark_deleted(l[0])
labels2, _ = p.knn_query(data2, k=1)
items=p.get_items(labels2)
diff_with_gt_labels=np.mean(np.abs(data2-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console


labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)
print("All the data in data1 are removed")
import hnswlib

# checking saving/loading index with elements marked as deleted
del_index_path = "with_deleted.bin"
p.save_index(del_index_path)
p = hnswlib.Index(space='l2', dim=dim)
p.load_index(del_index_path)
p.set_ef(100)

labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)

os.remove(index_path)
os.remove(del_index_path)
class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(16):
print("\n**** Index save-load test ****\n")

np.random.seed(idx)
dim = 16
num_elements = 10000

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

if __name__ == "__main__":
unittest.main()
# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initing index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
strongly affects the memory consumption

p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(100)

p.set_num_threads(4) # by default using all available cores

# We split the data in two batches:
data1 = data[:num_elements // 2]
data2 = data[num_elements // 2:]

print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)

items=p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data1-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

# Serializing and deleting the index.
# We need the part to check that serialization is working properly.

index_path = 'first_half.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
print("Saved. Deleting...")
del p
print("Deleted")

print("\n**** Mark delete test ****\n")
# Reiniting, loading the index
print("Reiniting")
p = hnswlib.Index(space='l2', dim=dim)

print("\nLoading index from '%s'\n" % index_path)
p.load_index(index_path)
p.set_ef(100)

print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
items=p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # deleting index.

# Checking that all labels are returned correctly:
sorted_labels=sorted(p.get_ids_list())
self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)

# Delete data1
labels1, _ = p.knn_query(data1, k=1)

for l in labels1:
p.mark_deleted(l[0])
labels2, _ = p.knn_query(data2, k=1)
items=p.get_items(labels2)
diff_with_gt_labels = np.mean(np.abs(data2-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) # console

labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)
print("All the data in data1 are removed")

# checking saving/loading index with elements marked as deleted
del_index_path = "with_deleted.bin"
p.save_index(del_index_path)
p = hnswlib.Index(space='l2', dim=dim)
p.load_index(del_index_path)
p.set_ef(100)

labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)

os.remove(index_path)
os.remove(del_index_path)
Loading