Skip to content

PEP-517 and PEP-518 support (add pyproject.toml) #274

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jan 17, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
hnswlib.egg-info/
build/
dist/
tmp/
python_bindings/tests/__pycache__/
*.pyd
hnswlib.cpython*.so
hnswlib.egg-info/
build/
dist/
tmp/
python_bindings/tests/__pycache__/
*.pyd
hnswlib.cpython*.so
var/
5 changes: 2 additions & 3 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,8 @@ jobs:

install:
- |
pip install -r requirements.txt
python setup.py install
python -m pip install .

script:
- |
python setup.py test
python -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ pypi: dist

dist:
-rm dist/*
python3 setup.py sdist
pip install build
python3 -m build --sdist

test:
python3 setup.py test
python3 -m unittest discover --start-directory python_bindings/tests --pattern "*_test*.py"

clean:
rm -rf *.egg-info build dist tmp var tests/__pycache__ hnswlib.cpython*.so

.PHONY: dist
.PHONY: dist
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -213,8 +213,9 @@ print("Recall for two batches:", np.mean(labels.reshape(-1) == np.arange(len(dat
You can install from sources:
```bash
apt-get install -y python-setuptools python-pip
pip3 install pybind11 numpy setuptools
python3 setup.py install
git clone https://github.com/nmslib/hnswlib.git
cd hnswlib
pip install .
```

or you can install via pip:
Expand Down
9 changes: 9 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[build-system]
requires = [
"setuptools>=42",
"wheel",
"numpy>=1.10.0",
"pybind11>=2.0",
]

build-backend = "setuptools.build_meta"
14 changes: 6 additions & 8 deletions python_bindings/tests/bindings_test.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import os
import unittest

import numpy as np

import hnswlib


class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
import hnswlib
import numpy as np

dim = 16
num_elements = 10000
Expand Down Expand Up @@ -41,7 +43,7 @@ def testRandomSelf(self):

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3)
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

# Serializing and deleting the index:
index_path = 'first_half.bin'
Expand All @@ -61,10 +63,6 @@ def testRandomSelf(self):
# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)

self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3)
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

os.remove(index_path)


if __name__ == "__main__":
unittest.main()
9 changes: 4 additions & 5 deletions python_bindings/tests/bindings_test_getdata.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import unittest

import numpy as np

import hnswlib


class RandomSelfTestCase(unittest.TestCase):
def testGettingItems(self):
print("\n**** Getting the data by label test ****\n")
import hnswlib
import numpy as np

dim = 16
num_elements = 10000
Expand Down Expand Up @@ -42,6 +44,3 @@ def testGettingItems(self):
# After adding them, all labels should be retrievable
returned_items = p.get_items(labels)
self.assertSequenceEqual(data.tolist(), returned_items)

if __name__ == "__main__":
unittest.main()
240 changes: 118 additions & 122 deletions python_bindings/tests/bindings_test_labels.py
Original file line number Diff line number Diff line change
@@ -1,131 +1,127 @@
import os
import unittest

import numpy as np

class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(16):
print("\n**** Index save-load test ****\n")
import hnswlib
import numpy as np

np.random.seed(idx)
dim = 16
num_elements = 10000

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initing index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
strongly affects the memory consumption

p.init_index(max_elements = num_elements, ef_construction = 100, M = 16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(100)

p.set_num_threads(4) # by default using all available cores

# We split the data in two batches:
data1 = data[:num_elements // 2]
data2 = data[num_elements // 2:]

print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)

items=p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))),1.0,3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data1-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4)

# Serializing and deleting the index.
# We need the part to check that serialization is working properly.

index_path = 'first_half.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
print("Saved. Deleting...")
del p
print("Deleted")

print("\n**** Mark delete test ****\n")
# Reiniting, loading the index
print("Reiniting")
p = hnswlib.Index(space='l2', dim=dim)

print("\nLoading index from '%s'\n" % index_path)
p.load_index(index_path)
p.set_ef(100)

print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
items=p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))),1.0,3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-4) # deleting index.

# Checking that all labels are returned correctly:
sorted_labels=sorted(p.get_ids_list())
self.assertEqual(np.sum(~np.asarray(sorted_labels)==np.asarray(range(num_elements))),0)

# Delete data1
labels1, _ = p.knn_query(data1, k=1)

for l in labels1:
p.mark_deleted(l[0])
labels2, _ = p.knn_query(data2, k=1)
items=p.get_items(labels2)
diff_with_gt_labels=np.mean(np.abs(data2-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta = 1e-3) # console


labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)
print("All the data in data1 are removed")
import hnswlib

# checking saving/loading index with elements marked as deleted
del_index_path = "with_deleted.bin"
p.save_index(del_index_path)
p = hnswlib.Index(space='l2', dim=dim)
p.load_index(del_index_path)
p.set_ef(100)

labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)

os.remove(index_path)
os.remove(del_index_path)
class RandomSelfTestCase(unittest.TestCase):
def testRandomSelf(self):
for idx in range(16):
print("\n**** Index save-load test ****\n")

np.random.seed(idx)
dim = 16
num_elements = 10000

# Generating sample data
data = np.float32(np.random.random((num_elements, dim)))

if __name__ == "__main__":
unittest.main()
# Declaring index
p = hnswlib.Index(space='l2', dim=dim) # possible options are l2, cosine or ip

# Initing index
# max_elements - the maximum number of elements, should be known beforehand
# (probably will be made optional in the future)
#
# ef_construction - controls index search speed/build speed tradeoff
# M - is tightly connected with internal dimensionality of the data
strongly affects the memory consumption

p.init_index(max_elements=num_elements, ef_construction=100, M=16)

# Controlling the recall by setting ef:
# higher ef leads to better accuracy, but slower search
p.set_ef(100)

p.set_num_threads(4) # by default using all available cores

# We split the data in two batches:
data1 = data[:num_elements // 2]
data2 = data[num_elements // 2:]

print("Adding first batch of %d elements" % (len(data1)))
p.add_items(data1)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data1, k=1)

items=p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data1))), 1.0, 3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data1-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4)

# Serializing and deleting the index.
# We need the part to check that serialization is working properly.

index_path = 'first_half.bin'
print("Saving index to '%s'" % index_path)
p.save_index(index_path)
print("Saved. Deleting...")
del p
print("Deleted")

print("\n**** Mark delete test ****\n")
# Reiniting, loading the index
print("Reiniting")
p = hnswlib.Index(space='l2', dim=dim)

print("\nLoading index from '%s'\n" % index_path)
p.load_index(index_path)
p.set_ef(100)

print("Adding the second batch of %d elements" % (len(data2)))
p.add_items(data2)

# Query the elements for themselves and measure recall:
labels, distances = p.knn_query(data, k=1)
items=p.get_items(labels)

# Check the recall:
self.assertAlmostEqual(np.mean(labels.reshape(-1) == np.arange(len(data))), 1.0, 3)

# Check that the returned element data is correct:
diff_with_gt_labels=np.mean(np.abs(data-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-4) # deleting index.

# Checking that all labels are returned correctly:
sorted_labels=sorted(p.get_ids_list())
self.assertEqual(np.sum(~np.asarray(sorted_labels) == np.asarray(range(num_elements))), 0)

# Delete data1
labels1, _ = p.knn_query(data1, k=1)

for l in labels1:
p.mark_deleted(l[0])
labels2, _ = p.knn_query(data2, k=1)
items=p.get_items(labels2)
diff_with_gt_labels = np.mean(np.abs(data2-items))
self.assertAlmostEqual(diff_with_gt_labels, 0, delta=1e-3) # console

labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)
print("All the data in data1 are removed")

# checking saving/loading index with elements marked as deleted
del_index_path = "with_deleted.bin"
p.save_index(del_index_path)
p = hnswlib.Index(space='l2', dim=dim)
p.load_index(del_index_path)
p.set_ef(100)

labels1_after, _ = p.knn_query(data1, k=1)
for la in labels1_after:
for lb in labels1:
if la[0] == lb[0]:
self.assertTrue(False)

os.remove(index_path)
os.remove(del_index_path)
Loading