Skip to content

Commit 4cf279b

Browse files
committed
Merge remote-tracking branch 'upstream/develop' into fix-interface
2 parents 8987188 + b4b7b86 commit 4cf279b

File tree

6 files changed

+609
-105
lines changed

6 files changed

+609
-105
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
python_bindings/hnswlib.egg-info/
2+
python_bindings/build/
3+
python_bindings/dist/
4+
python_bindings/tmp/

README.md

Lines changed: 40 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,9 @@ Header-only C++ HNSW implementation with python bindings. Paper's code for the H
33

44
**NEWS:**
55

6-
* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but when you the perfromance/memory should not degrade as you update the element embeddinds).**
6+
* **Thanks to Apoorv Sharma [@apoorv-sharma](https://github.com/apoorv-sharma), hnswlib now supports true element updates (the interface remained the same, but the performance/memory should not degrade as you update the element embeddings).**
77

8-
* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not mutiple of 4**
8+
* **Thanks to Dmitry [@2ooom](https://github.com/2ooom), hnswlib got a boost in performance for vector dimensions that are not multiple of 4**
99

1010
* **Thanks to Louis Abraham ([@louisabraham](https://github.com/louisabraham)) hnswlib can now be installed via pip!**
1111

@@ -37,7 +37,7 @@ For other spaces use the nmslib library https://github.com/nmslib/nmslib.
3737
#### Short API description
3838
* `hnswlib.Index(space, dim)` creates a non-initialized HNSW index in space `space` with integer dimension `dim`.
3939

40-
Index methods:
40+
`hnswlib.Index` methods:
4141
* `init_index(max_elements, ef_construction = 200, M = 16, random_seed = 100)` initializes the index with no elements.
4242
* `max_elements` defines the maximum number of elements that can be stored in the structure(can be increased/shrunk).
4343
* `ef_construction` defines a construction time/accuracy trade-off (see [ALGO_PARAMS.md](ALGO_PARAMS.md)).
@@ -49,14 +49,14 @@ Index methods:
4949
* `data_labels` specifies the labels for the data. If index already has the elements with the same labels, their features will be updated. Note that update procedure is slower than insertion of a new element, but more memory- and query-efficient.
5050
* Thread-safe with other `add_items` calls, but not with `knn_query`.
5151

52-
* `mark_deleted(data_label)` - marks the element as deleted, so it will be ommited from search results.
52+
* `mark_deleted(data_label)` - marks the element as deleted, so it will be omitted from search results.
5353

5454
* `resize_index(new_size)` - changes the maximum capacity of the index. Not thread safe with `add_items` and `knn_query`.
5555

5656
* `set_ef(ef)` - sets the query time accuracy/speed trade-off, defined by the `ef` parameter (
5757
[ALGO_PARAMS.md](ALGO_PARAMS.md)). Note that the parameter is currently not saved along with the index, so you need to set it manually after loading.
5858

59-
* `knn_query(data, k = 1, num_threads = -1)` make a batch query for `k` closests elements for each element of the
59+
* `knn_query(data, k = 1, num_threads = -1)` makes a batch query for the `k` closest elements for each element of the
6060
* `data` (shape:`N*dim`). Returns a numpy array of (shape:`N*k`).
6161
* `num_threads` sets the number of cpu threads to use (-1 means use default).
6262
* Thread-safe with other `knn_query` calls, but not with `add_items`.
@@ -76,14 +76,34 @@ Index methods:
7676

7777
* `get_current_count()` - returns the current number of elements stored in the index
7878

79-
80-
79+
Read-only properties of `hnswlib.Index` class:
80+
81+
* `space` - name of the space (can be one of "l2", "ip", or "cosine").
82+
83+
* `dim` - dimensionality of the space.
84+
85+
* `M` - parameter that defines the maximum number of outgoing connections in the graph.
86+
87+
* `ef_construction` - parameter that controls speed/accuracy trade-off during the index construction.
88+
89+
* `max_elements` - current capacity of the index. Equivalent to `p.get_max_elements()`.
90+
91+
* `element_count` - number of items in the index. Equivalent to `p.get_current_count()`.
92+
93+
Properties of `hnswlib.Index` that support reading and writing:
94+
95+
* `ef` - parameter controlling query time/accuracy trade-off.
96+
97+
* `num_threads` - default number of threads to use in `add_items` or `knn_query`. Note that calling `p.set_num_threads(3)` is equivalent to `p.num_threads=3`.
98+
99+
81100
82101

83102
#### Python bindings examples
84103
```python
85104
import hnswlib
86105
import numpy as np
106+
import pickle
87107

88108
dim = 128
89109
num_elements = 10000
@@ -106,6 +126,18 @@ p.set_ef(50) # ef should always be > k
106126

107127
# Query dataset, k - number of closest elements (returns 2 numpy arrays)
108128
labels, distances = p.knn_query(data, k = 1)
129+
130+
# Index objects support pickling
131+
# WARNING: serialization via pickle.dumps(p) or p.__getstate__() is NOT thread-safe with p.add_items method!
132+
# Note: ef parameter is included in serialization; random number generator is initialized with random_seed on Index load
133+
p_copy = pickle.loads(pickle.dumps(p)) # creates a copy of index p using pickle round-trip
134+
135+
### Index parameters are exposed as class properties:
136+
print(f"Parameters passed to constructor: space={p_copy.space}, dim={p_copy.dim}")
137+
print(f"Index construction: M={p_copy.M}, ef_construction={p_copy.ef_construction}")
138+
print(f"Index size is {p_copy.element_count} and index capacity is {p_copy.max_elements}")
139+
print(f"Search speed/quality trade-off parameter: ef={p_copy.ef}")
140+
109141
```
110142

111143
An example with updates after serialization/deserialization:
@@ -191,7 +223,7 @@ or you can install via pip:
191223

192224
### Other implementations
193225
* Non-metric space library (nmslib) - main library(python, C++), supports exotic distances: https://github.com/nmslib/nmslib
194-
* Faiss libary by facebook, uses own HNSW implementation for coarse quantization (python, C++):
226+
* Faiss library by facebook, uses own HNSW implementation for coarse quantization (python, C++):
195227
https://github.com/facebookresearch/faiss
196228
* Code for the paper
197229
["Revisiting the Inverted Indices for Billion-Scale Approximate Nearest Neighbors"](https://arxiv.org/abs/1802.02422)

hnswlib/hnswalg.h

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include <atomic>
66
#include <random>
77
#include <stdlib.h>
8+
#include <assert.h>
89
#include <unordered_set>
910
#include <list>
1011

@@ -27,7 +28,7 @@ namespace hnswlib {
2728
}
2829

2930
HierarchicalNSW(SpaceInterface<dist_t> *s, size_t max_elements, size_t M = 16, size_t ef_construction = 200, size_t random_seed = 100) :
30-
link_list_locks_(max_elements), element_levels_(max_elements), link_list_update_locks_(max_update_element_locks) {
31+
link_list_locks_(max_elements), link_list_update_locks_(max_update_element_locks), element_levels_(max_elements) {
3132
max_elements_ = max_elements;
3233

3334
has_deletions_=false;
@@ -407,7 +408,7 @@ namespace hnswlib {
407408
top_candidates.pop();
408409
}
409410

410-
tableint next_closest_entry_point = selectedNeighbors[0];
411+
tableint next_closest_entry_point = selectedNeighbors.back();
411412

412413
{
413414
linklistsizeint *ll_cur;
@@ -637,7 +638,6 @@ namespace hnswlib {
637638
if (!input.is_open())
638639
throw std::runtime_error("Cannot open file");
639640

640-
641641
// get file size:
642642
input.seekg(0,input.end);
643643
std::streampos total_filesize=input.tellg();
@@ -869,8 +869,8 @@ namespace hnswlib {
869869
// continue;
870870

871871
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> candidates;
872-
int size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1;
873-
int elementsToKeep = std::min(int(ef_construction_), size);
872+
size_t size = sCand.find(neigh) == sCand.end() ? sCand.size() : sCand.size() - 1; // sCand guaranteed to have size >= 1
873+
size_t elementsToKeep = std::min(ef_construction_, size);
874874
for (auto&& cand : sCand) {
875875
if (cand == neigh)
876876
continue;
@@ -893,7 +893,7 @@ namespace hnswlib {
893893
std::unique_lock <std::mutex> lock(link_list_locks_[neigh]);
894894
linklistsizeint *ll_cur;
895895
ll_cur = get_linklist_at_level(neigh, layer);
896-
int candSize = candidates.size();
896+
size_t candSize = candidates.size();
897897
setListCount(ll_cur, candSize);
898898
tableint *data = (tableint *) (ll_cur + 1);
899899
for (size_t idx = 0; idx < candSize; idx++) {
@@ -1137,7 +1137,7 @@ namespace hnswlib {
11371137
}
11381138

11391139
std::priority_queue<std::pair<dist_t, tableint>, std::vector<std::pair<dist_t, tableint>>, CompareByFirst> top_candidates;
1140-
if (has_deletions_) {
1140+
if (has_deletions_) {
11411141
top_candidates=searchBaseLayerST<true,true>(
11421142
currObj, query_data, std::max(ef_, k));
11431143
}
@@ -1181,27 +1181,27 @@ namespace hnswlib {
11811181
std::unordered_set<tableint> s;
11821182
for (int j=0; j<size; j++){
11831183
assert(data[j] > 0);
1184-
assert(data[j] < cur_element_count);
1184+
assert(data[j] < cur_element_count);
11851185
assert (data[j] != i);
11861186
inbound_connections_num[data[j]]++;
11871187
s.insert(data[j]);
11881188
connections_checked++;
1189-
1189+
11901190
}
11911191
assert(s.size() == size);
11921192
}
11931193
}
11941194
if(cur_element_count > 1){
11951195
int min1=inbound_connections_num[0], max1=inbound_connections_num[0];
1196-
for(int i=0; i < cur_element_count; i++){
1196+
for(int i=0; i < cur_element_count; i++){
11971197
assert(inbound_connections_num[i] > 0);
11981198
min1=std::min(inbound_connections_num[i],min1);
11991199
max1=std::max(inbound_connections_num[i],max1);
12001200
}
12011201
std::cout << "Min inbound: " << min1 << ", Max inbound:" << max1 << "\n";
12021202
}
12031203
std::cout << "integrity ok, checked " << connections_checked << " connections\n";
1204-
1204+
12051205
}
12061206

12071207
};

hnswlib/space_l2.h

Lines changed: 20 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -204,7 +204,7 @@ namespace hnswlib {
204204
};
205205

206206
static int
207-
L2SqrI(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
207+
L2SqrI4x(const void *__restrict pVect1, const void *__restrict pVect2, const void *__restrict qty_ptr) {
208208

209209
size_t qty = *((size_t *) qty_ptr);
210210
int res = 0;
@@ -226,12 +226,23 @@ namespace hnswlib {
226226
res += ((*a) - (*b)) * ((*a) - (*b));
227227
a++;
228228
b++;
229+
}
230+
return (res);
231+
}
229232

233+
static int L2SqrI(const void* __restrict pVect1, const void* __restrict pVect2, const void* __restrict qty_ptr) {
234+
size_t qty = *((size_t*)qty_ptr);
235+
int res = 0;
236+
unsigned char* a = (unsigned char*)pVect1;
237+
unsigned char* b = (unsigned char*)pVect2;
230238

239+
for(size_t i = 0; i < qty; i++)
240+
{
241+
res += ((*a) - (*b)) * ((*a) - (*b));
242+
a++;
243+
b++;
231244
}
232-
233245
return (res);
234-
235246
}
236247

237248
class L2SpaceI : public SpaceInterface<int> {
@@ -241,7 +252,12 @@ namespace hnswlib {
241252
size_t dim_;
242253
public:
243254
L2SpaceI(size_t dim) {
244-
fstdistfunc_ = L2SqrI;
255+
if(dim % 4 == 0) {
256+
fstdistfunc_ = L2SqrI4x;
257+
}
258+
else {
259+
fstdistfunc_ = L2SqrI;
260+
}
245261
dim_ = dim;
246262
data_size_ = dim * sizeof(unsigned char);
247263
}

0 commit comments

Comments
 (0)