Commit ccd047c

[mlir][sparse] optimize COO index handling
By using a shared index pool, we reduce the footprint of each "Element" in the COO scheme and, in addition, reduce the overhead of allocating indices (trading many per-element vector allocations for growth of a single shared vector). When the capacity is known, this means *all* allocation can be done in advance. This is a big win.

For example, reading matrix SK-2005, with dimensions 50,636,154 x 50,636,154 and 1,949,412,601 nonzero elements, improves as follows (times in ms), for an overall speedup of about 3.5x:

```
SK-2005          before           after    speedup
---------------------------------------------------
read         305,086.65      180,318.12       1.69
sort       2,836,096.23      510,492.87       5.56
pack         364,485.67      312,009.96       1.17
---------------------------------------------------
TOTAL      3,505,668.56    1,002,820.95       3.50
```

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D124502
1 parent c7bb5ac commit ccd047c
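For context on the footprint claim, here is a minimal standalone sketch (hypothetical `OldElement`/`NewElement` types, not the actual SparseTensorUtils classes) contrasting an element that owns its own index vector with one that merely points into a shared pool:

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Old scheme: each element owns its own index vector, so every nonzero
// carries the vector's bookkeeping (three pointers on typical 64-bit
// implementations) plus a separate heap allocation for its indices.
struct OldElement {
  std::vector<uint64_t> indices;
  double value;
};

// New scheme: each element stores a single raw pointer into one shared
// index pool, so no per-element heap allocation is needed at all.
struct NewElement {
  uint64_t *indices;
  double value;
};

int main() {
  // Commonly prints 32 vs. 16 bytes on a 64-bit build (exact sizes are
  // implementation-defined); the bigger win is one pool allocation instead
  // of one allocation per nonzero element.
  std::printf("old: %zu bytes, new: %zu bytes\n",
              sizeof(OldElement), sizeof(NewElement));
  return 0;
}
```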

File tree

1 file changed (+44, -18 lines)

mlir/lib/ExecutionEngine/SparseTensorUtils.cpp

Lines changed: 44 additions & 18 deletions
```diff
@@ -84,22 +84,14 @@ static inline uint64_t checkedMul(uint64_t lhs, uint64_t rhs) {
 /// ({i}, a[i])
 /// and a rank-5 tensor element like
 /// ({i,j,k,l,m}, a[i,j,k,l,m])
+/// We use pointer to a shared index pool rather than e.g. a direct
+/// vector since that (1) reduces the per-element memory footprint, and
+/// (2) centralizes the memory reservation and (re)allocation to one place.
 template <typename V>
 struct Element {
-  Element(const std::vector<uint64_t> &ind, V val) : indices(ind), value(val){};
-  std::vector<uint64_t> indices;
+  Element(uint64_t *ind, V val) : indices(ind), value(val){};
+  uint64_t *indices; // pointer into shared index pool
   V value;
-  /// Returns true if indices of e1 < indices of e2.
-  static bool lexOrder(const Element<V> &e1, const Element<V> &e2) {
-    uint64_t rank = e1.indices.size();
-    assert(rank == e2.indices.size());
-    for (uint64_t r = 0; r < rank; r++) {
-      if (e1.indices[r] == e2.indices[r])
-        continue;
-      return e1.indices[r] < e2.indices[r];
-    }
-    return false;
-  }
 };
 
 /// A memory-resident sparse tensor in coordinate scheme (collection of
```
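A consequence of storing a raw pointer is that an `Element` no longer knows its own rank, so the static `lexOrder` removed above cannot live on `Element` anymore; the next hunk moves the comparison into a lambda that captures the tensor's rank. As a rough standalone sketch of the same pattern (hypothetical `Elem` and `lexSort` names, not the actual SparseTensorUtils code):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical stand-in for Element<V> with V = double.
struct Elem {
  uint64_t *indices; // rank entries inside a shared index pool
  double value;
};

// Lexicographic sort over index tuples. The rank must be passed in (or
// captured), since a raw pointer carries no length information.
void lexSort(std::vector<Elem> &elements, uint64_t rank) {
  std::sort(elements.begin(), elements.end(),
            [rank](const Elem &e1, const Elem &e2) {
              for (uint64_t r = 0; r < rank; r++) {
                if (e1.indices[r] == e2.indices[r])
                  continue;
                return e1.indices[r] < e2.indices[r];
              }
              return false; // equal tuples are not "less than"
            });
}
```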
```diff
@@ -112,29 +104,61 @@ struct SparseTensorCOO {
 public:
   SparseTensorCOO(const std::vector<uint64_t> &szs, uint64_t capacity)
       : sizes(szs) {
-    if (capacity)
+    if (capacity) {
       elements.reserve(capacity);
+      indices.reserve(capacity * getRank());
+    }
   }
+
   /// Adds element as indices and value.
   void add(const std::vector<uint64_t> &ind, V val) {
     assert(!iteratorLocked && "Attempt to add() after startIterator()");
+    uint64_t *base = indices.data();
+    uint64_t size = indices.size();
     uint64_t rank = getRank();
     assert(rank == ind.size());
-    for (uint64_t r = 0; r < rank; r++)
+    for (uint64_t r = 0; r < rank; r++) {
       assert(ind[r] < sizes[r]); // within bounds
-    elements.emplace_back(ind, val);
+      indices.push_back(ind[r]);
+    }
+    // This base only changes if indices were reallocated. In that case, we
+    // need to correct all previous pointers into the vector. Note that this
+    // only happens if we did not set the initial capacity right, and then only
+    // for every internal vector reallocation (which with the doubling rule
+    // should only incur an amortized linear overhead).
+    uint64_t *new_base = indices.data();
+    if (new_base != base) {
+      for (uint64_t i = 0, n = elements.size(); i < n; i++)
+        elements[i].indices = new_base + (elements[i].indices - base);
+      base = new_base;
+    }
+    // Add element as (pointer into shared index pool, value) pair.
+    elements.emplace_back(base + size, val);
   }
+
   /// Sorts elements lexicographically by index.
   void sort() {
     assert(!iteratorLocked && "Attempt to sort() after startIterator()");
     // TODO: we may want to cache an `isSorted` bit, to avoid
     // unnecessary/redundant sorting.
-    std::sort(elements.begin(), elements.end(), Element<V>::lexOrder);
+    std::sort(elements.begin(), elements.end(),
+              [this](const Element<V> &e1, const Element<V> &e2) {
+                uint64_t rank = getRank();
+                for (uint64_t r = 0; r < rank; r++) {
+                  if (e1.indices[r] == e2.indices[r])
+                    continue;
+                  return e1.indices[r] < e2.indices[r];
+                }
+                return false;
+              });
   }
+
   /// Returns rank.
   uint64_t getRank() const { return sizes.size(); }
+
   /// Getter for sizes array.
   const std::vector<uint64_t> &getSizes() const { return sizes; }
+
   /// Getter for elements array.
   const std::vector<Element<V>> &getElements() const { return elements; }
 
```
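The subtle part of the new `add()` is that growing the shared pool may move its buffer and invalidate the pointers held by earlier elements, which the hunk above handles by rebasing them whenever `indices.data()` changes. A self-contained sketch of that rebasing step, with hypothetical names, mirroring the diff's approach:

```cpp
#include <cstdint>
#include <vector>

struct Elem {
  uint64_t *indices; // points into the shared pool below
  double value;
};

// After push_backs that may have grown `pool`, repoint every stored element
// at the pool's new buffer. `oldBase` is pool.data() captured beforehand.
// Work is only done when a reallocation actually happened, i.e. when the
// initial capacity reservation was too small.
void rebaseIfMoved(std::vector<Elem> &elements, std::vector<uint64_t> &pool,
                   uint64_t *oldBase) {
  uint64_t *newBase = pool.data();
  if (newBase == oldBase)
    return; // no reallocation, pointers are still valid
  for (Elem &e : elements)
    e.indices = newBase + (e.indices - oldBase); // preserve the offset
}
```

With an exact capacity reservation in the constructor the pool never reallocates, so this fix-up never runs and every `add()` stays allocation-free.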

```diff
@@ -143,6 +167,7 @@ struct SparseTensorCOO {
     iteratorLocked = true;
     iteratorPos = 0;
   }
+
   /// Get the next element.
   const Element<V> *getNext() {
     assert(iteratorLocked && "Attempt to getNext() before startIterator()");
@@ -172,7 +197,8 @@ struct SparseTensorCOO {
 
 private:
   const std::vector<uint64_t> sizes; // per-dimension sizes
-  std::vector<Element<V>> elements;
+  std::vector<Element<V>> elements; // all COO elements
+  std::vector<uint64_t> indices;    // shared index pool
   bool iteratorLocked = false;
   unsigned iteratorPos = 0;
 };
```
