Skip to content

[mlir][sparse] support sparse dilated convolution. #80470

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,16 +313,16 @@ void LoopEmitter::initSubSectIterator(OpBuilder &builder, Location loc) {
// Compute the subsection size.
Value size = c0;
for (auto [loop, stride] : remDepStack[t][lvl]) {
Value loopHi = loopHighs[loop];
size = ADDI(size, MULI(loopHi, C_IDX(stride)));
Value idxMax = SUBI(loopHighs[loop], C_IDX(1));
size = ADDI(size, ADDI(MULI(idxMax, C_IDX(stride)), C_IDX(1)));
}
it = makeNonEmptySubSectIterator(builder, loc, parent, loopHighs[loop],
std::move(lvlIt), size, curDep.second);
} else {
Value size = loopHighs[loop];
const SparseIterator &subSectIter = *iters[t][lvl].back();
it = makeTraverseSubSectIterator(subSectIter, *parent, std::move(lvlIt),
size, curDep.second);
it = makeTraverseSubSectIterator(builder, loc, subSectIter, *parent,
std::move(lvlIt), loopHighs[loop],
curDep.second);
}
lastIter[t] = it.get();
iters[t][lvl].emplace_back(std::move(it));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -665,13 +665,10 @@ class SubSectIterator : public SparseIterator {
public:
SubSectIterator(const NonEmptySubSectIterator &subSect,
const SparseIterator &parent,
std::unique_ptr<SparseIterator> &&wrap, Value size,
unsigned stride)
std::unique_ptr<SparseIterator> &&wrap)
: SparseIterator(IterKind::kSubSect, *wrap,
/*extraCursorCnt=*/wrap->randomAccessible() ? 0 : 1),
subSect(subSect), wrap(std::move(wrap)), parent(parent), size(size),
stride(stride), helper(*this) {
assert(stride == 1 && "Not implemented.");
subSect(subSect), wrap(std::move(wrap)), parent(parent), helper(*this) {
assert(subSect.tid == tid && subSect.lvl == lvl);
assert(parent.kind != IterKind::kSubSect || parent.lvl + 1 == lvl);
};
Expand All @@ -693,7 +690,9 @@ class SubSectIterator : public SparseIterator {

bool randomAccessible() const override { return wrap->randomAccessible(); };
bool iteratableByFor() const override { return randomAccessible(); };
Value upperBound(OpBuilder &b, Location l) const override { return size; }
Value upperBound(OpBuilder &b, Location l) const override {
return subSect.subSectSz;
}
std::pair<Value, Value> getCurPosition() const override {
return wrap->getCurPosition();
};
Expand All @@ -711,7 +710,7 @@ class SubSectIterator : public SparseIterator {
assert(p->lvl + 1 == lvl);
wrap->genInit(b, l, p);
// Linearize the dense subsection index.
nxLvlTupleStart = MULI(size, p->getNxLvlTupleId(b, l));
nxLvlTupleStart = MULI(subSect.subSectSz, p->getNxLvlTupleId(b, l));
} else {
assert(subSect.lvl == lvl && subSect.isSubSectRoot());
wrap->deserialize(subSect.delegate->serialize());
Expand Down Expand Up @@ -765,9 +764,6 @@ class SubSectIterator : public SparseIterator {
std::unique_ptr<SparseIterator> wrap;
const SparseIterator &parent;

Value size;
unsigned stride;

SubSectIterHelper helper;
};

Expand Down Expand Up @@ -1330,29 +1326,19 @@ sparse_tensor::makeSlicedLevelIterator(std::unique_ptr<SparseIterator> &&sit,
return std::make_unique<FilterIterator>(std::move(sit), offset, stride, size);
}

template <typename IterType>
static const SparseIterator *tryUnwrapFilter(const SparseIterator *it) {
auto *filter = llvm::dyn_cast_or_null<FilterIterator>(it);
if (filter && llvm::isa<IterType>(filter->wrap.get())) {
if (filter)
return filter->wrap.get();
}
return it;
}
template <typename IterType>
static const IterType *unwrapFilter(const SparseIterator *it) {
auto *filter = llvm::dyn_cast_or_null<FilterIterator>(it);
if (filter) {
return llvm::cast<IterType>(filter->wrap.get());
}
return llvm::cast<IterType>(it);
}

std::unique_ptr<SparseIterator> sparse_tensor::makeNonEmptySubSectIterator(
OpBuilder &b, Location l, const SparseIterator *parent, Value loopBound,
std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride) {

// Try unwrap the NonEmptySubSectIterator from a filter parent.
parent = tryUnwrapFilter<NonEmptySubSectIterator>(parent);
parent = tryUnwrapFilter(parent);
auto it = std::make_unique<NonEmptySubSectIterator>(
b, l, parent, std::move(delegate), size);

Expand All @@ -1366,12 +1352,22 @@ std::unique_ptr<SparseIterator> sparse_tensor::makeNonEmptySubSectIterator(
}

std::unique_ptr<SparseIterator> sparse_tensor::makeTraverseSubSectIterator(
const SparseIterator &subSectIter, const SparseIterator &parent,
std::unique_ptr<SparseIterator> &&wrap, Value size, unsigned stride) {
OpBuilder &b, Location l, const SparseIterator &subSectIter,
const SparseIterator &parent, std::unique_ptr<SparseIterator> &&wrap,
Value loopBound, unsigned stride) {

// This must be a subsection iterator or a filtered subsection iterator.
auto &subSect = *unwrapFilter<NonEmptySubSectIterator>(&subSectIter);
return std::make_unique<SubSectIterator>(subSect, parent, std::move(wrap),
size, stride);
auto &subSect =
llvm::cast<NonEmptySubSectIterator>(*tryUnwrapFilter(&subSectIter));

auto it = std::make_unique<SubSectIterator>(
subSect, *tryUnwrapFilter(&parent), std::move(wrap));

if (stride != 1) {
return std::make_unique<FilterIterator>(std::move(it), /*offset=*/C_IDX(0),
C_IDX(stride), /*size=*/loopBound);
}
return it;
}

#undef CMPI
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -300,8 +300,9 @@ std::unique_ptr<SparseIterator> makeNonEmptySubSectIterator(
/// Helper function to create a SparseIterator object that iterates over a
/// non-empty subsection created by NonEmptySubSectIterator.
std::unique_ptr<SparseIterator> makeTraverseSubSectIterator(
const SparseIterator &subsectIter, const SparseIterator &parent,
std::unique_ptr<SparseIterator> &&delegate, Value size, unsigned stride);
OpBuilder &b, Location l, const SparseIterator &subsectIter,
const SparseIterator &parent, std::unique_ptr<SparseIterator> &&wrap,
Value loopBound, unsigned stride);

} // namespace sparse_tensor
} // namespace mlir
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
//--------------------------------------------------------------------------------------------------
// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
//
// Set-up that's shared across all tests in this directory. In principle, this
// config could be moved to lit.local.cfg. However, there are downstream users that
// do not use these LIT config files. Hence why this is kept inline.
//
// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
// DEFINE: %{run_opts} = -e entry -entry-point-result=void
// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
//
// DEFINE: %{env} =
//--------------------------------------------------------------------------------------------------

// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation and vectorization.
// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
// RUN: %{compile} | %{run} | FileCheck %s
//
// Do the same run, but now with direct IR generation and VLA vectorization.
// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}

// 4-D sparse encoding in which every dimension (d0..d3) maps to a compressed
// level.
#CCCC = #sparse_tensor.encoding<{
map = (d0, d1, d2, d3) -> (d0 : compressed, d1 : compressed, d2 : compressed, d3 : compressed)
}>

// 4-D sparse encoding in which d1 maps to a dense level while d0, d2 and d3
// map to compressed levels.
#CDCC = #sparse_tensor.encoding<{
map = (d0, d1, d2, d3) -> (d0 : compressed, d1 : dense, d2 : compressed, d3 : compressed)
}>

// Builds a dynamically shaped 4-D f32 tensor with extents (%s1, %s2, %s3, %s4)
// and returns it after filling every element with %f.
func.func @alloc_4d_filled_f32(%s1 : index, %s2 : index, %s3 : index, %s4 : index, %f : f32) -> tensor<?x?x?x?xf32> {
  %empty = tensor.empty(%s1, %s2, %s3, %s4) : tensor<?x?x?x?xf32>
  %filled = linalg.fill ins(%f : f32)
                        outs(%empty : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
  return %filled : tensor<?x?x?x?xf32>
}

// All-dense variant: 2-D NHWC/HWCF convolution of %arg0 (input) with %arg1
// (filter) using dilation 2 and stride 1 in both spatial dimensions; %arg2 is
// the output operand.
func.func @conv_2d_nhwc_hwcf(%arg0: tensor<?x?x?x?xf32>,
                             %arg1: tensor<?x?x?x?xf32>,
                             %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
  %conv = linalg.conv_2d_nhwc_hwcf {
            dilations = dense<2> : tensor<2xi64>,
            strides = dense<1> : tensor<2xi64>
          }
          ins(%arg0, %arg1 : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
          outs(%arg2 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
  return %conv : tensor<?x?x?x?xf32>
}

// Same dilated convolution (dilation 2, stride 1), but the input %arg0 uses
// the #CCCC sparse encoding; the filter %arg1 and output %arg2 stay dense.
func.func @conv_2d_nhwc_hwcf_CCCC(%arg0: tensor<?x?x?x?xf32, #CCCC>,
                                  %arg1: tensor<?x?x?x?xf32>,
                                  %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
  %conv = linalg.conv_2d_nhwc_hwcf {
            dilations = dense<2> : tensor<2xi64>,
            strides = dense<1> : tensor<2xi64>
          }
          ins(%arg0, %arg1 : tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>)
          outs(%arg2 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
  return %conv : tensor<?x?x?x?xf32>
}

// Same dilated convolution (dilation 2, stride 1), but the input %arg0 uses
// the #CDCC sparse encoding; the filter %arg1 and output %arg2 stay dense.
func.func @conv_2d_nhwc_hwcf_CDCC(%arg0: tensor<?x?x?x?xf32, #CDCC>,
                                  %arg1: tensor<?x?x?x?xf32>,
                                  %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
  %conv = linalg.conv_2d_nhwc_hwcf {
            dilations = dense<2> : tensor<2xi64>,
            strides = dense<1> : tensor<2xi64>
          }
          ins(%arg0, %arg1 : tensor<?x?x?x?xf32, #CDCC>, tensor<?x?x?x?xf32>)
          outs(%arg2 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
  return %conv : tensor<?x?x?x?xf32>
}

// Same dilated convolution (dilation 2, stride 1) with BOTH the input %arg0
// and the filter %arg1 in the #CDCC sparse encoding; the output %arg2 is dense.
func.func @conv_2d_nhwc_hwcf_dual_CDCC(%arg0: tensor<?x?x?x?xf32, #CDCC>,
                                       %arg1: tensor<?x?x?x?xf32, #CDCC>,
                                       %arg2: tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32> {
  %conv = linalg.conv_2d_nhwc_hwcf {
            dilations = dense<2> : tensor<2xi64>,
            strides = dense<1> : tensor<2xi64>
          }
          ins(%arg0, %arg1 : tensor<?x?x?x?xf32, #CDCC>, tensor<?x?x?x?xf32, #CDCC>)
          outs(%arg2 : tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
  return %conv : tensor<?x?x?x?xf32>
}


// Test driver. Runs the dilated convolution once on an all-dense input and
// then on the #CCCC / #CDCC sparse conversions of the same input (plus a run
// with a #CDCC filter), printing each result so FileCheck can verify that all
// variants produce the same values.
//
// Expected values: the filter is 3x3x3x1 filled with 2.0 and the input is
// 3x7x7x3 filled with 10.0, so each output element sums 3 * 3 * 3 = 27
// products of 20.0 = 540. The single input element zeroed at [0, 1, 1, 0] is,
// with dilation 2 and stride 1, read only by output element [0, 1, 1, 0],
// which therefore drops by one product to 520.
func.func @entry() {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c3 = arith.constant 3 : index
  %c7 = arith.constant 7 : index
  %f10 = arith.constant 10.00000e+00 : f32
  %val = arith.constant 2.00000e+00 : f32
  %zero = arith.constant 0.00000e+00 : f32

  // Dense operands: filter (3x3x3x1), input (3x7x7x3, one element zeroed) and
  // a zero-initialized output (3x3x3x1).
  %filter2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %val) :(index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
  %in2D_tmp = call @alloc_4d_filled_f32(%c3, %c7, %c7, %c3, %f10) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)
  %in2D_nhwc = tensor.insert %zero into %in2D_tmp[%c0, %c1, %c1, %c0] : tensor<?x?x?x?xf32>
  %out2D_nhwc = call @alloc_4d_filled_f32(%c3, %c3, %c3, %c1, %zero) : (index, index, index, index, f32) -> (tensor<?x?x?x?xf32>)

  // Sparse versions of the input and the filter.
  %in2D_nhwc_CCCC = sparse_tensor.convert %in2D_nhwc
    : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CCCC>
  %filter2D_nhwc_CDCC = sparse_tensor.convert %filter2D_nhwc
    : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CDCC>
  %in2D_nhwc_CDCC = sparse_tensor.convert %in2D_nhwc
    : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32, #CDCC>

  %dense_ret = call @conv_2d_nhwc_hwcf(%in2D_nhwc, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
  %CCCC_ret = call @conv_2d_nhwc_hwcf_CCCC(%in2D_nhwc_CCCC, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32, #CCCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)
  %CDCC_ret = call @conv_2d_nhwc_hwcf_CDCC(%in2D_nhwc_CDCC, %filter2D_nhwc, %out2D_nhwc) : (tensor<?x?x?x?xf32, #CDCC>, tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)

  %dual_CDCC_ret = call @conv_2d_nhwc_hwcf_dual_CDCC(%in2D_nhwc_CDCC, %filter2D_nhwc_CDCC, %out2D_nhwc)
    : (tensor<?x?x?x?xf32, #CDCC>, tensor<?x?x?x?xf32, #CDCC>, tensor<?x?x?x?xf32>) -> (tensor<?x?x?x?xf32>)

  // Dense result.
  // CHECK: ( ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 520 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ),
  // CHECK-SAME: ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ),
  // CHECK-SAME: ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ) )
  %dense_v = vector.transfer_read %dense_ret[%c0, %c0, %c0, %c0], %zero
    : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
  vector.print %dense_v : vector<3x3x3x1xf32>

  // #CDCC input with #CDCC filter.
  // CHECK-NEXT: ( ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 520 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ),
  // CHECK-SAME: ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ),
  // CHECK-SAME: ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ) )
  %v_dual = vector.transfer_read %dual_CDCC_ret[%c0, %c0, %c0, %c0], %zero
    : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
  vector.print %v_dual : vector<3x3x3x1xf32>

  // #CCCC input with dense filter.
  // CHECK-NEXT: ( ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 520 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ),
  // CHECK-SAME: ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ),
  // CHECK-SAME: ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ) )
  %v1 = vector.transfer_read %CCCC_ret[%c0, %c0, %c0, %c0], %zero
    : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
  vector.print %v1 : vector<3x3x3x1xf32>

  // #CDCC input with dense filter. This must print %v2: printing %v1 again
  // would re-check the #CCCC result and leave the #CDCC result unverified.
  // CHECK-NEXT: ( ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 520 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ),
  // CHECK-SAME: ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ),
  // CHECK-SAME: ( ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ), ( ( 540 ), ( 540 ), ( 540 ) ) ) )
  %v2 = vector.transfer_read %CDCC_ret[%c0, %c0, %c0, %c0], %zero
    : tensor<?x?x?x?xf32>, vector<3x3x3x1xf32>
  vector.print %v2 : vector<3x3x3x1xf32>

  // Free the resources.
  bufferization.dealloc_tensor %in2D_nhwc : tensor<?x?x?x?xf32>
  bufferization.dealloc_tensor %filter2D_nhwc : tensor<?x?x?x?xf32>
  bufferization.dealloc_tensor %out2D_nhwc : tensor<?x?x?x?xf32>

  bufferization.dealloc_tensor %filter2D_nhwc_CDCC : tensor<?x?x?x?xf32, #CDCC>
  bufferization.dealloc_tensor %in2D_nhwc_CCCC : tensor<?x?x?x?xf32, #CCCC>
  bufferization.dealloc_tensor %in2D_nhwc_CDCC : tensor<?x?x?x?xf32, #CDCC>
  return
}