Skip to content

[MLIR][SROA] Replace pattern based approach with a one-shot one #85437

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 0 additions & 20 deletions mlir/include/mlir/Transforms/SROA.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,9 @@
#ifndef MLIR_TRANSFORMS_SROA_H
#define MLIR_TRANSFORMS_SROA_H

#include "mlir/IR/PatternMatch.h"
#include "mlir/Interfaces/MemorySlotInterfaces.h"
#include "mlir/Support/LogicalResult.h"
#include "llvm/ADT/Statistic.h"
#include <variant>

namespace mlir {

Expand All @@ -29,24 +27,6 @@ struct SROAStatistics {
llvm::Statistic *maxSubelementAmount = nullptr;
};

/// Pattern applying SROA to the regions of the operations on which it
/// matches.
class SROAPattern
: public OpInterfaceRewritePattern<DestructurableAllocationOpInterface> {
public:
using OpInterfaceRewritePattern::OpInterfaceRewritePattern;

SROAPattern(MLIRContext *context, SROAStatistics statistics = {},
PatternBenefit benefit = 1)
: OpInterfaceRewritePattern(context, benefit), statistics(statistics) {}

LogicalResult matchAndRewrite(DestructurableAllocationOpInterface allocator,
PatternRewriter &rewriter) const override;

private:
SROAStatistics statistics;
};

/// Attempts to destructure the slots of destructurable allocators. Returns
/// failure if no slot was destructured.
LogicalResult tryToDestructureMemorySlots(
Expand Down
41 changes: 28 additions & 13 deletions mlir/lib/Transforms/SROA.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
#include "mlir/Transforms/SROA.h"
#include "mlir/Analysis/SliceAnalysis.h"
#include "mlir/Interfaces/MemorySlotInterfaces.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "mlir/Transforms/Passes.h"

namespace mlir {
Expand Down Expand Up @@ -205,13 +204,6 @@ LogicalResult mlir::tryToDestructureMemorySlots(
return success(destructuredAny);
}

LogicalResult
SROAPattern::matchAndRewrite(DestructurableAllocationOpInterface allocator,
PatternRewriter &rewriter) const {
hasBoundedRewriteRecursion();
return tryToDestructureMemorySlots({allocator}, rewriter, statistics);
}

namespace {

struct SROA : public impl::SROABase<SROA> {
Expand All @@ -223,12 +215,35 @@ struct SROA : public impl::SROABase<SROA> {
SROAStatistics statistics{&destructuredAmount, &slotsWithMemoryBenefit,
&maxSubelementAmount};

RewritePatternSet rewritePatterns(&getContext());
rewritePatterns.add<SROAPattern>(&getContext(), statistics);
FrozenRewritePatternSet frozen(std::move(rewritePatterns));
bool changed = false;

for (Region &region : scopeOp->getRegions()) {
if (region.getBlocks().empty())
continue;

if (failed(applyPatternsAndFoldGreedily(scopeOp, frozen)))
signalPassFailure();
OpBuilder builder(&region.front(), region.front().begin());
IRRewriter rewriter(builder);

// Destructuring a slot can allow for further destructuring of other
// slots, destructuring is tried until no destructuring succeeds.
while (true) {
SmallVector<DestructurableAllocationOpInterface> allocators;
// Build a list of allocators to attempt to destructure the slots of.
// TODO: Update list on the fly to avoid repeated visiting of the same
// allocators.
region.walk([&](DestructurableAllocationOpInterface allocator) {
allocators.emplace_back(allocator);
});

if (failed(
tryToDestructureMemorySlots(allocators, rewriter, statistics)))
break;

changed = true;
}
}
if (!changed)
markAllAnalysesPreserved();
}
};

Expand Down
105 changes: 58 additions & 47 deletions mlir/test/Dialect/LLVMIR/sroa-intrinsics.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -146,21 +146,22 @@ llvm.func @invalid_indirect_memset() -> i32 {

// CHECK-LABEL: llvm.func @memset_double_use
llvm.func @memset_double_use() -> i32 {
// CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-DAG: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
// CHECK-DAG: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
// CHECK-DAG: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
// After SROA, only one i32 will be actually used, so only 4 bytes will be set.
// CHECK-DAG: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
// CHECK: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
// CHECK: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.alloca %0 x !llvm.struct<"foo", (i32, f32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
%memset_value = llvm.mlir.constant(42 : i8) : i8
// 8 bytes means it will span over the two i32 entries.
%memset_len = llvm.mlir.constant(8 : i32) : i32
// We expect two generated memset, one for each field.
// CHECK-NOT: "llvm.intr.memset"
// CHECK-DAG: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
// CHECK-DAG: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
// After SROA, only one i32 will be actually used, so only 4 bytes will be set.
// CHECK: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
// CHECK: %[[MEMSET_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN]]) <{isVolatile = false}>
// CHECK-NOT: "llvm.intr.memset"
"llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
%2 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"foo", (i32, f32)>
Expand Down Expand Up @@ -208,21 +209,21 @@ llvm.func @memset_considers_alignment() -> i32 {

// CHECK-LABEL: llvm.func @memset_considers_packing
llvm.func @memset_considers_packing() -> i32 {
// CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-DAG: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
// CHECK-DAG: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
// CHECK-DAG: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
// After SROA, only 32-bit values will be actually used, so only 4 bytes will be set.
// CHECK-DAG: %[[MEMSET_LEN_WHOLE:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK-DAG: %[[MEMSET_LEN_PARTIAL:.*]] = llvm.mlir.constant(3 : i32) : i32
// CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: %[[ALLOCA_FLOAT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x f32
// CHECK: %[[ALLOCA_INT:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
// CHECK: %[[MEMSET_VALUE:.*]] = llvm.mlir.constant(42 : i8) : i8
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.alloca %0 x !llvm.struct<"foo", packed (i8, i32, f32)> {alignment = 8 : i64} : (i32) -> !llvm.ptr
%memset_value = llvm.mlir.constant(42 : i8) : i8
// 8 bytes means it will span over all the fields, because there is no padding as the struct is packed.
%memset_len = llvm.mlir.constant(8 : i32) : i32
// Now all fields are touched by the memset.
// CHECK-NOT: "llvm.intr.memset"
// After SROA, only 32-bit values will be actually used, so only 4 bytes will be set.
// CHECK: %[[MEMSET_LEN_WHOLE:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memset"(%[[ALLOCA_INT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN_WHOLE]]) <{isVolatile = false}>
// CHECK: %[[MEMSET_LEN_PARTIAL:.*]] = llvm.mlir.constant(3 : i32) : i32
// CHECK: "llvm.intr.memset"(%[[ALLOCA_FLOAT]], %[[MEMSET_VALUE]], %[[MEMSET_LEN_PARTIAL]]) <{isVolatile = false}>
// CHECK-NOT: "llvm.intr.memset"
"llvm.intr.memset"(%1, %memset_value, %memset_len) <{isVolatile = false}> : (!llvm.ptr, i8, i32) -> ()
Expand All @@ -241,14 +242,14 @@ llvm.func @memset_considers_packing() -> i32 {
// CHECK-LABEL: llvm.func @memcpy_dest
// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 {
// CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
// After SROA, only one i32 will be actually used, so only 4 bytes will be set.
// CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr
%memcpy_len = llvm.mlir.constant(40 : i32) : i32
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
// After SROA, only one i32 will be actually used, so only 4 bytes will be set.
// CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memcpy"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMCPY_LEN]]) <{isVolatile = false}>
"llvm.intr.memcpy"(%1, %other_array, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
%2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
Expand All @@ -261,24 +262,27 @@ llvm.func @memcpy_dest(%other_array: !llvm.ptr) -> i32 {
// CHECK-LABEL: llvm.func @memcpy_src
// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {
// CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// After SROA, only one i32 will be actually used, so only 4 bytes will be set.
// CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr
%memcpy_len = llvm.mlir.constant(16 : i32) : i32
// Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from.
// We can only check that the amount of operations and allocated slots is correct, which should be sufficient
// as unused slots are not generated.
// CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
// CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
// CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
// CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK-DAG: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memcpy"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
"llvm.intr.memcpy"(%other_array, %1, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
%2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
%3 = llvm.load %2 : !llvm.ptr -> i32
Expand All @@ -289,14 +293,19 @@ llvm.func @memcpy_src(%other_array: !llvm.ptr) -> i32 {

// CHECK-LABEL: llvm.func @memcpy_double
llvm.func @memcpy_double() -> i32 {
// CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-DAG: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
%0 = llvm.mlir.constant(1 : i32) : i32
// CHECK-COUNT-2: = llvm.alloca %[[ALLOCA_LEN]] x i32
// CHECK: = llvm.alloca %[[ALLOCA_LEN]] x i32
// TODO: This should also disappear as a GEP with all zero indices should be
// ignored.
// CHECK: = llvm.alloca %[[ALLOCA_LEN]] x !llvm.array<1 x i32>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why does this no longer disappear?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Forgot to investigate. Will get back to you once I found the cause.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Found the issue:
This is due to changing from the rewriter to a builder. The rewriter was able to fold the GEPOp zero indexing GEPOp away, while this is not done be the builder.
Somehow, SROA is not happy with this intermediate GEP, which degrades it's performance in such cases.

I'll address this as part of the improvement efforts.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The rewriter was able to fold the GEPOp zero indexing GEPOp away, while this is not done be the builder.

It's not the builder/rewriter I believe, instead it is the Greedy driver doing a lot of cleanup like this on the fly.

That said you could do it yourself by using createOrFold instead of create if these are GEPOp you are creating. And otherwise have the algorithm match GEPOp and fold them as it goes.

%1 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr
%2 = llvm.alloca %0 x !llvm.array<1 x i32> : (i32) -> !llvm.ptr
// Match the dead constant, to avoid collision with the newly created one.
// CHECK: llvm.mlir.constant
%memcpy_len = llvm.mlir.constant(4 : i32) : i32
// CHECK-NOT: "llvm.intr.memcpy"
// CHECK: %[[MEMCPY_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memcpy"(%{{.*}}, %{{.*}}, %[[MEMCPY_LEN]]) <{isVolatile = false}>
// CHECK-NOT: "llvm.intr.memcpy"
"llvm.intr.memcpy"(%1, %2, %memcpy_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
Expand Down Expand Up @@ -346,14 +355,14 @@ llvm.func @memcpy_no_volatile(%other_array: !llvm.ptr) -> i32 {
// CHECK-LABEL: llvm.func @memmove_dest
// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 {
// CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-DAG: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
// After SROA, only one i32 will be actually used, so only 4 bytes will be set.
// CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK: %[[ALLOCA:.*]] = llvm.alloca %[[ALLOCA_LEN]] x i32
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.alloca %0 x !llvm.array<10 x i32> : (i32) -> !llvm.ptr
%memmove_len = llvm.mlir.constant(40 : i32) : i32
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
// After SROA, only one i32 will be actually used, so only 4 bytes will be set.
// CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memmove"(%[[ALLOCA]], %[[SLOT_IN_OTHER]], %[[MEMMOVE_LEN]]) <{isVolatile = false}>
"llvm.intr.memmove"(%1, %other_array, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
%2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<10 x i32>
Expand All @@ -366,24 +375,26 @@ llvm.func @memmove_dest(%other_array: !llvm.ptr) -> i32 {
// CHECK-LABEL: llvm.func @memmove_src
// CHECK-SAME: (%[[OTHER_ARRAY:.*]]: !llvm.ptr)
llvm.func @memmove_src(%other_array: !llvm.ptr) -> i32 {
// CHECK-DAG: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// After SROA, only one i32 will be actually used, so only 4 bytes will be set.
// CHECK-DAG: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: %[[ALLOCA_LEN:.*]] = llvm.mlir.constant(1 : i32) : i32
// CHECK-COUNT-4: = llvm.alloca %[[ALLOCA_LEN]] x i32
%0 = llvm.mlir.constant(1 : i32) : i32
%1 = llvm.alloca %0 x !llvm.array<4 x i32> : (i32) -> !llvm.ptr
%memmove_len = llvm.mlir.constant(16 : i32) : i32
// Unfortunately because of FileCheck limitations it is not possible to check which slot gets read from.
// We can only check that the amount of operations and allocated slots is correct, which should be sufficient
// as unused slots are not generated.
// CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
// CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
// CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
// CHECK-DAG: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK-DAG: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
// CHECK: %[[SLOT_IN_OTHER:.*]] = llvm.getelementptr %[[OTHER_ARRAY]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
// CHECK: %[[MEMMOVE_LEN:.*]] = llvm.mlir.constant(4 : i32) : i32
// CHECK: "llvm.intr.memmove"(%[[SLOT_IN_OTHER]], %{{.*}}, %[[MEMMOVE_LEN]]) <{isVolatile = false}>
"llvm.intr.memmove"(%other_array, %1, %memmove_len) <{isVolatile = false}> : (!llvm.ptr, !llvm.ptr, i32) -> ()
%2 = llvm.getelementptr %1[0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<4 x i32>
%3 = llvm.load %2 : !llvm.ptr -> i32
Expand Down