Skip to content

Commit 09c3d14

Browse files
authored
[IA] Add support for [de]interleave{3,5,7} (#139373)
This adds support for lowering deinterleave and interleave intrinsics for factors 3, 5, and 7 into target-specific memory intrinsics. Notably, this doesn't add support for handling higher factors constructed by nesting interleave intrinsics, e.g. factor 6 from interleave3 + interleave2. I initially tried this, but it became very complex very quickly. For example, because there are now multiple factors involved, interleaveLeafValues is no longer symmetric between interleaving and deinterleaving. There are then also two ways of representing a factor 6 deinterleave: it can be done as either 1 deinterleave3 and 3 deinterleave2s, or 1 deinterleave2 and 2 deinterleave3s. I'm not sure the complexity of supporting arbitrary factors is warranted given that we only need to support a small number of factors currently: SVE only needs factors 2, 3, and 4, whilst RVV only needs 2, 3, 4, 5, 6, 7, and 8. My preference would be to just add an interleave6 and a deinterleave6 intrinsic to avoid all this ambiguity, but I'll defer this discussion to a later patch.
1 parent 0641ca1 commit 09c3d14

File tree

8 files changed

+704
-21
lines changed

8 files changed

+704
-21
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3234,7 +3234,7 @@ class TargetLoweringBase {
32343234

32353235
/// Lower a deinterleave intrinsic to a target specific load intrinsic.
32363236
/// Return true on success. Currently only supports
3237-
/// llvm.vector.deinterleave2
3237+
/// llvm.vector.deinterleave{2,3,5,7}
32383238
///
32393239
/// \p LI is the accompanying load instruction.
32403240
/// \p DeinterleaveValues contains the deinterleaved values.
@@ -3246,7 +3246,7 @@ class TargetLoweringBase {
32463246

32473247
/// Lower an interleave intrinsic to a target specific store intrinsic.
32483248
/// Return true on success. Currently only supports
3249-
/// llvm.vector.interleave2
3249+
/// llvm.vector.interleave{2,3,5,7}
32503250
///
32513251
/// \p SI is the accompanying store instruction
32523252
/// \p InterleaveValues contains the interleaved values.

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 56 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -571,6 +571,25 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
571571
return true;
572572
}
573573

574+
static unsigned getIntrinsicFactor(const IntrinsicInst *II) {
575+
switch (II->getIntrinsicID()) {
576+
case Intrinsic::vector_deinterleave2:
577+
case Intrinsic::vector_interleave2:
578+
return 2;
579+
case Intrinsic::vector_deinterleave3:
580+
case Intrinsic::vector_interleave3:
581+
return 3;
582+
case Intrinsic::vector_deinterleave5:
583+
case Intrinsic::vector_interleave5:
584+
return 5;
585+
case Intrinsic::vector_deinterleave7:
586+
case Intrinsic::vector_interleave7:
587+
return 7;
588+
default:
589+
llvm_unreachable("Unexpected intrinsic");
590+
}
591+
}
592+
574593
// For an (de)interleave tree like this:
575594
//
576595
// A C B D
@@ -586,7 +605,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
586605
// to reorder them by interleaving these values.
587606
static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
588607
unsigned NumLeaves = SubLeaves.size();
589-
if (NumLeaves == 2)
608+
if (NumLeaves == 2 || !isPowerOf2_64(NumLeaves))
590609
return;
591610

592611
assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
@@ -608,7 +627,10 @@ static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
608627
static bool
609628
getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
610629
SmallVectorImpl<Instruction *> &DeadInsts) {
611-
assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);
630+
assert(II->getIntrinsicID() == Intrinsic::vector_interleave2 ||
631+
II->getIntrinsicID() == Intrinsic::vector_interleave3 ||
632+
II->getIntrinsicID() == Intrinsic::vector_interleave5 ||
633+
II->getIntrinsicID() == Intrinsic::vector_interleave7);
612634

613635
// Visit with BFS
614636
SmallVector<IntrinsicInst *, 8> Queue;
@@ -620,7 +642,7 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
620642
// All the intermediate intrinsics will be deleted.
621643
DeadInsts.push_back(Current);
622644

623-
for (unsigned I = 0; I < 2; ++I) {
645+
for (unsigned I = 0; I < getIntrinsicFactor(Current); ++I) {
624646
Value *Op = Current->getOperand(I);
625647
if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
626648
if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
@@ -638,9 +660,10 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
638660
}
639661

640662
const unsigned Factor = Operands.size();
641-
// Currently we only recognize power-of-two factors.
663+
// Currently we only recognize factors of 3, 5, 7, and powers of 2.
642664
// FIXME: should we assert here instead?
643-
if (Factor <= 1 || !isPowerOf2_32(Factor))
665+
if (Factor <= 1 ||
666+
(!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
644667
return false;
645668

646669
interleaveLeafValues(Operands);
@@ -651,9 +674,12 @@ static bool
651674
getVectorDeinterleaveFactor(IntrinsicInst *II,
652675
SmallVectorImpl<Value *> &Results,
653676
SmallVectorImpl<Instruction *> &DeadInsts) {
654-
assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
677+
assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2 ||
678+
II->getIntrinsicID() == Intrinsic::vector_deinterleave3 ||
679+
II->getIntrinsicID() == Intrinsic::vector_deinterleave5 ||
680+
II->getIntrinsicID() == Intrinsic::vector_deinterleave7);
655681
using namespace PatternMatch;
656-
if (!II->hasNUses(2))
682+
if (!II->hasNUses(getIntrinsicFactor(II)))
657683
return false;
658684

659685
// Visit with BFS
@@ -662,12 +688,12 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
662688
while (!Queue.empty()) {
663689
IntrinsicInst *Current = Queue.front();
664690
Queue.erase(Queue.begin());
665-
assert(Current->hasNUses(2));
691+
assert(Current->hasNUses(getIntrinsicFactor(Current)));
666692

667693
// All the intermediate intrinsics will be deleted from the bottom-up.
668694
DeadInsts.insert(DeadInsts.begin(), Current);
669695

670-
ExtractValueInst *LHS = nullptr, *RHS = nullptr;
696+
SmallVector<ExtractValueInst *> EVs(getIntrinsicFactor(Current), nullptr);
671697
for (User *Usr : Current->users()) {
672698
if (!isa<ExtractValueInst>(Usr))
673699
return 0;
@@ -679,17 +705,15 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
679705
if (Indices.size() != 1)
680706
return false;
681707

682-
if (Indices[0] == 0 && !LHS)
683-
LHS = EV;
684-
else if (Indices[0] == 1 && !RHS)
685-
RHS = EV;
708+
if (!EVs[Indices[0]])
709+
EVs[Indices[0]] = EV;
686710
else
687711
return false;
688712
}
689713

690714
// We have legal indices. At this point we're either going
691715
// to continue the traversal or push the leaf values into Results.
692-
for (ExtractValueInst *EV : {LHS, RHS}) {
716+
for (ExtractValueInst *EV : EVs) {
693717
// Continue the traversal. We're playing safe here and matching only the
694718
// expression consisting of a perfectly balanced binary tree in which all
695719
// intermediate values are only used once.
@@ -713,9 +737,10 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
713737
}
714738

715739
const unsigned Factor = Results.size();
716-
// Currently we only recognize power-of-two factors.
740+
// Currently we only recognize factors of 3, 5, 7, and powers of 2.
717741
// FIXME: should we assert here instead?
718-
if (Factor <= 1 || !isPowerOf2_32(Factor))
742+
if (Factor <= 1 ||
743+
(!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
719744
return 0;
720745

721746
interleaveLeafValues(Results);
@@ -878,11 +903,23 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
878903

879904
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
880905
// At present, we only have intrinsics to represent (de)interleaving
881-
// with a factor of 2.
882-
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
906+
// with a factor of 2,3,5 and 7.
907+
switch (II->getIntrinsicID()) {
908+
case Intrinsic::vector_deinterleave2:
909+
case Intrinsic::vector_deinterleave3:
910+
case Intrinsic::vector_deinterleave5:
911+
case Intrinsic::vector_deinterleave7:
883912
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
884-
else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
913+
break;
914+
case Intrinsic::vector_interleave2:
915+
case Intrinsic::vector_interleave3:
916+
case Intrinsic::vector_interleave5:
917+
case Intrinsic::vector_interleave7:
885918
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
919+
break;
920+
default:
921+
break;
922+
}
886923
}
887924
}
888925

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,23 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p
257257
ret {<2 x double>, <2 x double>} %res1
258258
}
259259

260+
define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p) {
261+
; CHECK-LABEL: vector_deinterleave_load_factor3:
262+
; CHECK: # %bb.0:
263+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
264+
; CHECK-NEXT: vlseg3e8.v v6, (a0)
265+
; CHECK-NEXT: ret
266+
%vec = load <24 x i8>, ptr %p
267+
%d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec)
268+
%t0 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 0
269+
%t1 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 1
270+
%t2 = extractvalue {<8 x i8>, <8 x i8>, <8 x i8>} %d0, 2
271+
%res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
272+
%res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 0
273+
%res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 0
274+
ret { <8 x i8>, <8 x i8>, <8 x i8> } %res2
275+
}
276+
260277
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor4(ptr %p) {
261278
; CHECK-LABEL: vector_deinterleave_load_factor4:
262279
; CHECK: # %bb.0:
@@ -281,6 +298,52 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact
281298
ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3
282299
}
283300

301+
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) {
302+
; CHECK-LABEL: vector_deinterleave_load_factor5:
303+
; CHECK: # %bb.0:
304+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
305+
; CHECK-NEXT: vlseg5e8.v v8, (a0)
306+
; CHECK-NEXT: ret
307+
%vec = load <40 x i8>, ptr %p
308+
%d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave5(<40 x i8> %vec)
309+
%t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0
310+
%t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1
311+
%t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2
312+
%t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3
313+
%t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4
314+
%res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
315+
%res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
316+
%res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
317+
%res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3
318+
%res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4
319+
ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res4
320+
}
321+
322+
define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor7(ptr %p) {
323+
; CHECK-LABEL: vector_deinterleave_load_factor7:
324+
; CHECK: # %bb.0:
325+
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
326+
; CHECK-NEXT: vlseg7e8.v v8, (a0)
327+
; CHECK-NEXT: ret
328+
%vec = load <56 x i8>, ptr %p
329+
%d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave7(<56 x i8> %vec)
330+
%t0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 0
331+
%t1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 1
332+
%t2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 2
333+
%t3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 3
334+
%t4 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 4
335+
%t5 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 5
336+
%t6 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %d0, 6
337+
%res0 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } poison, <8 x i8> %t0, 0
338+
%res1 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res0, <8 x i8> %t1, 1
339+
%res2 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res1, <8 x i8> %t2, 2
340+
%res3 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res2, <8 x i8> %t3, 3
341+
%res4 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t4, 4
342+
%res5 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t5, 5
343+
%res6 = insertvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res3, <8 x i8> %t6, 6
344+
ret { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %res6
345+
}
346+
284347
define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave_load_factor8(ptr %ptr) {
285348
; CHECK-LABEL: vector_deinterleave_load_factor8:
286349
; CHECK: # %bb.0:

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleave-store.ll

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,17 @@ define void @vector_interleave_store_v4f64_v2f64(<2 x double> %a, <2 x double> %
181181
ret void
182182
}
183183

184+
define void @vector_interleave_store_factor3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, ptr %p) {
185+
; CHECK-LABEL: vector_interleave_store_factor3:
186+
; CHECK: # %bb.0:
187+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
188+
; CHECK-NEXT: vsseg3e32.v v8, (a0)
189+
; CHECK-NEXT: ret
190+
%v = call <12 x i32> @llvm.vector.interleave3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c)
191+
store <12 x i32> %v, ptr %p
192+
ret void
193+
}
194+
184195
define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, ptr %p) {
185196
; CHECK-LABEL: vector_interleave_store_factor4:
186197
; CHECK: # %bb.0:
@@ -194,6 +205,28 @@ define void @vector_interleave_store_factor4(<4 x i32> %a, <4 x i32> %b, <4 x i3
194205
ret void
195206
}
196207

208+
define void @vector_interleave_store_factor5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, ptr %p) {
209+
; CHECK-LABEL: vector_interleave_store_factor5:
210+
; CHECK: # %bb.0:
211+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
212+
; CHECK-NEXT: vsseg5e32.v v8, (a0)
213+
; CHECK-NEXT: ret
214+
%v = call <20 x i32> @llvm.vector.interleave5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e)
215+
store <20 x i32> %v, ptr %p
216+
ret void
217+
}
218+
219+
define void @vector_interleave_store_factor7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, ptr %p) {
220+
; CHECK-LABEL: vector_interleave_store_factor7:
221+
; CHECK: # %bb.0:
222+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
223+
; CHECK-NEXT: vsseg7e32.v v8, (a0)
224+
; CHECK-NEXT: ret
225+
%v = call <28 x i32> @llvm.vector.interleave7(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g)
226+
store <28 x i32> %v, ptr %p
227+
ret void
228+
}
229+
197230
define void @vector_interleave_store_factor8(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, <4 x i32> %e, <4 x i32> %f, <4 x i32> %g, <4 x i32> %h, ptr %p) {
198231
; CHECK-LABEL: vector_interleave_store_factor8:
199232
; CHECK: # %bb.0:

0 commit comments

Comments
 (0)