
Commit a6b4b02

[IA] Support [de]interleave{3,5,7}
1 parent c45fca8 commit a6b4b02
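The InterleavedAccess pass previously recognized only trees built from the llvm.vector.[de]interleave2 intrinsics, which restricts it to power-of-two factors. This change additionally accepts the single-intrinsic factor-3, factor-5 and factor-7 forms and lowers the surrounding loads and stores to the target's segmented accesses: the updated RISC-V tests below now select a single vlseg{3,5,7}e8.v instead of a wide load followed by stack spills and vslide shuffles.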

6 files changed (+168, -1260 lines)

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 56 additions & 19 deletions
@@ -571,6 +571,25 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
   return true;
 }
 
+static unsigned getIntrinsicFactor(const IntrinsicInst *II) {
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::vector_deinterleave2:
+  case Intrinsic::vector_interleave2:
+    return 2;
+  case Intrinsic::vector_deinterleave3:
+  case Intrinsic::vector_interleave3:
+    return 3;
+  case Intrinsic::vector_deinterleave5:
+  case Intrinsic::vector_interleave5:
+    return 5;
+  case Intrinsic::vector_deinterleave7:
+  case Intrinsic::vector_interleave7:
+    return 7;
+  default:
+    llvm_unreachable("Unexpected intrinsic");
+  }
+}
+
 // For an (de)interleave tree like this:
 //
 //   A C B D
@@ -586,7 +605,7 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
 // to reorder them by interleaving these values.
 static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
   unsigned NumLeaves = SubLeaves.size();
-  if (NumLeaves == 2)
+  if (NumLeaves == 2 || !isPowerOf2_64(NumLeaves))
     return;
 
   assert(isPowerOf2_32(NumLeaves) && NumLeaves > 1);
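The new early return is the heart of this hunk: leaves gathered from a single factor-3/5/7 intrinsic are already in operand order, so only power-of-two trees deeper than one level still need reordering. A minimal standalone sketch of that reordering (illustrative only; the names and scaffolding are ours, not from the patch):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

// Reorder leaves gathered by BFS over an interleave2 tree so they end
// up in operand order; non-power-of-two counts (factors 3/5/7) are
// already in order and are left untouched, mirroring the pass.
static void reorderLeaves(std::vector<std::string> &L, size_t Lo, size_t Hi) {
  size_t N = Hi - Lo;
  if (N <= 2 || (N & (N - 1)) != 0)
    return;
  size_t Half = N / 2;
  reorderLeaves(L, Lo, Lo + Half);
  reorderLeaves(L, Lo + Half, Hi);
  std::vector<std::string> Buf;
  for (size_t I = 0; I < Half; ++I) { // zip the two halves together
    Buf.push_back(L[Lo + I]);
    Buf.push_back(L[Lo + Half + I]);
  }
  std::copy(Buf.begin(), Buf.end(), L.begin() + Lo);
}

int main() {
  // BFS over interleave2(interleave2(A,B), interleave2(C,D)) collects
  // the leaves as A, C, B, D; zipping the halves restores A, B, C, D.
  std::vector<std::string> Leaves = {"A", "C", "B", "D"};
  reorderLeaves(Leaves, 0, Leaves.size());
  for (const auto &S : Leaves)
    std::cout << S << ' '; // prints: A B C D
  std::cout << '\n';
}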
@@ -608,7 +627,10 @@ static void interleaveLeafValues(MutableArrayRef<Value *> SubLeaves) {
 static bool
 getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
                           SmallVectorImpl<Instruction *> &DeadInsts) {
-  assert(II->getIntrinsicID() == Intrinsic::vector_interleave2);
+  assert(II->getIntrinsicID() == Intrinsic::vector_interleave2 ||
+         II->getIntrinsicID() == Intrinsic::vector_interleave3 ||
+         II->getIntrinsicID() == Intrinsic::vector_interleave5 ||
+         II->getIntrinsicID() == Intrinsic::vector_interleave7);
 
   // Visit with BFS
   SmallVector<IntrinsicInst *, 8> Queue;
@@ -620,7 +642,7 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
     // All the intermediate intrinsics will be deleted.
     DeadInsts.push_back(Current);
 
-    for (unsigned I = 0; I < 2; ++I) {
+    for (unsigned I = 0; I < getIntrinsicFactor(Current); ++I) {
       Value *Op = Current->getOperand(I);
       if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
         if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
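Note that although the loop now walks all getIntrinsicFactor(Current) operands, the unchanged test on the line below it still descends only into nested vector_interleave2 calls. In other words, trees are still composed purely of factor-2 nodes; a factor-3/5/7 intrinsic is matched only as the root of the pattern.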
@@ -638,9 +660,10 @@ getVectorInterleaveFactor(IntrinsicInst *II, SmallVectorImpl<Value *> &Operands,
   }
 
   const unsigned Factor = Operands.size();
-  // Currently we only recognize power-of-two factors.
+  // Currently we only recognize factors of 2, 3, 5 and 7.
   // FIXME: should we assert here instead?
-  if (Factor <= 1 || !isPowerOf2_32(Factor))
+  if (Factor <= 1 ||
+      (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
     return false;
 
   interleaveLeafValues(Operands);
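Written out as a standalone predicate (a sketch for exposition, not code from the patch), the acceptance condition says a collected factor is usable either because it is a power of two greater than one (an interleave2 tree) or because it equals the root intrinsic's own factor (a lone interleave3/5/7):

#include <cassert>

// Mirrors: Factor <= 1 || (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II))
static bool isAcceptedFactor(unsigned Factor, unsigned RootFactor) {
  if (Factor <= 1)
    return false;
  bool PowerOfTwo = (Factor & (Factor - 1)) == 0;
  return PowerOfTwo || Factor == RootFactor;
}

int main() {
  assert(isAcceptedFactor(4, 2));  // depth-2 interleave2 tree
  assert(isAcceptedFactor(3, 3));  // single interleave3 at the root
  assert(!isAcceptedFactor(6, 3)); // unsupported mixed factor
  assert(!isAcceptedFactor(1, 2)); // degenerate
}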
@@ -651,9 +674,12 @@ static bool
 getVectorDeinterleaveFactor(IntrinsicInst *II,
                             SmallVectorImpl<Value *> &Results,
                             SmallVectorImpl<Instruction *> &DeadInsts) {
-  assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2);
+  assert(II->getIntrinsicID() == Intrinsic::vector_deinterleave2 ||
+         II->getIntrinsicID() == Intrinsic::vector_deinterleave3 ||
+         II->getIntrinsicID() == Intrinsic::vector_deinterleave5 ||
+         II->getIntrinsicID() == Intrinsic::vector_deinterleave7);
   using namespace PatternMatch;
-  if (!II->hasNUses(2))
+  if (!II->hasNUses(getIntrinsicFactor(II)))
     return false;
 
   // Visit with BFS
@@ -662,12 +688,12 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
   while (!Queue.empty()) {
     IntrinsicInst *Current = Queue.front();
     Queue.erase(Queue.begin());
-    assert(Current->hasNUses(2));
+    assert(Current->hasNUses(getIntrinsicFactor(Current)));
 
     // All the intermediate intrinsics will be deleted from the bottom-up.
     DeadInsts.insert(DeadInsts.begin(), Current);
 
-    ExtractValueInst *LHS = nullptr, *RHS = nullptr;
+    SmallVector<ExtractValueInst *> EVs(getIntrinsicFactor(Current), nullptr);
     for (User *Usr : Current->users()) {
       if (!isa<ExtractValueInst>(Usr))
         return 0;
@@ -679,17 +705,15 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
       if (Indices.size() != 1)
         return false;
 
-      if (Indices[0] == 0 && !LHS)
-        LHS = EV;
-      else if (Indices[0] == 1 && !RHS)
-        RHS = EV;
+      if (!EVs[Indices[0]])
+        EVs[Indices[0]] = EV;
       else
         return false;
     }
 
     // We have legal indices. At this point we're either going
     // to continue the traversal or push the leaf values into Results.
-    for (ExtractValueInst *EV : {LHS, RHS}) {
+    for (ExtractValueInst *EV : EVs) {
       // Continue the traversal. We're playing safe here and matching only the
       // expression consisting of a perfectly balanced binary tree in which all
       // intermediate values are only used once.
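The LHS/RHS pair generalizes into a factor-sized EVs table keyed by each extractvalue's index: every index in [0, Factor) must be claimed exactly once, which reproduces the old two-use behavior at factor 2 and extends it to 3, 5 and 7 without further special cases.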
@@ -713,9 +737,10 @@ getVectorDeinterleaveFactor(IntrinsicInst *II,
   }
 
   const unsigned Factor = Results.size();
-  // Currently we only recognize power-of-two factors.
+  // Currently we only recognize factors of 2, 3, 5 and 7.
   // FIXME: should we assert here instead?
-  if (Factor <= 1 || !isPowerOf2_32(Factor))
+  if (Factor <= 1 ||
+      (!isPowerOf2_32(Factor) && Factor != getIntrinsicFactor(II)))
     return 0;
 
   interleaveLeafValues(Results);
@@ -878,11 +903,23 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
 
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
       // At present, we only have intrinsics to represent (de)interleaving
-      // with a factor of 2.
-      if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
+      // with a factor of 2,3,5 and 7.
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::vector_deinterleave2:
+      case Intrinsic::vector_deinterleave3:
+      case Intrinsic::vector_deinterleave5:
+      case Intrinsic::vector_deinterleave7:
         Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-      else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+        break;
+      case Intrinsic::vector_interleave2:
+      case Intrinsic::vector_interleave3:
+      case Intrinsic::vector_interleave5:
+      case Intrinsic::vector_interleave7:
         Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
+        break;
+      default:
+        break;
+      }
     }
   }

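For context, a hedged sketch of how IR that exercises the new paths might be created through IRBuilder; the helper name and scaffolding are ours, not part of this commit, and assume the interleave3 intrinsic's overload is deduced from the widened return type:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

// Build llvm.vector.interleave3(%a, %b, %c): three same-typed vectors
// in, one vector three times as long out (e.g. 3 x <8 x i8> -> <24 x i8>).
static Value *emitInterleave3(IRBuilder<> &Builder, Value *A, Value *B,
                              Value *C) {
  auto *VTy = cast<VectorType>(A->getType());
  auto *WideTy =
      VectorType::get(VTy->getElementType(), VTy->getElementCount() * 3);
  return Builder.CreateIntrinsic(WideTy, Intrinsic::vector_interleave3,
                                 {A, B, C});
}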
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-deinterleave-load.ll

Lines changed: 3 additions & 104 deletions
@@ -260,34 +260,8 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_load_v2f64_v4f64(ptr %p
 define { <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor3(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
-; CHECK-NEXT:    vsetivli zero, 24, e8, m2, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 8
-; CHECK-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v10, v8, 16
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v12, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v9, v10
-; CHECK-NEXT:    vs2r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vlseg3e8.v v6, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %vec = load <24 x i8>, ptr %p
   %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave3(<24 x i8> %vec)
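With deinterleave3 now recognized directly by the pass, the factor-3 test above collapses from a 24-byte vector load followed by a stack spill and a sequence of vslide shuffles into a single segmented load, vlseg3e8.v. The factor-5 and factor-7 tests below collapse the same way into vlseg5e8.v and vlseg7e8.v.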
@@ -327,42 +301,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_fact
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor5(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor5:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    add a1, a0, a0
-; CHECK-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v12, v8, 24
-; CHECK-NEXT:    vslidedown.vi v14, v8, 16
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v13, v8, 8
-; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v14, v12, a0
-; CHECK-NEXT:    vmv1r.v v12, v8
-; CHECK-NEXT:    vslideup.vx v12, v13, a0
-; CHECK-NEXT:    li a0, 32
-; CHECK-NEXT:    vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v8, v8, a0
-; CHECK-NEXT:    vmv1r.v v13, v14
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv2r.v v14, v8
-; CHECK-NEXT:    vs4r.v v12, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vlseg5e8.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %vec = load <40 x i8>, ptr %p
   %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave5(<40 x i8> %vec)
@@ -382,49 +322,8 @@ define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave
 define { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @vector_deinterleave_load_factor7(ptr %p) {
 ; CHECK-LABEL: vector_deinterleave_load_factor7:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 2
-; CHECK-NEXT:    sub sp, sp, a1
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb
-; CHECK-NEXT:    li a1, 56
-; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
-; CHECK-NEXT:    vle8.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 40
-; CHECK-NEXT:    li a2, 32
-; CHECK-NEXT:    vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v16, v8, a1
-; CHECK-NEXT:    li a1, 48
-; CHECK-NEXT:    srli a0, a0, 1
-; CHECK-NEXT:    vslidedown.vx v12, v8, a2
-; CHECK-NEXT:    add a2, a0, a0
-; CHECK-NEXT:    vsetivli zero, 8, e8, m2, ta, ma
-; CHECK-NEXT:    vslidedown.vi v14, v8, 24
-; CHECK-NEXT:    vslidedown.vi v18, v8, 16
-; CHECK-NEXT:    vsetivli zero, 8, e8, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vi v13, v8, 8
-; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v18, v14, a0
-; CHECK-NEXT:    vsetivli zero, 8, e8, m4, ta, ma
-; CHECK-NEXT:    vslidedown.vx v20, v8, a1
-; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v13, a0
-; CHECK-NEXT:    vslideup.vx v12, v16, a0
-; CHECK-NEXT:    vmv1r.v v9, v18
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v13, v20
-; CHECK-NEXT:    vmv2r.v v10, v12
-; CHECK-NEXT:    vs4r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, ma
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
 ; CHECK-NEXT:    vlseg7e8.v v8, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
   %vec = load <56 x i8>, ptr %p
   %d0 = call {<8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>} @llvm.vector.deinterleave7(<56 x i8> %vec)
