@@ -16441,18 +16441,56 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   return true;
 }
 
+bool GetDeinterleaveLeaves(Value *DI,
+                           SmallVectorImpl<Value *> &DeinterleaveUsers,
+                           SmallVectorImpl<Instruction *> &DeadInsts) {
+  if (!DI->hasNUses(2))
+    return false;
+
+  auto *Extr0 = *(++DI->user_begin());
+  auto *Extr1 = *(DI->user_begin());
+  if (!match(Extr0, m_ExtractValue<0>(m_Deinterleave2(m_Value()))))
+    return false;
+
+  auto *De1 = *(Extr0->user_begin());
+  if (!GetDeinterleaveLeaves(De1, DeinterleaveUsers, DeadInsts))
+    // Leaf extract.
+    DeinterleaveUsers.push_back(Extr0);
+  else {
+    // Parent extract that will not be used anymore.
+    DeadInsts.push_back(cast<Instruction>(De1));
+    DeadInsts.push_back(cast<Instruction>(Extr0));
+  }
+  auto *De2 = *(Extr1->user_begin());
+  if (!GetDeinterleaveLeaves(De2, DeinterleaveUsers, DeadInsts))
+    // Leaf extract.
+    DeinterleaveUsers.push_back(Extr1);
+  else {
+    // Parent extract that will not be used anymore.
+    DeadInsts.push_back(cast<Instruction>(De2));
+    DeadInsts.push_back(cast<Instruction>(Extr1));
+  }
+  return true;
+}
+
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    IntrinsicInst *DI, SmallVector<Value *> &LeafNodes, LoadInst *LI) const {
+    IntrinsicInst *DI, LoadInst *LI) const {
   // Only deinterleave2 supported at present.
   if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
     return false;
 
-  const unsigned Factor = std::max(2, (int)LeafNodes.size());
-
-  VectorType *VTy = (LeafNodes.size() > 0)
-                        ? cast<VectorType>(LeafNodes.front()->getType())
-                        : cast<VectorType>(DI->getType()->getContainedType(0));
+  SmallVector<Value *, 4> ValuesToDeinterleave;
+  SmallVector<Instruction *, 10> DeadInsts;
   const DataLayout &DL = DI->getModule()->getDataLayout();
+  unsigned Factor = 2;
+  VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+  if (GetDeinterleaveLeaves(DI, ValuesToDeinterleave, DeadInsts)) {
+    Factor = ValuesToDeinterleave.size();
+    VTy = cast<VectorType>(ValuesToDeinterleave[0]->getType());
+  }
+
+  assert(Factor >= 2 && "Expected Interleave Factor >= 2");
+
   bool UseScalable;
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
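Note on `GetDeinterleaveLeaves`: it leans on use-list order (uses are kept most-recent-first, so the `extractvalue` for field 0, created first, is the *second* user) and only pattern-matches `Extr0`; `Extr1` is assumed to be the matching field-1 extract. A minimal sketch of a more defensive lookup that accepts the two users in either order, reusing the existing `m_ExtractValue` and `m_Specific` matchers; the helper name is hypothetical and not part of this patch:

```cpp
// Hypothetical helper: find the two extractvalue users of a deinterleave2
// call without relying on use-list order.
static bool getDeinterleaveExtracts(Value *DI, Value *&Extr0, Value *&Extr1) {
  if (!DI->hasNUses(2))
    return false;
  auto It = DI->user_begin();
  Value *A = *It++;
  Value *B = *It;
  // Require exactly one extract of field 0 and one of field 1.
  if (match(A, m_ExtractValue<0>(m_Specific(DI))) &&
      match(B, m_ExtractValue<1>(m_Specific(DI)))) {
    Extr0 = A;
    Extr1 = B;
    return true;
  }
  if (match(A, m_ExtractValue<1>(m_Specific(DI))) &&
      match(B, m_ExtractValue<0>(m_Specific(DI)))) {
    Extr0 = B;
    Extr1 = A;
    return true;
  }
  return false;
}
```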
@@ -16463,7 +16501,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     return false;
 
   unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
-
   VectorType *LdTy =
       VectorType::get(VTy->getElementType(),
                       VTy->getElementCount().divideCoefficientBy(NumLoads));
@@ -16473,7 +16510,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
                                                 UseScalable, LdTy, PtrTy);
 
   IRBuilder<> Builder(LI);
-
   Value *Pred = nullptr;
   if (UseScalable)
     Pred =
@@ -16482,9 +16518,8 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
   Value *BaseAddr = LI->getPointerOperand();
   Value *Result;
   if (NumLoads > 1) {
-    Value *Left = PoisonValue::get(VTy);
-    Value *Right = PoisonValue::get(VTy);
-
+    // Create multiple legal small ldN instead of a wide one.
+    SmallVector<Value *, 4> WideValues(Factor, PoisonValue::get(VTy));
     for (unsigned I = 0; I < NumLoads; ++I) {
       Value *Offset = Builder.getInt64(I * Factor);
 
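To make the `NumLoads > 1` arithmetic concrete: each iteration advances the pointer by `Factor` `LdTy`-sized groups, and field `J` of each loaded group is inserted into `WideValues[J]` at an element offset of `I` times `LdTy`'s minimum element count. A standalone sketch with assumed example values (`Factor` = 4, `NumLoads` = 2, 4 elements per `LdTy`):

```cpp
#include <cstdio>

// Offsets used by the NumLoads > 1 path, under the assumed values above.
int main() {
  const unsigned Factor = 4, NumLoads = 2, LdTyMinElts = 4;
  for (unsigned I = 0; I < NumLoads; ++I) {
    // GEP offset in LdTy units: each ldN consumes Factor consecutive groups.
    unsigned GEPOffset = I * Factor;
    // Each loaded field lands at this element offset of its wide value.
    unsigned InsertIdx = I * LdTyMinElts;
    std::printf("load %u: gep offset %u, insert at element %u\n", I,
                GEPOffset, InsertIdx);
  }
}
```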
@@ -16494,49 +16529,71 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
         LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
       else
         LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
-
       Value *Idx =
           Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
-      Left = Builder.CreateInsertVector(
-          VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
-      Right = Builder.CreateInsertVector(
-          VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+      for (unsigned J = 0; J < Factor; ++J) {
+        WideValues[J] = Builder.CreateInsertVector(
+            VTy, WideValues[J], Builder.CreateExtractValue(LdN, J), Idx);
+      }
+    }
+    // FIXME: Build the result struct type from Factor; don't list it manually.
+    if (Factor == 2)
+      Result = PoisonValue::get(StructType::get(VTy, VTy));
+    else
+      Result = PoisonValue::get(StructType::get(VTy, VTy, VTy, VTy));
+    // Construct the wide result out of the small results.
+    for (unsigned J = 0; J < Factor; ++J) {
+      Result = Builder.CreateInsertValue(Result, WideValues[J], J);
     }
-
-    Result = PoisonValue::get(DI->getType());
-    Result = Builder.CreateInsertValue(Result, Left, 0);
-    Result = Builder.CreateInsertValue(Result, Right, 1);
   } else {
-    if (UseScalable) {
+    if (UseScalable)
       Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
-      if (Factor == 2) {
-        DI->replaceAllUsesWith(Result);
-        return true;
-      }
-      for (unsigned I = 0; I < LeafNodes.size(); I++) {
-        llvm::Value *CurrentExtract = LeafNodes[I];
-        Value *Newextrct = Builder.CreateExtractValue(Result, I);
-        CurrentExtract->replaceAllUsesWith(Newextrct);
-      }
-      return true;
-    } else
+    else
      Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
   }
+  if (Factor > 2) {
+    for (unsigned I = 0; I < ValuesToDeinterleave.size(); I++) {
+      Value *CurrentExtract = ValuesToDeinterleave[I];
+      Value *NewExtract = Builder.CreateExtractValue(Result, I);
+      CurrentExtract->replaceAllUsesWith(NewExtract);
+      cast<Instruction>(CurrentExtract)->eraseFromParent();
+    }
 
+    for (Instruction *Dead : DeadInsts)
+      Dead->eraseFromParent();
+    return true;
+  }
   DI->replaceAllUsesWith(Result);
   return true;
 }
 
+bool GetInterleaveLeaves(Value *II, SmallVectorImpl<Value *> &InterleaveOps) {
+  Value *Op0, *Op1;
+  if (!match(II, m_Interleave2(m_Value(Op0), m_Value(Op1))))
+    return false;
+
+  if (!GetInterleaveLeaves(Op0, InterleaveOps)) {
+    InterleaveOps.push_back(Op0);
+  }
+
+  if (!GetInterleaveLeaves(Op1, InterleaveOps)) {
+    InterleaveOps.push_back(Op1);
+  }
+  return true;
+}
+
 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
-    IntrinsicInst *II, SmallVector<Value *> &LeafNodes, StoreInst *SI) const {
+    IntrinsicInst *II, StoreInst *SI) const {
   // Only interleave2 supported at present.
   if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
     return false;
 
-  // leaf nodes are the nodes that will be interleaved
-  const unsigned Factor = LeafNodes.size();
+  SmallVector<Value *, 4> ValuesToInterleave;
+  GetInterleaveLeaves(II, ValuesToInterleave);
+  unsigned Factor = ValuesToInterleave.size();
+  assert(Factor >= 2 && "Expected Interleave Factor >= 2");
+  VectorType *VTy = cast<VectorType>(ValuesToInterleave[0]->getType());
 
-  VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
   const DataLayout &DL = II->getModule()->getDataLayout();
   bool UseScalable;
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
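One way to resolve the FIXME in the `NumLoads > 1` path above: `StructType::get` has an overload taking an `LLVMContext` and an `ArrayRef<Type *>`, so the result type can be built for any factor. A minimal sketch, assuming the enclosing function's `Factor` and `VTy`:

```cpp
// Build the ldN result struct type for an arbitrary Factor instead of
// hard-coding the two- and four-field cases (assumes Factor and VTy).
SmallVector<Type *, 4> ResultTypes(Factor, VTy);
Value *Result =
    PoisonValue::get(StructType::get(VTy->getContext(), ResultTypes));
```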
@@ -16566,28 +16623,26 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
     Pred =
         Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
 
-  Value *L = II->getOperand(0);
-  Value *R = II->getOperand(1);
-
+  auto InterleaveOps = ValuesToInterleave;
+  if (UseScalable)
+    ValuesToInterleave.push_back(Pred);
+  ValuesToInterleave.push_back(BaseAddr);
   for (unsigned I = 0; I < NumStores; ++I) {
     Value *Address = BaseAddr;
     if (NumStores > 1) {
       Value *Offset = Builder.getInt64(I * Factor);
       Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
-
       Value *Idx =
           Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
-      L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
-      R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+      for (unsigned J = 0; J < Factor; ++J) {
+        ValuesToInterleave[J] =
+            Builder.CreateExtractVector(StTy, InterleaveOps[J], Idx);
+      }
+      // Update the address operand for this iteration.
+      ValuesToInterleave.back() = Address;
     }
 
-    if (UseScalable) {
-      SmallVector<Value *> Args(LeafNodes);
-      Args.push_back(Pred);
-      Args.push_back(Address);
-      Builder.CreateCall(StNFunc, Args);
-    } else
-      Builder.CreateCall(StNFunc, {L, R, Address});
+    Builder.CreateCall(StNFunc, ValuesToInterleave);
   }
 
   return true;
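The single `Builder.CreateCall(StNFunc, ValuesToInterleave)` now covers both paths because the argument vector is packed the way the SVE and NEON stN intrinsics expect: data values first, then the governing predicate (scalable only), then the base address. A minimal sketch of that packing as a hypothetical standalone helper, not part of this patch:

```cpp
// Hypothetical helper: pack stN call arguments in the order the lowering
// relies on (values..., predicate if scalable, destination pointer).
static SmallVector<Value *, 6> packStNArgs(ArrayRef<Value *> Values,
                                           Value *Pred, Value *Addr,
                                           bool UseScalable) {
  SmallVector<Value *, 6> Args(Values.begin(), Values.end());
  if (UseScalable)
    Args.push_back(Pred); // Governing predicate comes after the data.
  Args.push_back(Addr);   // Destination pointer is always last.
  return Args;
}
```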