#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+ #include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
@@ -248,6 +249,186 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
  return false;
}

+ // For a (de)interleave tree like this:
+ //
+ //   A   C B   D
+ //   |___| |___|
+ //     |_____|
+ //        |
+ //     A B C D
+ //
+ // We will get ABCD at the end while the leaf operands/results
+ // are ACBD, which are also what we initially collected in
+ // getVectorInterleaveFactor / getVectorDeinterleaveFactor. But TLI
+ // hooks (e.g. lowerInterleavedScalableLoad) expect ABCD, so we need
+ // to reorder them by interleaving these values.
+ static void interleaveLeafValues(SmallVectorImpl<Value *> &Leaves) {
+   unsigned Factor = Leaves.size();
+   assert(isPowerOf2_32(Factor) && Factor <= 8 && Factor > 1);
+
+   if (Factor == 2)
+     return;
+
+   SmallVector<Value *, 8> Buffer;
+   if (Factor == 4) {
+     for (unsigned SrcIdx : {0, 2, 1, 3})
+       Buffer.push_back(Leaves[SrcIdx]);
+   } else {
+     // Factor of 8.
+     //
+     //   A E C G B F D H
+     //   |_| |_| |_| |_|
+     //    |___|   |___|
+     //      |_______|
+     //          |
+     //   A B C D E F G H
+     for (unsigned SrcIdx : {0, 4, 2, 6, 1, 5, 3, 7})
+       Buffer.push_back(Leaves[SrcIdx]);
+   }
+
+   llvm::copy(Buffer, Leaves.begin());
+ }
+
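+ // To illustrate the reordering with a concrete (assumed) example: for
+ // Factor == 4 the leaves arrive as {A, C, B, D}, and the {0, 2, 1, 3}
+ // permutation above rebuilds {A, B, C, D}:
+ //   Buffer[0] = Leaves[0] = A    Buffer[1] = Leaves[2] = B
+ //   Buffer[2] = Leaves[1] = C    Buffer[3] = Leaves[3] = D
+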
+ static unsigned getVectorInterleaveFactor(IntrinsicInst *II,
+                                           SmallVectorImpl<Value *> &Operands) {
+   if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
+     return 0;
+
+   unsigned Factor = 0;
+
+   // Visit with BFS
+   SmallVector<IntrinsicInst *, 8> Queue;
+   Queue.push_back(II);
+   while (!Queue.empty()) {
+     IntrinsicInst *Current = Queue.front();
+     Queue.erase(Queue.begin());
+
+     for (unsigned I = 0; I < 2; ++I) {
+       Value *Op = Current->getOperand(I);
+       if (auto *OpII = dyn_cast<IntrinsicInst>(Op))
+         if (OpII->getIntrinsicID() == Intrinsic::vector_interleave2) {
+           Queue.push_back(OpII);
+           continue;
+         }
+
+       ++Factor;
+       Operands.push_back(Op);
+     }
+   }
+
+   // Currently we only recognize power-of-two factors.
+   // FIXME: should we assert here instead?
+   if (Factor > 1 && isPowerOf2_32(Factor)) {
+     interleaveLeafValues(Operands);
+     return Factor;
+   }
+   return 0;
+ }
+
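+ // For reference, a factor-4 interleave tree looks roughly like this in
+ // IR (a hand-written sketch with types and intrinsic-name mangling
+ // abbreviated, not taken from an actual test):
+ //
+ //   %lo = call @llvm.vector.interleave2(%A, %C)
+ //   %hi = call @llvm.vector.interleave2(%B, %D)
+ //   %w  = call @llvm.vector.interleave2(%lo, %hi)
+ //
+ // The BFS above collects the leaves as {A, C, B, D}, and
+ // interleaveLeafValues reorders them to {A, B, C, D}.
+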
+ /// Check the interleaved mask
+ ///
+ /// - if the value within the returned optional is non-null, it corresponds
+ ///   to the deinterleaved mask
+ /// - if the value within the returned optional is null, it corresponds to
+ ///   an all-true mask
+ /// - return std::nullopt if the mask cannot be deinterleaved
+ static std::optional<Value *> getMask(Value *WideMask, unsigned Factor) {
+   using namespace llvm::PatternMatch;
+   if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
+     SmallVector<Value *, 8> Operands;
+     if (unsigned MaskFactor = getVectorInterleaveFactor(IMI, Operands)) {
+       assert(!Operands.empty());
+       // Only an interleave of one and the same mask value deinterleaves
+       // back to that value, so all leaf operands must be identical.
+       if (MaskFactor == Factor && llvm::all_equal(Operands))
+         return Operands.front();
+     }
+   }
+   if (match(WideMask, m_AllOnes()))
+     return nullptr;
+   return std::nullopt;
+ }
+
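+ // For example (a sketch, assuming Factor == 2): a wide mask built as
+ //
+ //   %wide.mask = call @llvm.vector.interleave2(%m, %m)
+ //
+ // deinterleaves back to %m for every field, so getMask returns %m. A
+ // constant all-ones wide mask yields nullptr (all-true), and anything
+ // else yields std::nullopt.
+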
+ static unsigned getVectorDeInterleaveFactor(IntrinsicInst *II,
+                                             SmallVectorImpl<Value *> &Results) {
+   using namespace PatternMatch;
+   if (II->getIntrinsicID() != Intrinsic::vector_deinterleave2 ||
+       !II->hasNUses(2))
+     return 0;
+
+   unsigned Factor = 0;
+
+   // Visit with BFS
+   SmallVector<IntrinsicInst *, 8> Queue;
+   Queue.push_back(II);
+   while (!Queue.empty()) {
+     IntrinsicInst *Current = Queue.front();
+     Queue.erase(Queue.begin());
+     assert(Current->hasNUses(2));
+
+     unsigned VisitedIdx = 0;
+     for (User *Usr : Current->users()) {
+       // We're playing it safe here and match only expressions consisting
+       // of a perfectly balanced binary tree in which all intermediate
+       // values are used only once.
+       if (!Usr->hasOneUse() || !isa<ExtractValueInst>(Usr))
+         return 0;
+
+       auto *EV = cast<ExtractValueInst>(Usr);
+       ArrayRef<unsigned> Indices = EV->getIndices();
+       if (Indices.size() != 1 || Indices[0] >= 2)
+         return 0;
+
+       // The idea is that we don't want two extractvalues on the same
+       // index. So we XOR (index + 1) onto VisitedIdx such that if there
+       // is any duplication, VisitedIdx will be zero.
+       VisitedIdx ^= Indices[0] + 1;
+       if (!VisitedIdx)
+         return 0;
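+       // Spelled out, the two legal index orders are:
+       //   {0, 1}: VisitedIdx = 0 ^ 1 = 1, then 1 ^ 2 = 3 (0b11)
+       //   {1, 0}: VisitedIdx = 0 ^ 2 = 2, then 2 ^ 1 = 3 (0b11)
+       // while a duplicate such as {1, 1} gives 0 ^ 2 = 2, then
+       // 2 ^ 2 = 0, which the check above rejects.
+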
+       // We have a legal index. At this point we're either going
+       // to continue the traversal or push the leaf values into Results.
+       // But in either case we need to follow the order imposed by
+       // ExtractValue's indices and swap with the last element pushed
+       // into Queue/Results if necessary (this is also one of the main
+       // reasons for using BFS instead of DFS here, by the way).
+
+       // When VisitedIdx equals 0b11, we're at the last visited
+       // ExtractValue. So if the current index is 0, we need to swap.
+       // Conversely, when we're either the first visited ExtractValue or
+       // the last operand in Queue/Results is of index 0, there is no
+       // need to swap.
+       bool SwapWithLast = VisitedIdx == 0b11 && Indices[0] == 0;
+
+       // Continue the traversal.
+       if (match(EV->user_back(),
+                 m_Intrinsic<Intrinsic::vector_deinterleave2>()) &&
+           EV->user_back()->hasNUses(2)) {
+         auto *EVUsr = cast<IntrinsicInst>(EV->user_back());
+         if (SwapWithLast)
+           Queue.insert(Queue.end() - 1, EVUsr);
+         else
+           Queue.push_back(EVUsr);
+         continue;
+       }
+
+       // Save the leaf value.
+       if (SwapWithLast)
+         Results.insert(Results.end() - 1, EV);
+       else
+         Results.push_back(EV);
+
+       ++Factor;
+     }
+   }
+
+   // Currently we only recognize power-of-two factors.
+   // FIXME: should we assert here instead?
+   if (Factor > 1 && isPowerOf2_32(Factor)) {
+     interleaveLeafValues(Results);
+     return Factor;
+   }
+   return 0;
+ }
+
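+ // For reference, the factor-4 deinterleave tree walked above looks
+ // roughly like this in IR (a hand-written sketch with types and
+ // intrinsic-name mangling abbreviated):
+ //
+ //   %w  = call @llvm.vector.deinterleave2(%wide)
+ //   %e0 = extractvalue %w, 0
+ //   %e1 = extractvalue %w, 1
+ //   %d0 = call @llvm.vector.deinterleave2(%e0) ;; leaves A and C
+ //   %d1 = call @llvm.vector.deinterleave2(%e1) ;; leaves B and D
+ //
+ // The leaf extractvalues are collected as {A, C, B, D} and reordered by
+ // interleaveLeafValues to {A, B, C, D}.
+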
bool InterleavedAccessImpl::lowerInterleavedLoad(
    LoadInst *LI, SmallSetVector<Instruction *, 32> &DeadInsts) {
  if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
@@ -480,6 +661,81 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
    IntrinsicInst *DI, SmallSetVector<Instruction *, 32> &DeadInsts) {
+   using namespace PatternMatch;
+   SmallVector<Value *, 8> DeInterleaveResults;
+   unsigned Factor = getVectorDeInterleaveFactor(DI, DeInterleaveResults);
+
+   if (auto *VPLoad = dyn_cast<VPIntrinsic>(DI->getOperand(0));
+       Factor && VPLoad) {
+     if (!match(VPLoad, m_OneUse(m_Intrinsic<Intrinsic::vp_load>())))
+       return false;
+
+     // Check the mask operand. Handle both all-true and interleaved masks.
+     Value *WideMask = VPLoad->getOperand(1);
+     std::optional<Value *> Mask = getMask(WideMask, Factor);
+     if (!Mask)
+       return false;
+
+     LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
+                       << "\n");
+
+     // Since lowerInterleavedLoad expects Shuffles and a LoadInst, use a
+     // dedicated TLI hook to emit the target-specific interleaved
+     // instruction.
+     if (!TLI->lowerInterleavedScalableLoad(VPLoad, *Mask, DI, Factor,
+                                            DeInterleaveResults))
+       return false;
+
+     DeadInsts.insert(DI);
+     DeadInsts.insert(VPLoad);
+     return true;
+   }
+
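+   // The vp.load case above matches, roughly, the following shape (a
+   // sketch for a factor of 2, with types and intrinsic-name mangling
+   // abbreviated):
+   //
+   //   %w = call @llvm.vp.load(%ptr, %wide.mask, %evl)
+   //   %d = call @llvm.vector.deinterleave2(%w)
+   //
+   // which a target hook such as RISC-V's can lower to a segmented load
+   // (e.g. vlseg2e32).
+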
+   // Match
+   //   %x  = vp.strided.load    ;; VPStridedLoad
+   //   %y  = bitcast %x         ;; BitCast
+   //   %y' = inttoptr %y        ;; optional
+   //   %z  = deinterleave %y'   ;; DI
+   if (Factor && isa<BitCastInst, IntToPtrInst>(DI->getOperand(0))) {
+     auto *BitCast = cast<Instruction>(DI->getOperand(0));
+     if (!BitCast->hasOneUse())
+       return false;
+
+     Instruction *IntToPtrCast = nullptr;
+     if (auto *BC = dyn_cast<BitCastInst>(BitCast->getOperand(0))) {
+       IntToPtrCast = BitCast;
+       BitCast = BC;
+     }
+
+     // Match the types:
+     //   <VF x (factor * elementTy)> bitcast to <(VF * factor) x elementTy>
+     Value *BitCastSrc = BitCast->getOperand(0);
+     auto *BitCastSrcTy = dyn_cast<VectorType>(BitCastSrc->getType());
+     auto *BitCastDstTy = cast<VectorType>(BitCast->getType());
+     if (!BitCastSrcTy || (BitCastSrcTy->getElementCount() * Factor !=
+                           BitCastDstTy->getElementCount()))
+       return false;
+
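+     // As a concrete (hypothetical) instance with Factor == 2, the
+     // bitcast being matched would look like
+     //   %y = bitcast <vscale x 2 x i64> %x to <vscale x 4 x i32>
+     // where each 64-bit source element carries one 32-bit element for
+     // each of the two deinterleaved fields.
+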
+     if (auto *VPStridedLoad = dyn_cast<VPIntrinsic>(BitCast->getOperand(0))) {
+       if (VPStridedLoad->getIntrinsicID() !=
+               Intrinsic::experimental_vp_strided_load ||
+           !VPStridedLoad->hasOneUse())
+         return false;
+
+       LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI
+                         << "\n");
+
+       if (!TLI->lowerDeinterleaveIntrinsicToStridedLoad(
+               VPStridedLoad, DI, Factor, DeInterleaveResults))
+         return false;
+
+       DeadInsts.insert(DI);
+       if (IntToPtrCast)
+         DeadInsts.insert(IntToPtrCast);
+       DeadInsts.insert(BitCast);
+       DeadInsts.insert(VPStridedLoad);
+       return true;
+     }
+   }
+
  LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));

  if (!LI || !LI->hasOneUse() || !LI->isSimple())
@@ -504,6 +760,33 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
  if (!II->hasOneUse())
    return false;

+   if (auto *VPStore = dyn_cast<VPIntrinsic>(*(II->users().begin()))) {
+     if (VPStore->getIntrinsicID() != Intrinsic::vp_store)
+       return false;
+
+     SmallVector<Value *, 8> InterleaveOperands;
+     unsigned Factor = getVectorInterleaveFactor(II, InterleaveOperands);
+     if (!Factor)
+       return false;
+
+     Value *WideMask = VPStore->getOperand(2);
+     std::optional<Value *> Mask = getMask(WideMask, Factor);
+     if (!Mask)
+       return false;
+
+     LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II
+                       << "\n");
+
+     // Since lowerInterleavedStore expects Shuffles and a StoreInst, use a
+     // dedicated TLI hook to emit the target-specific interleaved
+     // instruction.
+     if (!TLI->lowerInterleavedScalableStore(VPStore, *Mask, II, Factor,
+                                             InterleaveOperands))
+       return false;
+
+     DeadInsts.insert(VPStore);
+     DeadInsts.insert(II);
+     return true;
+   }
+
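+   // The vp.store case above matches, roughly, the following shape (a
+   // sketch for a factor of 2, with types and intrinsic-name mangling
+   // abbreviated):
+   //
+   //   %w = call @llvm.vector.interleave2(%a, %b)
+   //   call @llvm.vp.store(%w, %ptr, %wide.mask, %evl)
+   //
+   // which a target hook such as RISC-V's can lower to a segmented store
+   // (e.g. vsseg2e32).
+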
  StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));

  if (!SI || !SI->isSimple())