Add gather4_masked_scaled2 and gather_masked_scaled2 intrinsics

azabazno · azabazno · commit 50326439b1d0 · 2020-12-02T12:03:41.000+03:00
diff --git a/GenXIntrinsics/include/llvm/GenXIntrinsics/GenXSimdCFLowering.h b/GenXIntrinsics/include/llvm/GenXIntrinsics/GenXSimdCFLowering.h
@@ -72,6 +72,12 @@ class CMSimdCFLower {
   std::set<AssertingVH<Value>> AlreadyPredicated;
   // Mask for shufflevector to extract part of EM.
   SmallVector<Constant *, 32> ShuffleMask;
+  // Original predicate of an instruction, saved before that predicate was
+  // replaced by its AND with EM
+  std::map<Instruction *, Value *> OriginalPred;
+  // Replicate mask for provided number of channels
+  Value *replicateMask(Value *EM, Instruction *InsertBefore, unsigned SimdWidth,
+                       unsigned NumChannels = 1);
 
 public:
   static const unsigned MAX_SIMD_CF_WIDTH = 32;
@@ -106,7 +112,7 @@ class CMSimdCFLower {
   void lowerSimdCF();
   void lowerUnmaskOps();
   unsigned deduceNumChannels(Instruction *SI);
-  Instruction *loadExecutionMask(Instruction *InsertBefore, unsigned SimdWidth, unsigned NumChannels = 1);
+  Instruction *loadExecutionMask(Instruction *InsertBefore, unsigned SimdWidth);
   Value *getRMAddr(BasicBlock *JP, unsigned SimdWidth);
 };
 
diff --git a/GenXIntrinsics/include/llvm/GenXIntrinsics/Intrinsic_definitions.py b/GenXIntrinsics/include/llvm/GenXIntrinsics/Intrinsic_definitions.py
@@ -1804,6 +1804,23 @@
 ###
     "gather_scaled2" : ["anyvector",["int","short","int","int","anyint"],"ReadMem"],
 
+### ``llvm.genx.gather.masked.scaled2`` : vISA GATHER_SCALED instruction
+### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+###
+###
+### * (Exec_size inferred from element offset type)
+### * arg0: i32 log2 num blocks, constant (0/1/2 for num blocks 1/2/4)
+### * arg1: i16 scale, constant
+### * arg2: i32 surface index
+### * arg3: i32 global offset in bytes
+### * arg4: vXi32 element offset in bytes (overloaded)
+### * arg5: vXi1 predicate (overloaded)
+###
+### * Return value: the data read
+###
+    "gather_masked_scaled2" : ["anyvector",["int","short","int","int","anyint","anyvector"],"ReadMem"],
+
+
 ### ``llvm.genx.gather4.scaled.<return type>.<vector type>.<any int>`` : vISA GATHER4_SCALED instruction
 ### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 ###
@@ -1859,6 +1876,22 @@
 ###
     "gather4_scaled2" : ["anyvector",["int","short","int","int","anyint"],"ReadMem"],
 
+### ``llvm.genx.gather4.masked.scaled2`` : vISA GATHER4_SCALED instruction
+### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+###
+### * (Exec_size inferred from element offset type)
+### * arg0: i32 channel mask, constant
+### * arg1: i16 scale, constant
+### * arg2: i32 surface index
+### * arg3: i32 global offset in bytes
+### * arg4: vXi32 element offset in bytes (overloaded)
+### * arg5: vXi1 predicate (overloaded)
+###
+### * Return value: the data read
+###
+    "gather4_masked_scaled2" : ["anyvector",["int","short","int","int","anyint","anyvector"],"ReadMem"],
+
+
 ### ``llvm.genx.gather4.typed.<return type>.<vector type>.<vector type>`` : vISA GATHER4_TYPED instruction
 ### ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 ###
diff --git a/GenXIntrinsics/lib/GenXIntrinsics/GenXSimdCFLowering.cpp b/GenXIntrinsics/lib/GenXIntrinsics/GenXSimdCFLowering.cpp
@@ -535,6 +535,14 @@ void CMSimdCFLower::processFunction(Function *ArgF)
   unsigned CMWidth = PredicatedSubroutines[F];
   // Find the simd branches.
   bool FoundSIMD = findSimdBranches(CMWidth);
+
+  // Create shuffle mask for EM adjustment
+  if (ShuffleMask.empty()) {
+    auto I32Ty = Type::getInt32Ty(F->getContext());
+    for (unsigned i = 0; i != 32; ++i)
+      ShuffleMask.push_back(ConstantInt::get(I32Ty, i));
+  }
+
   if (CMWidth > 0 || FoundSIMD) {
     // Determine which basic blocks need to be predicated.
     determinePredicatedBlocks();
@@ -555,10 +563,13 @@ void CMSimdCFLower::processFunction(Function *ArgF)
     lowerSimdCF();
     lowerUnmaskOps();
   }
+
+  ShuffleMask.clear();
   SimdBranches.clear();
   PredicatedBlocks.clear();
   JoinPoints.clear();
   RMAddrs.clear();
+  OriginalPred.clear();
   AlreadyPredicated.clear();
 }
 
@@ -1214,6 +1225,7 @@ unsigned CMSimdCFLower::deduceNumChannels(Instruction *SI) {
   // If it's not a function call then check for a specific instruction
   unsigned IID = GenXIntrinsic::getGenXIntrinsicID(CI);
   switch (IID) {
+  case GenXIntrinsic::genx_gather4_masked_scaled2:
   case GenXIntrinsic::genx_gather4_scaled2: {
     unsigned AddrElems = VCINTR::VectorType::getNumElements(
         cast<VectorType>(CI->getOperand(4)->getType()));
@@ -1262,6 +1274,7 @@ void CMSimdCFLower::predicateStore(Instruction *SI, unsigned SimdWidth)
   CallInst *WrRegionToPredicate = nullptr;
   Use *U = &SI->getOperandUse(0);
   Use *UseNeedsUpdate = nullptr;
+  Value *ExistingPred = nullptr;
   for (;;) {
     if (auto BC = dyn_cast<BitCastInst>(V)) {
       U = &BC->getOperandUse(0);
@@ -1277,6 +1290,15 @@ void CMSimdCFLower::predicateStore(Instruction *SI, unsigned SimdWidth)
     unsigned IID = GenXIntrinsic::getGenXIntrinsicID(WrRegion);
     if (IID != GenXIntrinsic::genx_wrregioni
          && IID != GenXIntrinsic::genx_wrregionf) {
+      // genx_gather4_masked_scaled2 is slightly different: it has a predicate
+      // operand, and its users have to be predicated as well since it returns
+      // a value wider than the execution size
+      if (IID == GenXIntrinsic::genx_gather4_masked_scaled2) {
+        assert(AlreadyPredicated.find(WrRegion) != AlreadyPredicated.end());
+        if (OriginalPred.count(WrRegion))
+          ExistingPred = OriginalPred[WrRegion];
+        break;
+      }
       // Not wrregion. See if it is an intrinsic that has already been
       // predicated; if so do not attempt to predicate the store.
       if (AlreadyPredicated.find(WrRegion) != AlreadyPredicated.end())
@@ -1361,7 +1383,19 @@ void CMSimdCFLower::predicateStore(Instruction *SI, unsigned SimdWidth)
     Load = CallInst::Create(Fn, Addr, ".simdcfpred.vload", SI);
   }
   Load->setDebugLoc(SI->getDebugLoc());
-  auto EM = loadExecutionMask(SI, SimdWidth, NumChannels);
+  Value *EM = loadExecutionMask(SI, SimdWidth);
+
+  // If there was a predicate already then update it with current EM
+  if (ExistingPred) {
+    EM = BinaryOperator::Create(
+        Instruction::And, ExistingPred, EM,
+        ExistingPred->getName() + ".and." + EM->getName(), SI);
+    cast<Instruction>(EM)->setDebugLoc(SI->getDebugLoc());
+  }
+
+  // Replicate mask for each channel if needed
+  EM = replicateMask(EM, SI, SimdWidth, NumChannels);
+
   auto Select = SelectInst::Create(EM, SI->getOperand(0), Load,
       SI->getOperand(0)->getName() + ".simdcfpred", SI);
   SI->setOperand(0, Select);
@@ -1450,16 +1484,26 @@ void CMSimdCFLower::predicateScatterGather(CallInst *CI, unsigned SimdWidth,
 {
   Value *OldPred = CI->getArgOperand(PredOperandNum);
   assert(OldPred->getType()->getScalarType()->isIntegerTy(1));
-  if (SimdWidth != VCINTR::VectorType::getNumElements(
-                       cast<VectorType>(OldPred->getType()))) {
-    DiagnosticInfoSimdCF::emit(CI, "mismatching SIMD width of scatter/gather inside SIMD control flow");
-    return;
+  switch (GenXIntrinsic::getGenXIntrinsicID(CI)) {
+  case GenXIntrinsic::genx_gather4_masked_scaled2:
+    break;
+  default: {
+    if (SimdWidth != VCINTR::VectorType::getNumElements(
+                         cast<VectorType>(OldPred->getType()))) {
+      DiagnosticInfoSimdCF::emit(
+          CI,
+          "mismatching SIMD width of scatter/gather inside SIMD control flow");
+      return;
+    }
+    break;
+  }
   }
   Instruction *NewPred = loadExecutionMask(CI, SimdWidth);
   if (auto C = dyn_cast<Constant>(OldPred))
     if (C->isAllOnesValue())
       OldPred = nullptr;
   if (OldPred) {
+    OriginalPred[CI] = OldPred;
     auto And = BinaryOperator::Create(Instruction::And, OldPred, NewPred,
         OldPred->getName() + ".and." + NewPred->getName(), CI);
     And->setDebugLoc(CI->getDebugLoc());
@@ -1496,6 +1540,7 @@ CallInst *CMSimdCFLower::predicateWrRegion(CallInst *WrR, unsigned SimdWidth)
   if (!Pred)
     Pred = EM;
   else {
+    OriginalPred[WrR] = Pred;
     auto And = BinaryOperator::Create(Instruction::And, EM, Pred,
         Pred->getName() + ".and." + EM->getName(), WrR);
     And->setDebugLoc(WrR->getDebugLoc());
@@ -1783,39 +1828,46 @@ CallInst *CMSimdCFLower::isSimdCFAny(Value *V)
   return nullptr;
 }
 
+/***********************************************************************
+ * replicateMask : repeat the first SimdWidth lanes of EM once per channel
+ * with a shufflevector placed before InsertBefore (1 channel: EM returned as is)
+ */
+Value *CMSimdCFLower::replicateMask(Value *EM, Instruction *InsertBefore,
+                                    unsigned SimdWidth, unsigned NumChannels) {
+  if (NumChannels == 1)
+    return EM;
+  // Each channel gets indices 0..SimdWidth-1 from ShuffleMask (sized ctor).
+  SmallVector<Constant *, 128> ChannelMask(SimdWidth * NumChannels);
+  for (unsigned i = 0; i < NumChannels; ++i)
+    std::copy(ShuffleMask.begin(), ShuffleMask.begin() + SimdWidth,
+              ChannelMask.begin() + SimdWidth * i);
+  auto *Replicated = new ShuffleVectorInst(
+      EM, UndefValue::get(EM->getType()), ConstantVector::get(ChannelMask),
+      Twine("ChannelEM") + Twine(SimdWidth), InsertBefore);
+  Replicated->setDebugLoc(InsertBefore->getDebugLoc()); // like other EM ops
+  return Replicated;
+}
+
 /***********************************************************************
- * loadExecutionMask : create instruction to load EM
+ * loadExecutionMask : load EM before InsertBefore; unless SimdWidth is
+ * MAX_SIMD_CF_WIDTH, return a shufflevector of its first SimdWidth lanes
  */
 Instruction *CMSimdCFLower::loadExecutionMask(Instruction *InsertBefore,
-    unsigned SimdWidth, unsigned NumChannels)
-{
+                                              unsigned SimdWidth) {
   Instruction *EM =
       new LoadInst(EMVar->getType()->getPointerElementType(), EMVar,
                    EMVar->getName(), false /* isVolatile */, InsertBefore);
   EM->setDebugLoc(InsertBefore->getDebugLoc());
   // If the simd width is not MAX_SIMD_CF_WIDTH, extract the part of EM we want.
-  if (NumChannels == 1 && SimdWidth == MAX_SIMD_CF_WIDTH)
+  if (SimdWidth == MAX_SIMD_CF_WIDTH)
     return EM;
-  if (ShuffleMask.empty()) {
-    auto I32Ty = Type::getInt32Ty(F->getContext());
-    for (unsigned i = 0; i != 32; ++i)
-      ShuffleMask.push_back(ConstantInt::get(I32Ty, i));
-  }
-  if (NumChannels == 1) {
-    ArrayRef<Constant *> Mask = ShuffleMask;
-    EM = new ShuffleVectorInst(EM, UndefValue::get(EM->getType()),
-                               ConstantVector::get(Mask.take_front(SimdWidth)),
-                               Twine("EM") + Twine(SimdWidth), InsertBefore);
-  } else {
-    SmallVector<Constant *, 128> ChannelMask{SimdWidth * NumChannels};
-    for (unsigned i = 0; i < NumChannels; ++i)
-      std::copy(ShuffleMask.begin(), ShuffleMask.begin() + SimdWidth,
-                ChannelMask.begin() + SimdWidth * i);
-    EM = new ShuffleVectorInst(
-        EM, UndefValue::get(EM->getType()), ConstantVector::get(ChannelMask),
-        Twine("ChannelEM") + Twine(SimdWidth), InsertBefore);
-  }
+  // ShuffleMask is expected to be filled (indices 0..31) by processFunction.
+  ArrayRef<Constant *> Mask = ShuffleMask;
+  EM = new ShuffleVectorInst(EM, UndefValue::get(EM->getType()),
+                             ConstantVector::get(Mask.take_front(SimdWidth)),
+                             Twine("EM") + Twine(SimdWidth), InsertBefore);
+  // EM now names the shuffle, which needs its own debug location.
   EM->setDebugLoc(InsertBefore->getDebugLoc());
   return EM;
 }
 
diff --git a/GenXIntrinsics/test/replicate_mask_masked_gather4.ll b/GenXIntrinsics/test/replicate_mask_masked_gather4.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -cmsimdcflowering < %s | FileCheck %s
+
+@Rcp_T2 = internal global <64 x i32> undef
+
+; CHECK: @EM = internal global <32 x i1> 
+
+define dso_local dllexport void @test(<32 x i16> %mask, <32 x i32> %addrs) {
+entry:
+  %Rcp_T = alloca <64 x i32>, align 512
+  %0 = icmp ne <32 x i16> %mask, zeroinitializer
+  %call = call i1 @llvm.genx.simdcf.any.v32i1(<32 x i1> %0)
+  br i1 %call, label %if.then, label %if.end
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: [[EM_LOAD1:%.*]] = load <32 x i1>, <32 x i1>* @EM
+; CHECK-NEXT: [[CALL1:%.*]] = call <64 x i32> @llvm.genx.gather4.masked.scaled2.v64i32.v32i32.v32i1(i32 12, i16 0, i32 254, i32 0, <32 x i32> %addrs, <32 x i1> [[EM_LOAD1]])
+  %call1 = call <64 x i32> @llvm.genx.gather4.masked.scaled2.v64i32.v32i32.v32i1(i32 12, i16 0, i32 254, i32 0, <32 x i32> %addrs, <32 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>)
+; The all-ones predicate above is expected to become the loaded EM; the store below is expected to select under EM repeated to 64 lanes (indices 0..31 twice).
+; CHECK: [[EM_LOAD2:%.*]] = load <32 x i1>, <32 x i1>* @EM
+; CHECK-NEXT: [[CHENNELEM:%.*]] = shufflevector <32 x i1> [[EM_LOAD2]], <32 x i1> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:  [[CALL1_SIMDCFPREDL:%.*]] = select <64 x i1> [[CHENNELEM]], <64 x i32> [[CALL1]]
+  store <64 x i32> %call1, <64 x i32>* %Rcp_T
+  br label %if.end
+
+if.end:
+  %1 = load <64 x i32>, <64 x i32>* %Rcp_T
+  store <64 x i32> %1, <64 x i32>* @Rcp_T2
+  ret void
+}
+
+declare <64 x i32> @llvm.genx.gather4.masked.scaled2.v64i32.v32i32.v32i1(i32, i16, i32, i32, <32 x i32>, <32 x i1>)
+declare i1 @llvm.genx.simdcf.any.v32i1(<32 x i1>)
diff --git a/GenXIntrinsics/test/update_mask_masked_gather4.ll b/GenXIntrinsics/test/update_mask_masked_gather4.ll
@@ -0,0 +1,51 @@
+; RUN: opt -S -cmsimdcflowering < %s | FileCheck %s
+
+@Rcp_T2 = internal global <64 x i32> undef
+
+; CHECK: @EM = internal global <32 x i1> 
+
+define dso_local dllexport void @test(<32 x i16> %cond1, <32 x i16> %cond2, <32 x i32> %addrs, <32 x i1> %pred) {
+entry:
+  %Rcp_T = alloca <64 x i32>, align 512
+  %0 = icmp ne <32 x i16> %cond1, zeroinitializer
+  %call = call i1 @llvm.genx.simdcf.any.v32i1(<32 x i1> %0)
+  br i1 %call, label %if.then, label %if.end
+if.then:
+; CHECK-LABEL: if.then:
+; CHECK: [[EM_LOAD1:%.*]] = load <32 x i1>, <32 x i1>* @EM
+; CHECK-NEXT: [[EM_UPDATE1:%.*]] = and <32 x i1> %pred, [[EM_LOAD1]]
+; CHECK-NEXT: [[CALL1:%.*]] = call <64 x i32> @llvm.genx.gather4.masked.scaled2.v64i32.v32i32.v32i1(i32 12, i16 0, i32 254, i32 0, <32 x i32> %addrs, <32 x i1> [[EM_UPDATE1]])
+  %call1 = call <64 x i32> @llvm.genx.gather4.masked.scaled2.v64i32.v32i32.v32i1(i32 12, i16 0, i32 254, i32 0, <32 x i32> %addrs, <32 x i1> %pred)
+; The store is expected to reuse the original %pred (not the already-ANDed call operand) together with a fresh EM load.
+; CHECK: [[EM_LOAD2:%.*]] = load <32 x i1>, <32 x i1>* @EM
+; CHECK-NEXT: [[EM_UPDATE2:%.*]] = and <32 x i1> %pred, [[EM_LOAD2]]
+; CHECK-NEXT: [[CHENNELEM:%.*]] = shufflevector <32 x i1> [[EM_UPDATE2]], <32 x i1> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT:  [[CALL1_SIMDCFPRED1:%.*]] = select <64 x i1> [[CHENNELEM]], <64 x i32> [[CALL1]]
+
+  store <64 x i32> %call1, <64 x i32>* %Rcp_T
+
+  %1 = icmp ne <32 x i16> %cond2, zeroinitializer
+  %nest = call i1 @llvm.genx.simdcf.any.v32i1(<32 x i1> %1)
+  br i1 %nest, label %if.then2, label %if.end2
+; Nested region: the same gather result is stored again and is expected to be predicated with %pred ANDed with the EM loaded there.
+if.then2:
+; CHECK-LABEL: if.then2:
+; CHECK: [[EM_LOAD3:%.*]] = load <32 x i1>, <32 x i1>* @EM
+; CHECK-NEXT: [[EM_UPDATE3:%.*]] = and <32 x i1> %pred, [[EM_LOAD3]]
+; CHECK-NEXT: [[CHENNELEM2:%.*]] = shufflevector <32 x i1> [[EM_UPDATE3]], <32 x i1> undef, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[CALL1_SIMDCFPRED2:%.*]] = select <64 x i1> [[CHENNELEM2]], <64 x i32> [[CALL1]]
+  store <64 x i32> %call1, <64 x i32>* %Rcp_T
+  br label %if.end2
+
+if.end2:
+  br label %if.end
+if.end:
+  %2 = load <64 x i32>, <64 x i32>* %Rcp_T
+  store <64 x i32> %2, <64 x i32>* @Rcp_T2
+  ret void
+}
+
+declare <64 x i32> @llvm.genx.gather4.masked.scaled2.v64i32.v32i32.v32i1(i32, i16, i32, i32, <32 x i32>, <32 x i1>)
+declare <64 x i32> @llvm.genx.wrregioni.v64i32.v16i32.i16.i1(<64 x i32> %load, <64 x i32> %call, i32, i32, i32, i16, i32, i1)
+declare i1 @llvm.genx.simdcf.any.v32i1(<32 x i1>)
+