llvm
diff --git a/‎clang/lib/CodeGen/CGBuiltin.cpp
Lines changed: 4 additions & 0 deletions b/‎clang/lib/CodeGen/CGBuiltin.cpp
Lines changed: 4 additions & 0 deletions
diff --git a/‎clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Lines changed: 2 additions & 2 deletions b/‎clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Lines changed: 2 additions & 2 deletions
diff --git a/‎llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 6 additions & 9 deletions b/‎llvm/include/llvm/IR/IntrinsicsAMDGPU.td
Lines changed: 6 additions & 9 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
Lines changed: 5 additions & 5 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
Lines changed: 5 additions & 5 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 3 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 3 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Lines changed: 4 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
Lines changed: 4 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Lines changed: 29 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
Lines changed: 29 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Lines changed: 190 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
Lines changed: 190 additions & 0 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Lines changed: 3 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
Lines changed: 3 additions & 0 deletions
@@ -18479,6 +18479,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
         CGM.getIntrinsic(Intrinsic::amdgcn_update_dpp, Args[0]->getType());
     return Builder.CreateCall(F, Args);
   }
+  case AMDGPU::BI__builtin_amdgcn_readlane:
+    return emitBinaryBuiltin(*this, E, Intrinsic::amdgcn_readlane);
+  case AMDGPU::BI__builtin_amdgcn_readfirstlane:
+    return emitUnaryBuiltin(*this, E, Intrinsic::amdgcn_readfirstlane);
   case AMDGPU::BI__builtin_amdgcn_div_fixup:
   case AMDGPU::BI__builtin_amdgcn_div_fixupf:
   case AMDGPU::BI__builtin_amdgcn_div_fixuph:
 
@@ -306,14 +306,14 @@ void test_ds_bpermute(global int* out, int a, int b)
 }
 
 // CHECK-LABEL: @test_readfirstlane
-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)
+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)
 void test_readfirstlane(global int* out, int a)
 {
   *out = __builtin_amdgcn_readfirstlane(a);
 }
 
 // CHECK-LABEL: @test_readlane
-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)
+// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)
 void test_readlane(global int* out, int a, int b)
 {
   *out = __builtin_amdgcn_readlane(a, b);
 
@@ -2176,26 +2176,23 @@ def int_amdgcn_wave_reduce_umin : AMDGPUWaveReduce;
 def int_amdgcn_wave_reduce_umax : AMDGPUWaveReduce;
 
 def int_amdgcn_readfirstlane :
-  ClangBuiltin<"__builtin_amdgcn_readfirstlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
 // The lane argument must be uniform across the currently active threads of the
 // current wave. Otherwise, the result is undefined.
 def int_amdgcn_readlane :
-  ClangBuiltin<"__builtin_amdgcn_readlane">,
-  Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+  Intrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty],
             [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]>;
 
 // The value to write and lane select arguments must be uniform across the
 // currently active threads of the current wave. Otherwise, the result is
 // undefined.
 def int_amdgcn_writelane :
-  ClangBuiltin<"__builtin_amdgcn_writelane">,
-  Intrinsic<[llvm_i32_ty], [
-    llvm_i32_ty,    // uniform value to write: returned by the selected lane
-    llvm_i32_ty,    // uniform lane select
-    llvm_i32_ty     // returned by all lanes other than the selected one
+  Intrinsic<[llvm_any_ty], [
+    LLVMMatchType<0>,   // uniform value to write: returned by the selected lane
+    llvm_i32_ty,        // uniform lane select
+    LLVMMatchType<0>    // returned by all lanes other than the selected one
   ],
   [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
@@ -433,7 +433,7 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
   // Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
   // combine them with a scalar operation.
   Function *ReadLane =
-      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+      Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
   V = B.CreateBitCast(V, IntNTy);
   Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
   Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
@@ -523,10 +523,10 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
                      {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
                       B.getInt32(0xf), B.getFalse()});
   } else {
-    Function *ReadLane =
-        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
-    Function *WriteLane =
-        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+    Function *ReadLane = Intrinsic::getDeclaration(
+        M, Intrinsic::amdgcn_readlane, B.getInt32Ty());
+    Function *WriteLane = Intrinsic::getDeclaration(
+        M, Intrinsic::amdgcn_writelane, B.getInt32Ty());
 
     // On GFX10 all DPP operations are confined to a single row. To get cross-
     // row operations we have to use permlane or readlane.
 
@@ -5496,6 +5496,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(LDS)
   NODE_NAME_CASE(FPTRUNC_ROUND_UPWARD)
   NODE_NAME_CASE(FPTRUNC_ROUND_DOWNWARD)
+  NODE_NAME_CASE(READLANE)
+  NODE_NAME_CASE(READFIRSTLANE)
+  NODE_NAME_CASE(WRITELANE)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
   NODE_NAME_CASE(LOAD_D16_HI)
 
@@ -558,6 +558,10 @@ enum NodeType : unsigned {
   FPTRUNC_ROUND_UPWARD,
   FPTRUNC_ROUND_DOWNWARD,
 
+  READLANE,
+  READFIRSTLANE,
+  WRITELANE,
+
   DUMMY_CHAIN,
   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
   LOAD_D16_HI,
 
@@ -342,6 +342,22 @@ def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
 
 def AMDGPUperm_impl : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
 
+def AMDGPUReadfirstlaneOp : SDTypeProfile<1, 1, [
+  SDTCisSameAs<0, 1>
+]>;
+
+def AMDGPUReadlaneOp : SDTypeProfile<1, 2, [
+  SDTCisSameAs<0, 1>, SDTCisInt<2>
+]>;
+
+def AMDGPUDWritelaneOp : SDTypeProfile<1, 3, [
+  SDTCisSameAs<0, 1>, SDTCisInt<2>, SDTCisSameAs<0, 3>
+]>;
+
+def AMDGPUreadlane_impl : SDNode<"AMDGPUISD::READLANE", AMDGPUReadlaneOp>;
+def AMDGPUreadfirstlane_impl : SDNode<"AMDGPUISD::READFIRSTLANE", AMDGPUReadfirstlaneOp>;
+def AMDGPUwritelane_impl : SDNode<"AMDGPUISD::WRITELANE", AMDGPUDWritelaneOp>;
+
 // SI+ export
 def AMDGPUExportOp : SDTypeProfile<0, 8, [
   SDTCisInt<0>,       // i8 tgt
@@ -506,3 +522,16 @@ def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc
 def AMDGPUperm : PatFrags<(ops node:$src0, node:$src1, node:$src2),
   [(int_amdgcn_perm node:$src0, node:$src1, node:$src2),
    (AMDGPUperm_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUreadlane : PatFrags<(ops node:$src0, node:$src1),
+  [(int_amdgcn_readlane node:$src0, node:$src1),
+   (AMDGPUreadlane_impl node:$src0, node:$src1)]>;
+
+def AMDGPUreadfirstlane : PatFrags<(ops node:$src),
+  [(int_amdgcn_readfirstlane node:$src),
+   (AMDGPUreadfirstlane_impl node:$src)]>;
+
+def AMDGPUwritelane : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+  [(int_amdgcn_writelane node:$src0, node:$src1, node:$src2),
+   (AMDGPUwritelane_impl node:$src0, node:$src1, node:$src2)]>;
+
@@ -5387,6 +5387,192 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   return true;
 }
 
+bool AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &Helper,
+                                         MachineInstr &MI,
+                                         Intrinsic::ID IID) const {
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(2).getReg();
+
+  auto createLaneOp = [&](Register Src0, Register Src1,
+                          Register Src2) -> Register {
+    auto LaneOp = B.buildIntrinsic(IID, {S32}).addUse(Src0);
+    switch (IID) {
+    case Intrinsic::amdgcn_readfirstlane:
+      return LaneOp.getReg(0);
+    case Intrinsic::amdgcn_readlane:
+      return LaneOp.addUse(Src1).getReg(0);
+    case Intrinsic::amdgcn_writelane:
+      return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
+    default:
+      llvm_unreachable("unhandled lane op");
+    }
+  };
+
+  Register Src1, Src2;
+  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane) {
+    Src1 = MI.getOperand(3).getReg();
+    if (IID == Intrinsic::amdgcn_writelane) {
+      Src2 = MI.getOperand(4).getReg();
+    }
+  }
+
+  LLT Ty = MRI.getType(DstReg);
+  unsigned Size = Ty.getSizeInBits();
+
+  if (Size == 32) {
+    // Already legal
+    return true;
+  }
+
+  if (Size < 32) {
+    Register Src0Cast = MRI.getType(Src0).isScalar()
+                            ? Src0
+                            : B.buildBitcast(LLT::scalar(Size), Src0).getReg(0);
+    Src0 = B.buildAnyExt(S32, Src0Cast).getReg(0);
+    if (Src2.isValid()) {
+      Register Src2Cast =
+          MRI.getType(Src2).isScalar()
+              ? Src2
+              : B.buildBitcast(LLT::scalar(Size), Src2).getReg(0);
+      Src2 = B.buildAnyExt(LLT::scalar(32), Src2Cast).getReg(0);
+    }
+
+    Register LaneOpDst = createLaneOp(Src0, Src1, Src2);
+    if (Ty.isScalar())
+      B.buildTrunc(DstReg, LaneOpDst);
+    else {
+      auto Trunc = B.buildTrunc(LLT::scalar(Size), LaneOpDst);
+      B.buildBitcast(DstReg, Trunc);
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if ((Size % 32) == 0) {
+    SmallVector<Register, 2> PartialRes;
+    unsigned NumParts = Size / 32;
+    auto IsS16Vec = Ty.isVector() && Ty.getElementType() == S16;
+    MachineInstrBuilder Src0Parts;
+
+    if (Ty.isPointer()) {
+      auto PtrToInt = B.buildPtrToInt(LLT::scalar(Size), Src0);
+      Src0Parts = B.buildUnmerge(S32, PtrToInt);
+    } else if (Ty.isPointerVector()) {
+      LLT IntVecTy = Ty.changeElementType(
+          LLT::scalar(Ty.getElementType().getSizeInBits()));
+      auto PtrToInt = B.buildPtrToInt(IntVecTy, Src0);
+      Src0Parts = B.buildUnmerge(S32, PtrToInt);
+    } else
+      Src0Parts =
+          IsS16Vec ? B.buildUnmerge(V2S16, Src0) : B.buildUnmerge(S32, Src0);
+
+    switch (IID) {
+    case Intrinsic::amdgcn_readlane: {
+      Register Src1 = MI.getOperand(3).getReg();
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+                        : Src0Parts.getReg(i);
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readlane, {S32})
+                 .addUse(Src0)
+                 .addUse(Src1))
+                .getReg(0));
+      }
+      break;
+    }
+    case Intrinsic::amdgcn_readfirstlane: {
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+                        : Src0Parts.getReg(i);
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_readfirstlane, {S32})
+                 .addUse(Src0)
+                 .getReg(0)));
+      }
+
+      break;
+    }
+    case Intrinsic::amdgcn_writelane: {
+      Register Src1 = MI.getOperand(3).getReg();
+      Register Src2 = MI.getOperand(4).getReg();
+      MachineInstrBuilder Src2Parts;
+
+      if (Ty.isPointer()) {
+        auto PtrToInt = B.buildPtrToInt(S64, Src2);
+        Src2Parts = B.buildUnmerge(S32, PtrToInt);
+      } else if (Ty.isPointerVector()) {
+        LLT IntVecTy = Ty.changeElementType(
+            LLT::scalar(Ty.getElementType().getSizeInBits()));
+        auto PtrToInt = B.buildPtrToInt(IntVecTy, Src2);
+        Src2Parts = B.buildUnmerge(S32, PtrToInt);
+      } else
+        Src2Parts =
+            IsS16Vec ? B.buildUnmerge(V2S16, Src2) : B.buildUnmerge(S32, Src2);
+
+      for (unsigned i = 0; i < NumParts; ++i) {
+        Src0 = IsS16Vec ? B.buildBitcast(S32, Src0Parts.getReg(i)).getReg(0)
+                        : Src0Parts.getReg(i);
+        Src2 = IsS16Vec ? B.buildBitcast(S32, Src2Parts.getReg(i)).getReg(0)
+                        : Src2Parts.getReg(i);
+        PartialRes.push_back(
+            (B.buildIntrinsic(Intrinsic::amdgcn_writelane, {S32})
+                 .addUse(Src0)
+                 .addUse(Src1)
+                 .addUse(Src2))
+                .getReg(0));
+      }
+
+      break;
+    }
+    }
+
+    if (Ty.isPointerVector()) {
+      unsigned PtrSize = Ty.getElementType().getSizeInBits();
+      SmallVector<Register, 2> PtrElements;
+      if (PtrSize == 32) {
+        // Handle 32 bit pointers
+        for (unsigned i = 0; i < NumParts; i++)
+          PtrElements.push_back(
+              B.buildIntToPtr(Ty.getElementType(), PartialRes[i]).getReg(0));
+      } else {
+        // Handle legalization of <? x [pointer type bigger than 32 bits]>
+        SmallVector<Register, 2> PtrParts;
+        unsigned NumS32Parts = PtrSize / 32;
+        unsigned PartIdx = 0;
+        for (unsigned i = 0, j = 1; i < NumParts; i += NumS32Parts, j++) {
+          // Merge S32 components of a pointer element first.
+          for (; PartIdx < (j * NumS32Parts); PartIdx++)
+            PtrParts.push_back(PartialRes[PartIdx]);
+
+          auto MergedPtr =
+              B.buildMergeLikeInstr(LLT::scalar(PtrSize), PtrParts);
+          PtrElements.push_back(
+              B.buildIntToPtr(Ty.getElementType(), MergedPtr).getReg(0));
+          PtrParts.clear();
+        }
+      }
+
+      B.buildMergeLikeInstr(DstReg, PtrElements);
+    } else {
+      if (IsS16Vec) {
+        for (unsigned i = 0; i < NumParts; i++)
+          PartialRes[i] = B.buildBitcast(V2S16, PartialRes[i]).getReg(0);
+      }
+      B.buildMergeLikeInstr(DstReg, PartialRes);
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
+  return false;
+}
+
 bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
                                             MachineRegisterInfo &MRI,
                                             MachineIRBuilder &B) const {
@@ -7330,6 +7516,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     Observer.changedInstr(MI);
     return true;
   }
+  case Intrinsic::amdgcn_readlane:
+  case Intrinsic::amdgcn_writelane:
+  case Intrinsic::amdgcn_readfirstlane:
+    return legalizeLaneOp(Helper, MI, IntrID);
   default: {
     if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
             AMDGPU::getImageDimIntrinsicInfo(IntrID))
 
@@ -208,6 +208,9 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
   bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
                             Intrinsic::ID IID) const;
 
+  bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI,
+                      Intrinsic::ID IID) const;
+
   bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
 
   bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
Original file line number	Diff line number	Diff line change
`@@ -306,14 +306,14 @@ void test_ds_bpermute(global int* out, int a, int b)`
`306`	`306`	`}`
`307`	`307`
`308`	`308`	`// CHECK-LABEL: @test_readfirstlane`
`309`		`-// CHECK: call i32 @llvm.amdgcn.readfirstlane(i32 %a)`
	`309`	`+// CHECK: call i32 @llvm.amdgcn.readfirstlane.i32(i32 %a)`
`310`	`310`	`void test_readfirstlane(global int* out, int a)`
`311`	`311`	`{`
`312`	`312`	`*out = __builtin_amdgcn_readfirstlane(a);`
`313`	`313`	`}`
`314`	`314`
`315`	`315`	`// CHECK-LABEL: @test_readlane`
`316`		`-// CHECK: call i32 @llvm.amdgcn.readlane(i32 %a, i32 %b)`
	`316`	`+// CHECK: call i32 @llvm.amdgcn.readlane.i32(i32 %a, i32 %b)`
`317`	`317`	`void test_readlane(global int* out, int a, int b)`
`318`	`318`	`{`
`319`	`319`	`*out = __builtin_amdgcn_readlane(a, b);`