[PowerPC] Add intrinsics for rldimi/rlwimi/rlwnm #82968

Merged: 5 commits, merged into llvm:main on Mar 4, 2024

Conversation

ecnelises
Member

These builtins already exist in Clang; however, current codegen may produce suboptimal results due to their complex behavior. Implement them as intrinsics to ensure the expected instructions are emitted.

llvmbot added labels on Feb 26, 2024: clang (Clang issues not falling into any other category), backend:PowerPC, clang:codegen (IR generation bugs: mangling, exceptions, etc.), llvm:ir
@llvmbot
Member

llvmbot commented Feb 26, 2024

@llvm/pr-subscribers-clang
@llvm/pr-subscribers-clang-codegen
@llvm/pr-subscribers-llvm-ir

@llvm/pr-subscribers-backend-powerpc

Author: Qiu Chaofan (ecnelises)

Changes

These builtins already exist in Clang; however, current codegen may produce suboptimal results due to their complex behavior. Implement them as intrinsics to ensure the expected instructions are emitted.


Full diff: https://github.com/llvm/llvm-project/pull/82968.diff

7 Files Affected:

  • (modified) clang/lib/CodeGen/CGBuiltin.cpp (+8-21)
  • (modified) clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c (+8-16)
  • (modified) llvm/include/llvm/IR/IntrinsicsPowerPC.td (+12)
  • (modified) llvm/lib/Target/PowerPC/PPCISelLowering.cpp (+52)
  • (modified) llvm/test/CodeGen/PowerPC/rldimi.ll (+15)
  • (modified) llvm/test/CodeGen/PowerPC/rlwimi.ll (+85-38)
  • (modified) llvm/test/CodeGen/PowerPC/rlwinm.ll (+79-29)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 734eb5a035ca49..5d55be6e9e99df 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -17080,37 +17080,24 @@ Value *CodeGenFunction::EmitPPCBuiltinExpr(unsigned BuiltinID,
     }
     return Builder.CreateCall(CGM.getIntrinsic(ID), Ops, "");
   }
-  // Rotate and insert under mask operation.
-  // __rldimi(rs, is, shift, mask)
-  // (rotl64(rs, shift) & mask) | (is & ~mask)
-  // __rlwimi(rs, is, shift, mask)
-  // (rotl(rs, shift) & mask) | (is & ~mask)
   case PPC::BI__builtin_ppc_rldimi:
   case PPC::BI__builtin_ppc_rlwimi: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     Value *Op1 = EmitScalarExpr(E->getArg(1));
     Value *Op2 = EmitScalarExpr(E->getArg(2));
     Value *Op3 = EmitScalarExpr(E->getArg(3));
-    llvm::Type *Ty = Op0->getType();
-    Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
-    if (BuiltinID == PPC::BI__builtin_ppc_rldimi)
-      Op2 = Builder.CreateZExt(Op2, Int64Ty);
-    Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op2});
-    Value *X = Builder.CreateAnd(Shift, Op3);
-    Value *Y = Builder.CreateAnd(Op1, Builder.CreateNot(Op3));
-    return Builder.CreateOr(X, Y);
-  }
-  // Rotate and insert under mask operation.
-  // __rlwnm(rs, shift, mask)
-  // rotl(rs, shift) & mask
+    return Builder.CreateCall(
+        CGM.getIntrinsic(BuiltinID == PPC::BI__builtin_ppc_rldimi
+                             ? Intrinsic::ppc_rldimi
+                             : Intrinsic::ppc_rlwimi),
+        {Op0, Op1, Op2, Op3});
+  }
   case PPC::BI__builtin_ppc_rlwnm: {
     Value *Op0 = EmitScalarExpr(E->getArg(0));
     Value *Op1 = EmitScalarExpr(E->getArg(1));
     Value *Op2 = EmitScalarExpr(E->getArg(2));
-    llvm::Type *Ty = Op0->getType();
-    Function *F = CGM.getIntrinsic(Intrinsic::fshl, Ty);
-    Value *Shift = Builder.CreateCall(F, {Op0, Op0, Op1});
-    return Builder.CreateAnd(Shift, Op2);
+    return Builder.CreateCall(CGM.getIntrinsic(Intrinsic::ppc_rlwnm),
+                              {Op0, Op1, Op2});
   }
   case PPC::BI__builtin_ppc_poppar4:
   case PPC::BI__builtin_ppc_poppar8: {
diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
index d96bfb4621421e..b218547c00d931 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-xlcompat-rotate.c
@@ -16,11 +16,8 @@ void test_builtin_ppc_rldimi() {
   // CHECK:       %res = alloca i64, align 8
   // CHECK-NEXT:  [[RA:%[0-9]+]] = load i64, ptr @ull, align 8
   // CHECK-NEXT:  [[RB:%[0-9]+]] = load i64, ptr @ull, align 8
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i64 @llvm.fshl.i64(i64 [[RA]], i64 [[RA]], i64 63)
-  // CHECK-NEXT:  [[RD:%[0-9]+]] = and i64 [[RC]], 72057593769492480
-  // CHECK-NEXT:  [[RE:%[0-9]+]] = and i64 [[RB]], -72057593769492481
-  // CHECK-NEXT:  [[RF:%[0-9]+]] = or i64 [[RD]], [[RE]]
-  // CHECK-NEXT:  store i64 [[RF]], ptr %res, align 8
+  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i64 @llvm.ppc.rldimi(i64 [[RA]], i64 [[RB]], i32 63, i64 72057593769492480)
+  // CHECK-NEXT:  store i64 [[RC]], ptr %res, align 8
   // CHECK-NEXT:  ret void
 
   /*shift = 63, mask = 0x00FFFFFFF0000000 = 72057593769492480, ~mask = 0xFF0000000FFFFFFF = -72057593769492481*/
@@ -32,11 +29,8 @@ void test_builtin_ppc_rlwimi() {
   // CHECK:       %res = alloca i32, align 4
   // CHECK-NEXT:  [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
   // CHECK-NEXT:  [[RB:%[0-9]+]] = load i32, ptr @ui, align 4
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 31)
-  // CHECK-NEXT:  [[RD:%[0-9]+]] = and i32 [[RC]], 16776960
-  // CHECK-NEXT:  [[RE:%[0-9]+]] = and i32 [[RB]], -16776961
-  // CHECK-NEXT:  [[RF:%[0-9]+]] = or i32 [[RD]], [[RE]]
-  // CHECK-NEXT:  store i32 [[RF]], ptr %res, align 4
+  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i32 @llvm.ppc.rlwimi(i32 [[RA]], i32 [[RB]], i32 31, i32 16776960)
+  // CHECK-NEXT:  store i32 [[RC]], ptr %res, align 4
   // CHECK-NEXT:  ret void
 
   /*shift = 31, mask = 0xFFFF00 = 16776960, ~mask = 0xFFFFFFFFFF0000FF = -16776961*/
@@ -47,9 +41,8 @@ void test_builtin_ppc_rlwnm() {
   // CHECK-LABEL: test_builtin_ppc_rlwnm
   // CHECK:       %res = alloca i32, align 4
   // CHECK-NEXT:  [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
-  // CHECK-NEXT:  [[RB:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 31)
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = and i32 [[RB]], 511
-  // CHECK-NEXT:  store i32 [[RC]], ptr %res, align 4
+  // CHECK-NEXT:  [[RB:%[0-9]+]] = call i32 @llvm.ppc.rlwnm(i32 [[RA]], i32 31, i32 511)
+  // CHECK-NEXT:  store i32 [[RB]], ptr %res, align 4
   // CHECK-NEXT:  ret void
 
   /*shift = 31, mask = 0x1FF = 511*/
@@ -63,9 +56,8 @@ void test_builtin_ppc_rlwnm2(unsigned int shift) {
   // CHECK-NEXT:  store i32 %shift, ptr %shift.addr, align 4
   // CHECK-NEXT:  [[RA:%[0-9]+]] = load i32, ptr @ui, align 4
   // CHECK-NEXT:  [[RB:%[0-9]+]] = load i32, ptr %shift.addr, align 4
-  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i32 @llvm.fshl.i32(i32 [[RA]], i32 [[RA]], i32 [[RB]])
-  // CHECK-NEXT:  [[RD:%[0-9]+]] = and i32 [[RC]], 511
-  // CHECK-NEXT:  store i32 [[RD]], ptr %res, align 4
+  // CHECK-NEXT:  [[RC:%[0-9]+]] = call i32 @llvm.ppc.rlwnm(i32 [[RA]], i32 [[RB]], i32 511)
+  // CHECK-NEXT:  store i32 [[RC]], ptr %res, align 4
   // CHECK-NEXT:  ret void
 
   /*mask = 0x1FF = 511*/
diff --git a/llvm/include/llvm/IR/IntrinsicsPowerPC.td b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
index bfc2b17043bc79..ee9a04241ac2ec 100644
--- a/llvm/include/llvm/IR/IntrinsicsPowerPC.td
+++ b/llvm/include/llvm/IR/IntrinsicsPowerPC.td
@@ -182,6 +182,18 @@ let TargetPrefix = "ppc" in {  // All intrinsics start with "llvm.ppc.".
   def int_ppc_fctuwz
       : ClangBuiltin<"__builtin_ppc_fctuwz">,
         DefaultAttrsIntrinsic<[llvm_double_ty], [llvm_double_ty], [IntrNoMem]>;
+  def int_ppc_rldimi
+      : ClangBuiltin<"__builtin_ppc_rldimi">,
+        DefaultAttrsIntrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty, llvm_i32_ty, llvm_i64_ty],
+                              [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+  def int_ppc_rlwimi
+      : ClangBuiltin<"__builtin_ppc_rlwimi">,
+        DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+                              [IntrNoMem, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+  def int_ppc_rlwnm
+      : ClangBuiltin<"__builtin_ppc_rlwnm">,
+        DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
+                              [IntrNoMem, ImmArg<ArgIndex<2>>]>;
 
   // XL compatible select functions
   // TODO: Add llvm_f128_ty support.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 51becf1d5b8584..f84addbf728ad0 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -641,6 +641,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
 
   // We want to custom lower some of our intrinsics.
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+  // setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f64, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::ppcf128, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
@@ -10722,6 +10723,20 @@ static bool getVectorCompareInfo(SDValue Intrin, int &CompareOpc,
   return true;
 }
 
+bool isContiguousMask(const APInt &Val, unsigned &MB, unsigned &ME,
+                      unsigned BitWidth) {
+  unsigned MaskLen = 0;
+  if (Val.isShiftedMask(MB, MaskLen)) {
+    MB = (BitWidth - MB - MaskLen) % BitWidth;
+  } else if ((~Val).isShiftedMask(MB, MaskLen)) {
+    MB = (BitWidth - MB) % BitWidth;
+  } else {
+    return false;
+  }
+  ME = (MB + MaskLen - 1) % BitWidth;
+  return true;
+}
+
 /// LowerINTRINSIC_WO_CHAIN - If this is an intrinsic that we want to custom
 /// lower, do it, otherwise return null.
 SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
@@ -10737,6 +10752,43 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return DAG.getRegister(PPC::X13, MVT::i64);
     return DAG.getRegister(PPC::R2, MVT::i32);
 
+  case Intrinsic::ppc_rldimi: {
+    uint64_t SH = Op.getConstantOperandVal(3);
+    unsigned MB = 0, ME = 0;
+    if (!isContiguousMask(Op.getConstantOperandAPInt(4), MB, ME, 64) ||
+        ME != 63 - SH)
+      llvm_unreachable("invalid rldimi mask!");
+    return SDValue(DAG.getMachineNode(
+                       PPC::RLDIMI, dl, MVT::i64,
+                       {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                        DAG.getTargetConstant(MB, dl, MVT::i32)}),
+                   0);
+  }
+
+  case Intrinsic::ppc_rlwimi: {
+    unsigned MB = 0, ME = 0;
+    if (!isContiguousMask(Op.getConstantOperandAPInt(4), MB, ME, 32))
+      llvm_unreachable("invalid rlwimi mask!");
+    return SDValue(DAG.getMachineNode(
+                       PPC::RLWIMI, dl, MVT::i32,
+                       {Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
+                        DAG.getTargetConstant(MB, dl, MVT::i32),
+                        DAG.getTargetConstant(ME, dl, MVT::i32)}),
+                   0);
+  }
+
+  case Intrinsic::ppc_rlwnm: {
+    unsigned MB = 0, ME = 0;
+    if (!isContiguousMask(Op.getConstantOperandAPInt(3), MB, ME, 32))
+      llvm_unreachable("invalid rlwnm mask!");
+    return SDValue(
+        DAG.getMachineNode(PPC::RLWNM, dl, MVT::i32,
+                           {Op.getOperand(1), Op.getOperand(2),
+                            DAG.getTargetConstant(MB, dl, MVT::i32),
+                            DAG.getTargetConstant(ME, dl, MVT::i32)}),
+        0);
+  }
+
   case Intrinsic::ppc_mma_disassemble_acc: {
     if (Subtarget.isISAFuture()) {
       EVT ReturnTypes[] = {MVT::v256i1, MVT::v256i1};
diff --git a/llvm/test/CodeGen/PowerPC/rldimi.ll b/llvm/test/CodeGen/PowerPC/rldimi.ll
index 4e26ddfc37f99e..322975f547c996 100644
--- a/llvm/test/CodeGen/PowerPC/rldimi.ll
+++ b/llvm/test/CodeGen/PowerPC/rldimi.ll
@@ -58,3 +58,18 @@ entry:
   %8 = or i64 %6, %7
   ret i64 %8
 }
+
+define i64 @rldimi_intrinsic(i64 %a) {
+; CHECK-LABEL: rldimi_intrinsic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rldimi 3, 3, 8, 0
+; CHECK-NEXT:    rldimi 3, 3, 16, 0
+; CHECK-NEXT:    rldimi 3, 3, 32, 0
+; CHECK-NEXT:    blr
+  %r1 = call i64 @llvm.ppc.rldimi(i64 %a, i64 %a, i32 8, i64 -256)
+  %r2 = call i64 @llvm.ppc.rldimi(i64 %r1, i64 %r1, i32 16, i64 -65536)
+  %r3 = call i64 @llvm.ppc.rldimi(i64 %r2, i64 %r2, i32 32, i64 -4294967296)
+  ret i64 %r3
+}
+
+declare i64 @llvm.ppc.rldimi(i64, i64, i32 immarg, i64 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/rlwimi.ll b/llvm/test/CodeGen/PowerPC/rlwimi.ll
index e701236b840b2c..8b126cd3393c10 100644
--- a/llvm/test/CodeGen/PowerPC/rlwimi.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwimi.ll
@@ -1,70 +1,117 @@
-; All of these ands and shifts should be folded into rlwimi's
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | not grep and
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- | grep rlwimi | count 8
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
 
 define i32 @test1(i32 %x, i32 %y) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 4, 3, 16, 0, 15
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = shl i32 %x, 16		; <i32> [#uses=1]
-	%tmp.7 = and i32 %y, 65535		; <i32> [#uses=1]
-	%tmp.9 = or i32 %tmp.7, %tmp.3		; <i32> [#uses=1]
-	ret i32 %tmp.9
+  %tmp.3 = shl i32 %x, 16
+  %tmp.7 = and i32 %y, 65535
+  %tmp.9 = or i32 %tmp.7, %tmp.3
+  ret i32 %tmp.9
 }
 
 define i32 @test2(i32 %x, i32 %y) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 16, 0, 15
+; CHECK-NEXT:    blr
 entry:
-	%tmp.7 = and i32 %x, 65535		; <i32> [#uses=1]
-	%tmp.3 = shl i32 %y, 16		; <i32> [#uses=1]
-	%tmp.9 = or i32 %tmp.7, %tmp.3		; <i32> [#uses=1]
-	ret i32 %tmp.9
+  %tmp.7 = and i32 %x, 65535
+  %tmp.3 = shl i32 %y, 16
+  %tmp.9 = or i32 %tmp.7, %tmp.3
+  ret i32 %tmp.9
 }
 
 define i32 @test3(i32 %x, i32 %y) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 4, 3, 16, 16, 31
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = lshr i32 %x, 16		; <i32> [#uses=1]
-	%tmp.6 = and i32 %y, -65536		; <i32> [#uses=1]
-	%tmp.7 = or i32 %tmp.6, %tmp.3		; <i32> [#uses=1]
-	ret i32 %tmp.7
+  %tmp.3 = lshr i32 %x, 16
+  %tmp.6 = and i32 %y, -65536
+  %tmp.7 = or i32 %tmp.6, %tmp.3
+  ret i32 %tmp.7
 }
 
 define i32 @test4(i32 %x, i32 %y) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 16, 16, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.6 = and i32 %x, -65536		; <i32> [#uses=1]
-	%tmp.3 = lshr i32 %y, 16		; <i32> [#uses=1]
-	%tmp.7 = or i32 %tmp.6, %tmp.3		; <i32> [#uses=1]
-	ret i32 %tmp.7
+  %tmp.6 = and i32 %x, -65536
+  %tmp.3 = lshr i32 %y, 16
+  %tmp.7 = or i32 %tmp.6, %tmp.3
+  ret i32 %tmp.7
 }
 
 define i32 @test5(i32 %x, i32 %y) {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 4, 3, 1, 0, 15
+; CHECK-NEXT:    mr 3, 4
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = shl i32 %x, 1		; <i32> [#uses=1]
-	%tmp.4 = and i32 %tmp.3, -65536		; <i32> [#uses=1]
-	%tmp.7 = and i32 %y, 65535		; <i32> [#uses=1]
-	%tmp.9 = or i32 %tmp.4, %tmp.7		; <i32> [#uses=1]
-	ret i32 %tmp.9
+  %tmp.3 = shl i32 %x, 1
+  %tmp.4 = and i32 %tmp.3, -65536
+  %tmp.7 = and i32 %y, 65535
+  %tmp.9 = or i32 %tmp.4, %tmp.7
+  ret i32 %tmp.9
 }
 
 define i32 @test6(i32 %x, i32 %y) {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 1, 0, 15
+; CHECK-NEXT:    blr
 entry:
-	%tmp.7 = and i32 %x, 65535		; <i32> [#uses=1]
-	%tmp.3 = shl i32 %y, 1		; <i32> [#uses=1]
-	%tmp.4 = and i32 %tmp.3, -65536		; <i32> [#uses=1]
-	%tmp.9 = or i32 %tmp.4, %tmp.7		; <i32> [#uses=1]
-	ret i32 %tmp.9
+  %tmp.7 = and i32 %x, 65535
+  %tmp.3 = shl i32 %y, 1
+  %tmp.4 = and i32 %tmp.3, -65536
+  %tmp.9 = or i32 %tmp.4, %tmp.7
+  ret i32 %tmp.9
 }
 
 define i32 @test7(i32 %x, i32 %y) {
+; CHECK-LABEL: test7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    andis. 3, 3, 65535
+; CHECK-NEXT:    rldimi 3, 4, 0, 48
+; CHECK-NEXT:    blr
 entry:
-	%tmp.2 = and i32 %x, -65536		; <i32> [#uses=1]
-	%tmp.5 = and i32 %y, 65535		; <i32> [#uses=1]
-	%tmp.7 = or i32 %tmp.5, %tmp.2		; <i32> [#uses=1]
-	ret i32 %tmp.7
+  %tmp.2 = and i32 %x, -65536
+  %tmp.5 = and i32 %y, 65535
+  %tmp.7 = or i32 %tmp.5, %tmp.2
+  ret i32 %tmp.7
 }
 
 define i32 @test8(i32 %bar) {
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 3, 1, 30, 30
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = shl i32 %bar, 1		; <i32> [#uses=1]
-	%tmp.4 = and i32 %tmp.3, 2		; <i32> [#uses=1]
-	%tmp.6 = and i32 %bar, -3		; <i32> [#uses=1]
-	%tmp.7 = or i32 %tmp.4, %tmp.6		; <i32> [#uses=1]
-	ret i32 %tmp.7
+  %tmp.3 = shl i32 %bar, 1
+  %tmp.4 = and i32 %tmp.3, 2
+  %tmp.6 = and i32 %bar, -3
+  %tmp.7 = or i32 %tmp.4, %tmp.6
+  ret i32 %tmp.7
 }
+
+define i32 @test9(i32 %a, i32 %b) {
+; CHECK-LABEL: test9:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwimi 3, 4, 8, 20, 26
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwimi(i32 %a, i32 %b, i32 8, i32 4064)
+  ret i32 %r
+}
+
+declare i32 @llvm.ppc.rlwimi(i32, i32, i32 immarg, i32 immarg)
diff --git a/llvm/test/CodeGen/PowerPC/rlwinm.ll b/llvm/test/CodeGen/PowerPC/rlwinm.ll
index 2f3b3bf003cf65..73e4b5f6b7ff60 100644
--- a/llvm/test/CodeGen/PowerPC/rlwinm.ll
+++ b/llvm/test/CodeGen/PowerPC/rlwinm.ll
@@ -1,61 +1,111 @@
-; All of these ands and shifts should be folded into rlwimi's
-; RUN: llc -verify-machineinstrs < %s -mtriple=ppc32-- -o %t
-; RUN: not grep and %t
-; RUN: not grep srawi %t
-; RUN: not grep srwi %t
-; RUN: not grep slwi %t
-; RUN: grep rlwinm %t | count 8
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
 
 define i32 @test1(i32 %a) {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 0, 4, 19
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, 268431360		; <i32> [#uses=1]
-	ret i32 %tmp.1
+  %tmp.1 = and i32 %a, 268431360
+  ret i32 %tmp.1
 }
 
 define i32 @test2(i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rldicl 3, 3, 36, 24
+; CHECK-NEXT:    rldicl 3, 3, 28, 32
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, -268435441		; <i32> [#uses=1]
-	ret i32 %tmp.1
+  %tmp.1 = and i32 %a, -268435441
+  ret i32 %tmp.1
 }
 
 define i32 @test3(i32 %a) {
+; CHECK-LABEL: test3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.2 = ashr i32 %a, 8		; <i32> [#uses=1]
-	%tmp.3 = and i32 %tmp.2, 255		; <i32> [#uses=1]
-	ret i32 %tmp.3
+  %tmp.2 = ashr i32 %a, 8
+  %tmp.3 = and i32 %tmp.2, 255
+  ret i32 %tmp.3
 }
 
 define i32 @test4(i32 %a) {
+; CHECK-LABEL: test4:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.3 = lshr i32 %a, 8		; <i32> [#uses=1]
-	%tmp.4 = and i32 %tmp.3, 255		; <i32> [#uses=1]
-	ret i32 %tmp.4
+  %tmp.3 = lshr i32 %a, 8
+  %tmp.4 = and i32 %tmp.3, 255
+  ret i32 %tmp.4
 }
 
 define i32 @test5(i32 %a) {
+; CHECK-LABEL: test5:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 8, 0, 8
+; CHECK-NEXT:    blr
 entry:
-	%tmp.2 = shl i32 %a, 8		; <i32> [#uses=1]
-	%tmp.3 = and i32 %tmp.2, -8388608		; <i32> [#uses=1]
-	ret i32 %tmp.3
+  %tmp.2 = shl i32 %a, 8
+  %tmp.3 = and i32 %tmp.2, -8388608
+  ret i32 %tmp.3
 }
 
 define i32 @test6(i32 %a) {
+; CHECK-LABEL: test6:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, 65280		; <i32> [#uses=1]
-	%tmp.2 = ashr i32 %tmp.1, 8		; <i32> [#uses=1]
-	ret i32 %tmp.2
+  %tmp.1 = and i32 %a, 65280
+  %tmp.2 = ashr i32 %tmp.1, 8
+  ret i32 %tmp.2
 }
 
 define i32 @test7(i32 %a) {
+; CHECK-LABEL: test7:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 24, 24, 31
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, 65280		; <i32> [#uses=1]
-	%tmp.2 = lshr i32 %tmp.1, 8		; <i32> [#uses=1]
-	ret i32 %tmp.2
+  %tmp.1 = and i32 %a, 65280
+  %tmp.2 = lshr i32 %tmp.1, 8
+  ret i32 %tmp.2
 }
 
 define i32 @test8(i32 %a) {
+; CHECK-LABEL: test8:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 8, 0, 7
+; CHECK-NEXT:    blr
 entry:
-	%tmp.1 = and i32 %a, 16711680		; <i32> [#uses=1]
-	%tmp.2 = shl i32 %tmp.1, 8		; <i32> [#uses=1]
-	ret i32 %tmp.2
+  %tmp.1 = and i32 %a, 16711680
+  %tmp.2 = shl i32 %tmp.1, 8
+  ret i32 %tmp.2
 }
+
+define i32 @test9(i32 %a, i32 %s) {
+; CHECK-LABEL: test9:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwnm 3, 3, 4, 23, 31
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwnm(i32 %a, i32 %s, i32 511)
+  ret i32 %r
+}
+
+define i32 @test10(i32 %a) {
+; CHECK-LABEL: test10:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rlwinm 3, 3, 31, 23, 31
+; CHECK-NEXT:    blr
+entry:
+  %r = call i32 @llvm.ppc.rlwnm(i32 %a, i32 31, i32 511)
+  ret i32 %r
+}
+
+declare i32 @llvm.ppc.rlwnm(i32, i32, i32 immarg)

@efriedma-quic
Collaborator


If you run into issues using normal integer ops, please file bugs. Most people aren't going to hand-tune their code like this; builtins like this are at best an ugly workaround.

That said, I guess I'm not strongly against adding a backdoor to force a particular instruction here.

@chenzheng1030
Collaborator

If you run into issues using normal integer ops, please file bugs. Most people aren't going to hand-tune their code like this; builtins like this are at best an ugly workaround.

Yes, a user should not have to write source code (using compiler builtins) just to emit one "powerful" instruction. Instead, it is the compiler's responsibility to generate these instructions from the user's source code.

I think these builtins were added because the LLVM PowerPC target wants to keep compatibility with the XL C/C++ commercial compiler for these builtins.


github-actions bot commented Feb 27, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

@chenzheng1030
Collaborator

LGTM except for two comments on the test case changes. One is a nit, and the other is a separate issue unrelated to this patch.

Thanks for implementing this.

; CHECK-LABEL: test2:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: rldicl 3, 3, 36, 24
; CHECK-NEXT: rldicl 3, 3, 28, 32
Collaborator

After the triple change, two rldicl are now emitted instead of a single rlwinm. Would you please add a FIXME here, or create an issue on GitHub? Thanks.

Member Author

rlwinm is a 32-bit instruction; in 64-bit mode, both the input argument and the return value are sign-extended. The rldicl sequence here does more work, including the sign extension (which is why no extsw appears).

Collaborator

Hmm, OK. rlwinm does not match the 64-bit semantics here. Would you please just remove this case? It would be strange to keep it in the rlwinm test file now.

Please don't treat rlwinm as a purely 32-bit instruction. It also alters, with well-defined semantics, the high 32 bits of a GPR, especially when MB/ME wrap.

Member Author

Yes, I just meant that rlwinm does not sign-extend the high 32 bits.

@chenzheng1030
Collaborator

The failure in Buildkite should be unrelated, but it would be better to double-check.

ecnelises merged commit 906580b into llvm:main on Mar 4, 2024