
Commit 5dfaf84

[LLVM][AArch64] Correctly lower funnel shifts by constants. (#140058)
Prevent LowerFunnelShift from creating an invalid ISD::FSHR when lowering "ISD::FSHL X, Y, 0". Such inputs are rare because the operation is a NOP that DAGCombiner will optimise away. However, we should not rely on this, so this PR mirrors the same optimisation.

Ensure LowerFunnelShift normalises constant shift amounts, because isel rules expect them to be in the range [0, src bit length).

NOTE: To simplify testing, this PR also adds a command line option to disable the DAG combiner (-combiner-disabled).
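For context, here is a minimal C++ sketch of the scalar semantics this fix relies on; fshl32/fshr32 are illustrative helpers, not LLVM APIs. Per the LangRef, a funnel-shift amount is taken modulo the bit width, FSHL by 0 is a NOP returning the first operand, and a non-zero FSHL by s equals FSHR by (width - s):

#include <cassert>
#include <cstdint>

// Illustrative scalar semantics of llvm.fshl/llvm.fshr on i32; the shift
// amount is reduced modulo the bit width, as the LangRef specifies.
static uint32_t fshl32(uint32_t a, uint32_t b, uint32_t s) {
  s %= 32;
  return s == 0 ? a : (a << s) | (b >> (32 - s));
}
static uint32_t fshr32(uint32_t a, uint32_t b, uint32_t s) {
  s %= 32;
  return s == 0 ? b : (a << (32 - s)) | (b >> s);
}

int main() {
  // FSHL X, Y, 0 is a NOP returning X; naively rewriting it as
  // "FSHR X, Y, 32 - 0" would request an out-of-range shift of 32.
  assert(fshl32(0xAABBCCDD, 0x11223344, 0) == 0xAABBCCDD);
  // For non-zero s, FSHL by s equals FSHR by (32 - s), which is the
  // rewrite LowerFunnelShift performs.
  assert(fshl32(0xAABBCCDD, 0x11223344, 16) ==
         fshr32(0xAABBCCDD, 0x11223344, 32 - 16));
  // Amounts normalise modulo the bit width: shifting by 33 behaves like 1.
  assert(fshl32(0xAABBCCDD, 0x11223344, 33) ==
         fshl32(0xAABBCCDD, 0x11223344, 1));
  return 0;
}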
1 parent 4060d38 commit 5dfaf84

File tree: 3 files changed, +159 additions, -6 deletions


llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 6 additions & 1 deletion
@@ -149,6 +149,10 @@ static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
         cl::desc("DAG combiner enable load/<replace bytes>/store with "
                  "a narrower store"));
 
+static cl::opt<bool> DisableCombines("combiner-disabled", cl::Hidden,
+                                     cl::init(false),
+                                     cl::desc("Disable the DAG combiner"));
+
 namespace {
 
 class DAGCombiner {
@@ -248,7 +252,8 @@ namespace {
       STI(D.getSubtarget().getSelectionDAGInfo()), OptLevel(OL),
       BatchAA(BatchAA) {
     ForCodeSize = DAG.shouldOptForSize();
-    DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel);
+    DisableGenericCombines =
+        DisableCombines || (STI && STI->disableGenericCombines(OptLevel));
 
     MaximumLegalStoreInBits = 0;
     // We use the minimum store size here, since that's all we can guarantee
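With this change, either the new flag or the existing subtarget hook can switch the generic combines off. Usage mirrors the second RUN line of the new test below:

  llc -combiner-disabled input.ll -o -

Here input.ll stands for any input module; since the option is cl::Hidden it is only listed under --help-hidden.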

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 19 additions & 5 deletions
@@ -7262,20 +7262,34 @@ static SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) {
 // FSHL is converted to FSHR before deciding what to do with it
 static SDValue LowerFunnelShift(SDValue Op, SelectionDAG &DAG) {
   SDValue Shifts = Op.getOperand(2);
-  // Check if the shift amount is a constant
+  // Check if the shift amount is a constant and normalise to [0, SrcBitLen)
   // If opcode is FSHL, convert it to FSHR
   if (auto *ShiftNo = dyn_cast<ConstantSDNode>(Shifts)) {
     SDLoc DL(Op);
     MVT VT = Op.getSimpleValueType();
+    unsigned int NewShiftNo = ShiftNo->getZExtValue() % VT.getFixedSizeInBits();
 
     if (Op.getOpcode() == ISD::FSHL) {
-      unsigned int NewShiftNo =
-          VT.getFixedSizeInBits() - ShiftNo->getZExtValue();
+      if (NewShiftNo == 0)
+        return Op.getOperand(0);
+
+      NewShiftNo = VT.getFixedSizeInBits() - NewShiftNo;
+      return DAG.getNode(
+          ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
+          DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
+    }
+
+    if (Op.getOpcode() == ISD::FSHR) {
+      if (NewShiftNo == 0)
+        return Op.getOperand(1);
+
+      if (ShiftNo->getZExtValue() == NewShiftNo)
+        return Op;
+
+      // Rewrite using the normalised shift amount.
       return DAG.getNode(
           ISD::FSHR, DL, VT, Op.getOperand(0), Op.getOperand(1),
          DAG.getConstant(NewShiftNo, DL, Shifts.getValueType()));
-    } else if (Op.getOpcode() == ISD::FSHR) {
-      return Op;
     }
   }
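As a worked example of the new path, a small standalone helper (illustrative only, not part of the patch) computes the FSHR amount that the FSHL rewrite produces; its results line up with the extr immediates checked in the test below:

// Illustrative helper: the constant-amount normalisation LowerFunnelShift
// now applies when turning FSHL into FSHR. Returns the FSHR amount, or 0
// when the FSHL is a NOP that simply returns operand 0.
unsigned fshlToFshrAmount(unsigned BitWidth, uint64_t Amt) {
  unsigned S = Amt % BitWidth;      // normalise to [0, BitWidth)
  return S == 0 ? 0 : BitWidth - S; // non-zero FSHL s == FSHR (width - s)
}
// fshlToFshrAmount(32, 33) == 31  ->  "extr w0, w1, w2, #31"
// fshlToFshrAmount(32, 16) == 16  ->  "extr w0, w1, w2, #16"
// fshlToFshrAmount(64, 65) == 63  ->  "extr x0, x1, x2, #63"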

Lines changed: 134 additions & 0 deletions
@@ -0,0 +1,134 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc %s -o - | FileCheck %s
+; RUN: llc -combiner-disabled %s -o - | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; Verify lowering code in isolation to ensure we can lower shifts that would
+; normally be optimised away.
+
+define i32 @fshl_i32_by_zero(i32 %unused, i32 %a, i32 %b) {
+; CHECK-LABEL: fshl_i32_by_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    ret
+  %r = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 0)
+  ret i32 %r
+}
+
+define i32 @fshl_i32_by_half_srclen(i32 %unused, i32 %a, i32 %b) {
+; CHECK-LABEL: fshl_i32_by_half_srclen:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    extr w0, w1, w2, #16
+; CHECK-NEXT:    ret
+  %r = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 16)
+  ret i32 %r
+}
+
+define i32 @fshl_i32_by_srclen(i32 %unused, i32 %a, i32 %b) {
+; CHECK-LABEL: fshl_i32_by_srclen:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, w1
+; CHECK-NEXT:    ret
+  %r = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 32)
+  ret i32 %r
+}
+
+define i32 @fshl_i32_by_srclen_plus1(i32 %unused, i32 %a, i32 %b) {
+; CHECK-LABEL: fshl_i32_by_srclen_plus1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    extr w0, w1, w2, #31
+; CHECK-NEXT:    ret
+  %r = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 33)
+  ret i32 %r
+}
+
+define i64 @fshl_i64_by_zero(i64 %unused, i64 %a, i64 %b) {
+; CHECK-LABEL: fshl_i64_by_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, x1
+; CHECK-NEXT:    ret
+  %r = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 0)
+  ret i64 %r
+}
+
+define i64 @fshl_i64_by_srclen(i64 %unused, i64 %a, i64 %b) {
+; CHECK-LABEL: fshl_i64_by_srclen:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, x1
+; CHECK-NEXT:    ret
+  %r = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 64)
+  ret i64 %r
+}
+
+define i64 @fshl_i64_by_srclen_plus1(i64 %unused, i64 %a, i64 %b) {
+; CHECK-LABEL: fshl_i64_by_srclen_plus1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    extr x0, x1, x2, #63
+; CHECK-NEXT:    ret
+  %r = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 65)
+  ret i64 %r
+}
+
+define i32 @fshr_i32_by_zero(i32 %unused, i32 %a, i32 %b) {
+; CHECK-LABEL: fshr_i32_by_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, w2
+; CHECK-NEXT:    ret
+  %r = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 0)
+  ret i32 %r
+}
+
+define i32 @fshr_i32_by_srclen(i32 %unused, i32 %a, i32 %b) {
+; CHECK-LABEL: fshr_i32_by_srclen:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w0, w2
+; CHECK-NEXT:    ret
+  %r = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 32)
+  ret i32 %r
+}
+
+define i32 @fshr_i32_by_half_srclen(i32 %unused, i32 %a, i32 %b) {
+; CHECK-LABEL: fshr_i32_by_half_srclen:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    extr w0, w1, w2, #16
+; CHECK-NEXT:    ret
+  %r = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 16)
+  ret i32 %r
+}
+
+define i32 @fshr_i32_by_srclen_plus1(i32 %unused, i32 %a, i32 %b) {
+; CHECK-LABEL: fshr_i32_by_srclen_plus1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    extr w0, w1, w2, #1
+; CHECK-NEXT:    ret
+  %r = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 33)
+  ret i32 %r
+}
+
+define i64 @fshr_i64_by_zero(i64 %unused, i64 %a, i64 %b) {
+; CHECK-LABEL: fshr_i64_by_zero:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    ret
+  %r = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 0)
+  ret i64 %r
+}
+
+define i64 @fshr_i64_by_srclen(i64 %unused, i64 %a, i64 %b) {
+; CHECK-LABEL: fshr_i64_by_srclen:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov x0, x2
+; CHECK-NEXT:    ret
+  %r = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 64)
+  ret i64 %r
+}
+
+define i64 @fshr_i64_by_srclen_plus1(i64 %unused, i64 %a, i64 %b) {
+; CHECK-LABEL: fshr_i64_by_srclen_plus1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    extr x0, x1, x2, #1
+; CHECK-NEXT:    ret
+  %r = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 65)
+  ret i64 %r
+}
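For context on the CHECK lines above: AArch64's extr instruction extracts a register-width field from the concatenation of two registers, which is exactly a funnel shift right, so constant-amount FSHR selects to a single extr. A minimal model (extr32 is an illustrative name, not an LLVM API):

#include <cstdint>

// Model of "extr wd, wn, wm, #lsb": bits [lsb+31 : lsb] of the 64-bit
// concatenation wn:wm -- i.e. a 32-bit funnel shift right by lsb.
static uint32_t extr32(uint32_t wn, uint32_t wm, unsigned lsb) {
  uint64_t concat = ((uint64_t)wn << 32) | (uint64_t)wm;
  return (uint32_t)(concat >> lsb); // lsb must be in [0, 31]
}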
