[SDAG] SimplifyDemandedBits - generalize fold for 2 LSB of X*X

rotateright · rotateright · commit fc6bee1c11d4 · 2022-02-07T15:38:50.000-05:00
This is translated from recent changes to the IR version of this function
(see Phabricator reviews):
D119060
D119139
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2265,9 +2265,14 @@ bool TargetLowering::SimplifyDemandedBits(
     break;
   }
   case ISD::MUL:
-    // 'Quadratic Reciprocity': mul(x,x) -> 0 if we're only demanding bit[1]
-    if (DemandedBits == 2 && Op.getOperand(0) == Op.getOperand(1))
-      return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT));
+    // For a squared value "X * X", the bottom 2 bits are 0 and X[0] because:
+    // X * X is odd iff X is odd.
+    // 'Quadratic Reciprocity': X * X -> 0 for bit[1]
+    if (Op.getOperand(0) == Op.getOperand(1) && DemandedBits.ult(4)) {
+      SDValue One = TLO.DAG.getConstant(1, dl, VT);
+      SDValue And1 = TLO.DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), One);
+      return TLO.CombineTo(Op, And1);
+    }
     LLVM_FALLTHROUGH;
   case ISD::ADD:
   case ISD::SUB: {
diff --git a/llvm/test/CodeGen/AArch64/combine-mul.ll b/llvm/test/CodeGen/AArch64/combine-mul.ll
@@ -108,8 +108,7 @@ define i32 @one_demanded_low_bit(i32 %x) {
 define i16 @squared_one_demanded_low_bit(i16 %x) {
 ; CHECK-LABEL: squared_one_demanded_low_bit:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w8, w0, w0
-; CHECK-NEXT:    and w0, w8, #0x1
+; CHECK-NEXT:    and w0, w0, #0x1
 ; CHECK-NEXT:    ret
   %mul = mul i16 %x, %x
   %and = and i16 %mul, 1
@@ -120,7 +119,6 @@ define <4 x i32> @squared_one_demanded_low_bit_splat(<4 x i32> %x) {
 ; CHECK-LABEL: squared_one_demanded_low_bit_splat:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    mvni v1.4s, #1
-; CHECK-NEXT:    mul v0.4s, v0.4s, v0.4s
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret
   %mul = mul <4 x i32> %x, %x
@@ -131,8 +129,7 @@ define <4 x i32> @squared_one_demanded_low_bit_splat(<4 x i32> %x) {
 define i32 @squared_demanded_2_low_bits(i32 %x) {
 ; CHECK-LABEL: squared_demanded_2_low_bits:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    mul w8, w0, w0
-; CHECK-NEXT:    and w0, w8, #0x3
+; CHECK-NEXT:    and w0, w0, #0x1
 ; CHECK-NEXT:    ret
   %mul = mul i32 %x, %x
   %and = and i32 %mul, 3
@@ -142,13 +139,7 @@ define i32 @squared_demanded_2_low_bits(i32 %x) {
 define <2 x i64> @squared_demanded_2_low_bits_splat(<2 x i64> %x) {
 ; CHECK-LABEL: squared_demanded_2_low_bits_splat:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    fmov x8, d0
-; CHECK-NEXT:    mov x9, v0.d[1]
-; CHECK-NEXT:    mul x8, x8, x8
-; CHECK-NEXT:    mul x9, x9, x9
-; CHECK-NEXT:    fmov d0, x8
 ; CHECK-NEXT:    mov x8, #-2
-; CHECK-NEXT:    mov v0.d[1], x9
 ; CHECK-NEXT:    dup v1.2d, x8
 ; CHECK-NEXT:    orr v0.16b, v0.16b, v1.16b
 ; CHECK-NEXT:    ret