Re-apply #130577: Narrow math for and operand #133896


Merged

Conversation

Shoreshen
Contributor

@Shoreshen Shoreshen commented Apr 1, 2025

Re-apply #130577, which was reverted in #133880.

The old patch failed under AddressSanitizer because `tryNarrowMathIfNoOverflow` was called after `I.eraseFromParent();` in `AMDGPUCodeGenPrepareImpl::visitBinaryOperator`, creating a use-after-free.

To fix this, `tryNarrowMathIfNoOverflow` is now called earlier in `visitBinaryOperator`, and the function returns immediately when `tryNarrowMathIfNoOverflow` succeeds.
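For illustration, here is a minimal sketch of the new ordering; it is not the verbatim merged code (see the diff below for the actual change):

```cpp
// Sketch of the reordered check in visitBinaryOperator (illustrative only).
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;

  // Fixed placement: run the narrowing first and bail out on success.
  // tryNarrowMathIfNoOverflow erases `I` itself, and the reverted version
  // called it after a path that may already have run I.eraseFromParent(),
  // which is what AddressSanitizer flagged as a use-after-free.
  if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
                                TM.getTargetTransformInfo(F), DL))
    return true;

  // ... the existing div/rem expansion follows and may erase `I` ...
  bool Changed = false;
  // ...
  return Changed;
}
```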

@Shoreshen Shoreshen requested review from shiltian and arsenm April 1, 2025 10:55
@llvmbot
Member

llvmbot commented Apr 2, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: None (Shoreshen)

Changes


Patch is 21.36 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133896.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp (+84)
  • (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll (+4-1)
  • (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll (+24-28)
  • (added) llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll (+231)
  • (modified) llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll (+4-5)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 9c482aeb3ea5c..b0914c17b9827 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -1561,6 +1561,87 @@ void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
   llvm_unreachable("not a division");
 }
 
+Type *findSmallestLegalBits(Instruction *I, int OrigBit, int MaxBitsNeeded,
+                            const TargetLowering *TLI, const DataLayout &DL) {
+  if (MaxBitsNeeded >= OrigBit)
+    return nullptr;
+
+  Type *NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
+  while (OrigBit > MaxBitsNeeded) {
+    if (TLI->isOperationLegalOrCustom(
+            TLI->InstructionOpcodeToISD(I->getOpcode()),
+            TLI->getValueType(DL, NewType, true)))
+      return NewType;
+
+    MaxBitsNeeded *= 2;
+    NewType = I->getType()->getWithNewBitWidth(MaxBitsNeeded);
+  }
+  return nullptr;
+}
+
+static bool tryNarrowMathIfNoOverflow(Instruction *I, const TargetLowering *TLI,
+                                      const TargetTransformInfo &TTI,
+                                      const DataLayout &DL) {
+  unsigned Opc = I->getOpcode();
+  Type *OldType = I->getType();
+
+  if (Opc != Instruction::Add && Opc != Instruction::Mul)
+    return false;
+
+  unsigned OrigBit = OldType->getScalarSizeInBits();
+  unsigned MaxBitsNeeded = OrigBit;
+
+  switch (Opc) {
+  case Instruction::Add:
+    MaxBitsNeeded = KnownBits::add(computeKnownBits(I->getOperand(0), DL),
+                                   computeKnownBits(I->getOperand(1), DL))
+                        .countMaxActiveBits();
+    break;
+  case Instruction::Mul:
+    MaxBitsNeeded = KnownBits::mul(computeKnownBits(I->getOperand(0), DL),
+                                   computeKnownBits(I->getOperand(1), DL))
+                        .countMaxActiveBits();
+    break;
+  default:
+    llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
+                     "Instruction::Mul.");
+  }
+
+  MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
+  Type *NewType = findSmallestLegalBits(I, OrigBit, MaxBitsNeeded, TLI, DL);
+
+  if (!NewType)
+    return false;
+
+  // Old cost
+  InstructionCost OldCost =
+      TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
+  // New cost of new op
+  InstructionCost NewCost =
+      TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
+  // New cost of narrowing 2 operands (use trunc)
+  NewCost += 2 * TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
+                                      TTI.getCastContextHint(I),
+                                      TTI::TCK_RecipThroughput);
+  // New cost of zext narrowed result to original type
+  NewCost +=
+      TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
+                           TTI.getCastContextHint(I), TTI::TCK_RecipThroughput);
+  if (NewCost >= OldCost)
+    return false;
+
+  IRBuilder<> Builder(I);
+  Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
+  Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
+  Value *Arith =
+      Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
+
+  Value *Zext = Builder.CreateZExt(Arith, OldType);
+  I->replaceAllUsesWith(Zext);
+  I->eraseFromParent();
+  return true;
+}
+
 bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
   if (foldBinOpIntoSelect(I))
     return true;
@@ -1571,6 +1652,9 @@ bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
 
   if (UseMul24Intrin && replaceMulWithMul24(I))
     return true;
+  if (tryNarrowMathIfNoOverflow(&I, ST.getTargetLowering(),
+                                TM.getTargetTransformInfo(F), DL))
+    return true;
 
   bool Changed = false;
   Instruction::BinaryOps Opc = I.getOpcode();
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
index 296b817bc8f75..d7c35a8b007c6 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
@@ -414,7 +414,10 @@ define i64 @umul24_i64_2(i64 %lhs, i64 %rhs) {
 ; DISABLED-LABEL: @umul24_i64_2(
 ; DISABLED-NEXT:    [[LHS24:%.*]] = and i64 [[LHS:%.*]], 65535
 ; DISABLED-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 65535
-; DISABLED-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; DISABLED-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; DISABLED-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; DISABLED-NEXT:    [[TMP3:%.*]] = mul i32 [[TMP1]], [[TMP2]]
+; DISABLED-NEXT:    [[MUL:%.*]] = zext i32 [[TMP3]] to i64
 ; DISABLED-NEXT:    ret i64 [[MUL]]
 ;
   %lhs24 = and i64 %lhs, 65535
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 62083b3e67ab6..e2dfcf55b7856 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -1823,22 +1823,22 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264:       ; %bb.0: ; %entry
 ; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1264-NEXT:    s_mov_b32 s9, 0
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1264-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX1264-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1264-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1264-NEXT:    s_wait_alu 0xfffe
+; GFX1264-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT:    s_mul_u64 s[6:7], s[8:9], 5
-; GFX1264-NEXT:    s_mov_b32 s10, -1
 ; GFX1264-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1264-NEXT:    s_mov_b32 s10, -1
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mov_b32 s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
@@ -1860,20 +1860,19 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-LABEL: add_i64_constant:
 ; GFX1232:       ; %bb.0: ; %entry
 ; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1232-NEXT:    s_mov_b32 s5, 0
-; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v2, s7, 0
 ; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1232-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
 ; GFX1232-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1232-NEXT:    s_cbranch_execz .LBB3_2
 ; GFX1232-NEXT:  ; %bb.1:
-; GFX1232-NEXT:    s_bcnt1_i32_b32 s4, s7
+; GFX1232-NEXT:    s_bcnt1_i32_b32 s5, s6
 ; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT:    s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s8, s2
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
@@ -1881,8 +1880,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB3_2:
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s3, v1
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
@@ -5372,22 +5370,22 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1264:       ; %bb.0: ; %entry
 ; GFX1264-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1264-NEXT:    s_mov_b64 s[6:7], exec
-; GFX1264-NEXT:    s_mov_b32 s9, 0
-; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1264-NEXT:    s_mov_b64 s[4:5], exec
+; GFX1264-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
 ; GFX1264-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1264-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
 ; GFX1264-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1264-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1264-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1264-NEXT:  ; %bb.1:
-; GFX1264-NEXT:    s_bcnt1_i32_b64 s8, s[6:7]
+; GFX1264-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX1264-NEXT:    v_mov_b32_e32 v1, 0
+; GFX1264-NEXT:    s_wait_alu 0xfffe
+; GFX1264-NEXT:    s_mul_i32 s6, s6, 5
 ; GFX1264-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1264-NEXT:    s_mul_u64 s[6:7], s[8:9], 5
-; GFX1264-NEXT:    s_mov_b32 s10, -1
 ; GFX1264-NEXT:    s_wait_alu 0xfffe
 ; GFX1264-NEXT:    v_mov_b32_e32 v0, s6
-; GFX1264-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1264-NEXT:    s_mov_b32 s10, -1
 ; GFX1264-NEXT:    s_wait_kmcnt 0x0
 ; GFX1264-NEXT:    s_mov_b32 s8, s2
 ; GFX1264-NEXT:    s_mov_b32 s9, s3
@@ -5412,20 +5410,19 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-LABEL: sub_i64_constant:
 ; GFX1232:       ; %bb.0: ; %entry
 ; GFX1232-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX1232-NEXT:    s_mov_b32 s7, exec_lo
-; GFX1232-NEXT:    s_mov_b32 s5, 0
-; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v2, s7, 0
 ; GFX1232-NEXT:    s_mov_b32 s6, exec_lo
+; GFX1232-NEXT:    s_mov_b32 s4, exec_lo
+; GFX1232-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
 ; GFX1232-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GFX1232-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1232-NEXT:    v_cmpx_eq_u32_e32 0, v2
 ; GFX1232-NEXT:    s_cbranch_execz .LBB9_2
 ; GFX1232-NEXT:  ; %bb.1:
-; GFX1232-NEXT:    s_bcnt1_i32_b32 s4, s7
+; GFX1232-NEXT:    s_bcnt1_i32_b32 s5, s6
 ; GFX1232-NEXT:    s_mov_b32 s11, 0x31016000
-; GFX1232-NEXT:    s_mul_u64 s[4:5], s[4:5], 5
+; GFX1232-NEXT:    s_mul_i32 s5, s5, 5
 ; GFX1232-NEXT:    s_mov_b32 s10, -1
-; GFX1232-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1232-NEXT:    v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    s_mov_b32 s8, s2
 ; GFX1232-NEXT:    s_mov_b32 s9, s3
@@ -5433,8 +5430,7 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX1232-NEXT:    s_wait_loadcnt 0x0
 ; GFX1232-NEXT:    global_inv scope:SCOPE_DEV
 ; GFX1232-NEXT:  .LBB9_2:
-; GFX1232-NEXT:    s_wait_alu 0xfffe
-; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s6
+; GFX1232-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX1232-NEXT:    s_wait_kmcnt 0x0
 ; GFX1232-NEXT:    v_readfirstlane_b32 s2, v0
 ; GFX1232-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
diff --git a/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
new file mode 100644
index 0000000000000..3f49b1e550595
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/narrow_math_for_and.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck %s
+
+define i64 @narrow_add(i64 %a, i64 %b) {
+; CHECK-LABEL: narrow_add:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_nc_u32 v0, v0, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483647
+  %zext1 = and i64 %b, 2147483647
+  %add = add i64 %zext0, %zext1
+  ret i64 %add
+}
+
+define i64 @narrow_add_1(i64 %a, i64 %b) {
+; CHECK-LABEL: narrow_add_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, v2
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483647
+  %zext1 = and i64 %b, 2147483648
+  %add = add i64 %zext0, %zext1
+  ret i64 %add
+}
+
+define <2 x i64> @narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: narrow_add_vec:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x7fffffff, v4
+; CHECK-NEXT:    v_and_b32_e32 v2, 30, v2
+; CHECK-NEXT:    v_and_b32_e32 v3, 0x7ffffffe, v6
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_add_co_u32 v2, s0, v2, v3
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i64> %a, <i64 2147483647, i64 30>
+  %zext1 = and <2 x i64> %b, <i64 2147483647, i64 2147483646>
+  %add = add <2 x i64> %zext0, %zext1
+  ret <2 x i64> %add
+}
+
+define <2 x i32> @narrow_add_vec_1(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: narrow_add_vec_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x3fff, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x4000, v0
+; CHECK-NEXT:    v_and_b32_e32 v3, 0x4001, v3
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x4000, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; CHECK-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_pk_add_u16 v1, v0, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0xc000, v1
+; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i32> %a, <i32 16384, i32 16383>
+  %zext1 = and <2 x i32> %b, <i32 16384, i32 16385>
+  %add = add <2 x i32> %zext0, %zext1
+  ret <2 x i32> %add
+}
+
+define i64 @narrow_mul(i64 %a, i64 %b) {
+; CHECK-LABEL: narrow_mul:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v1, 2, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483647
+  %zext1 = and i64 %b, 2
+  %mul = mul i64 %zext0, %zext1
+  ret i64 %mul
+}
+
+define i64 @narrow_mul_1(i64 %a, i64 %b) {
+; CHECK-LABEL: narrow_mul_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v1, 0xf73594, v0
+; CHECK-NEXT:    v_and_b32_e32 v2, 0x100, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_mul_u32_u24_e32 v0, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32_u24_e32 v1, v1, v2
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 16201108
+  %zext1 = and i64 %b, 256
+  %mul = mul i64 %zext0, %zext1
+  ret i64 %mul
+}
+
+define <2 x i64> @narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: narrow_mul_vec:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x2d48aff, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x50, v4
+; CHECK-NEXT:    v_and_b32_e32 v3, 50, v2
+; CHECK-NEXT:    v_and_b32_e32 v4, 20, v6
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; CHECK-NEXT:    v_mul_lo_u32 v0, v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v1, 0
+; CHECK-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32_u24_e32 v3, v3, v4
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i64> %a, <i64 47483647, i64 50>
+  %zext1 = and <2 x i64> %b, <i64 80, i64 20>
+  %mul = mul <2 x i64> %zext0, %zext1
+  ret <2 x i64> %mul
+}
+
+define <2 x i32> @narrow_add_mul_1(<2 x i32> %a, <2 x i32> %b) #0 {
+; CHECK-LABEL: narrow_add_mul_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x4000, v1
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x4000, v0
+; CHECK-NEXT:    v_and_b32_e32 v2, 3, v2
+; CHECK-NEXT:    v_and_b32_e32 v3, 2, v3
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; CHECK-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; CHECK-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i32> %a, <i32 16384, i32 16384>
+  %zext1 = and <2 x i32> %b, <i32 3, i32 2>
+  %mul = mul <2 x i32> %zext0, %zext1
+  ret <2 x i32> %mul
+}
+
+define i64 @no_narrow_add(i64 %a, i64 %b) {
+; CHECK-LABEL: no_narrow_add:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x80000000, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483648
+  %zext1 = and i64 %b, 2147483648
+  %add = add i64 %zext0, %zext1
+  ret i64 %add
+}
+
+define i64 @no_narrow_add_1(i64 %a, i64 %b) {
+; CHECK-LABEL: no_narrow_add_1:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v1, 1, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 4294967295
+  %zext1 = and i64 %b, 1
+  %add = add i64 %zext0, %zext1
+  ret i64 %add
+}
+
+define <2 x i64> @no_narrow_add_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: no_narrow_add_vec:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x80000000, v4
+; CHECK-NEXT:    v_and_b32_e32 v2, 30, v2
+; CHECK-NEXT:    v_and_b32_e32 v3, 0x7ffffffe, v6
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_add_co_u32 v0, s0, v0, v1
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, 0, s0
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; CHECK-NEXT:    v_add_co_u32 v2, s0, v2, v3
+; CHECK-NEXT:    v_add_co_ci_u32_e64 v3, null, 0, 0, s0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and <2 x i64> %a, <i64 2147483648, i64 30>
+  %zext1 = and <2 x i64> %b, <i64 2147483648, i64 2147483646>
+  %add = add <2 x i64> %zext0, %zext1
+  ret <2 x i64> %add
+}
+
+define i64 @no_narrow_mul(i64 %a, i64 %b) {
+; CHECK-LABEL: no_narrow_mul:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; CHECK-NEXT:    v_and_b32_e32 v1, 2, v2
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
+; CHECK-NEXT:    v_mov_b32_e32 v0, 0
+; CHECK-NEXT:    s_setpc_b64 s[30:31]
+  %zext0 = and i64 %a, 2147483648
+  %zext1 = and i64 %b, 2
+  %mul = mul i64 %zext0, %zext1
+  ret i64 %mul
+}
+
+define <2 x i64> @no_narrow_mul_vec(<2 x i64> %a, <2 x i64> %b) #0 {
+; CHECK-LABEL: no_narrow_mul_vec:
+; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT:    v_and_b32_e32 v1, 0x8000, v0
+; CHECK-NEXT:    v_and_b32_e32 v3, 0x20000, v4
+; CHECK-NEXT:    v_and_b32_e32 v4, 50, v2
+; CHECK-NEXT:    v_and_b32_e32 v5, 20, v6
+; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; CHECK-NEXT:    v_mul_u32_u24_e32 v0, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32_u24_e32 v1, v1, v3
+; CHECK-NEXT:   ...
[truncated]

Comment on lines 1616 to 1629
  // Old cost
  InstructionCost OldCost =
      TTI.getArithmeticInstrCost(Opc, OldType, TTI::TCK_RecipThroughput);
  // New cost of new op
  InstructionCost NewCost =
      TTI.getArithmeticInstrCost(Opc, NewType, TTI::TCK_RecipThroughput);
  // New cost of narrowing 2 operands (use trunc)
  NewCost += 2 * TTI.getCastInstrCost(Instruction::Trunc, NewType, OldType,
                                      TTI.getCastContextHint(I),
                                      TTI::TCK_RecipThroughput);
  // New cost of zext narrowed result to original type
  NewCost +=
      TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
                           TTI.getCastContextHint(I), TTI::TCK_RecipThroughput);
Contributor

Can we move this whole thing back to the generic code?

Contributor Author

Hi @arsenm, there are some problems with moving this to generic code:

  1. If it is put in AggressiveInstCombine, the change will be reverted
  2. If it is put in the generic CodeGenPrepare, it will block some optimizations in other backends; please see the example

@@ -1561,6 +1561,87 @@ void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
  llvm_unreachable("not a division");
}

Type *findSmallestLegalBits(Instruction *I, int OrigBit, int MaxBitsNeeded,
Contributor

This seems to be reinventing TLI::getTypeToPromoteTo

Contributor

Or potentially integrate the logic there.
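For reference, a rough sketch of what querying that existing hook looks like; this is illustrative only (the opcode `ISD::ADD` and type `MVT::i16` below are example inputs, not values from this patch), and the query is only meaningful when the target's action for that pair is Promote:

```cpp
#include "llvm/CodeGen/TargetLowering.h"
using namespace llvm;

// Illustrative only (not part of this PR): ask the target which type it
// prefers to promote an operation to, instead of hand-rolling a widening loop.
static MVT examplePromotionQuery(const TargetLowering &TLI) {
  // getTypeToPromoteTo is only valid when the legalize action is Promote.
  if (TLI.getOperationAction(ISD::ADD, MVT::i16) == TargetLowering::Promote)
    return TLI.getTypeToPromoteTo(ISD::ADD, MVT::i16);
  return MVT::i16; // No promotion requested; keep the original type.
}
```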

Contributor Author

Hi @arsenm @shiltian, I've removed the function by avoiding the use of DAG-selection-related functionality...

As discussed with @arsenm, I'll use DataLayout::getSmallestLegalIntType instead. Thanks
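A minimal sketch of that alternative, reusing the OrigBit/MaxBitsNeeded values from the patch; this is an illustration of the DataLayout query, not the merged code, and it only covers the scalar-integer case:

```cpp
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Illustrative alternative to findSmallestLegalBits: ask DataLayout for the
// smallest legal integer type that can hold MaxBitsNeeded bits. Returns
// nullptr when no strictly narrower legal type exists; vector types would
// still need separate handling.
static Type *pickNarrowIntType(const DataLayout &DL, LLVMContext &Ctx,
                               unsigned OrigBit, unsigned MaxBitsNeeded) {
  if (MaxBitsNeeded >= OrigBit)
    return nullptr;
  if (IntegerType *NarrowTy = DL.getSmallestLegalIntType(Ctx, MaxBitsNeeded))
    if (NarrowTy->getBitWidth() < OrigBit)
      return NarrowTy;
  return nullptr;
}
```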

@Shoreshen Shoreshen requested a review from arsenm April 7, 2025 08:04
Contributor

@jmmartinez jmmartinez left a comment

Added a minor comment, everything else looks good to me.

@Shoreshen
Contributor Author

Hi @shiltian @arsenm, is there anything else I need to update for this PR? If not, I'm going to merge it once all tests pass. Thanks

@Shoreshen Shoreshen merged commit 121cd7c into llvm:main Apr 17, 2025
11 checks passed
var-const pushed a commit to ldionne/llvm-project that referenced this pull request Apr 17, 2025
Re-apply llvm#130577, which was reverted in llvm#133880.

The old patch failed under AddressSanitizer because
`tryNarrowMathIfNoOverflow` was called after `I.eraseFromParent();` in
`AMDGPUCodeGenPrepareImpl::visitBinaryOperator`, creating a use-after-free.

To fix this, `tryNarrowMathIfNoOverflow` is now called earlier and
`visitBinaryOperator` returns immediately when it succeeds.
Comment on lines +1563 to +1572
This will cause non-byte load inconsistency, for example:
```
%load = load i1, ptr addrspace(4) %arg, align 4
%zext = zext i1 %load to i64
%add = add i64 %zext, ...
```
Instead of creating `s_and_b32 s0, s0, 1`,
it will create `s_and_b32 s0, s0, 0xff`.
We accept this change since the non-byte load assumes the upper bits
within the byte are all 0.
Contributor

The comment is off topic for what this actually does (should also prefer C++-style comments)

within the byte are all 0.
*/
static bool tryNarrowMathIfNoOverflow(Instruction *I,
                                      const SITargetLowering *TLI,
Contributor

Can use references
