Skip to content

Commit f7848c8

Browse files
Artem Gindinsonigcbot
authored andcommitted
Improve transformation of mul by power-of-2 constants
This mul-to-shl optimization was initially introduced in commit d2e586f. From a functional standpoint, this change addresses the fundamental issue with `nuw`/`nsw` tags being discarded altogether, when we should in fact respect the tags for all of the newly generated instructions. We should also skip the transformation for 1/-1, as apparently this can skew some `mad`-based optimization opportunities in i64 emulation cases. Tests are adjusted and expanded accordingly. Additionally, early exits are employed where appropriate, with more detailed TODOs for future improvements.
1 parent d9d8a53 commit f7848c8

File tree

5 files changed

+285
-148
lines changed

5 files changed

+285
-148
lines changed

IGC/Compiler/CustomSafeOptPass.cpp

Lines changed: 72 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,6 @@ cmp+sel to avoid expensive VxH mov.
6262
#include "GenISAIntrinsics/GenIntrinsicInst.h"
6363
#include "common/IGCConstantFolder.h"
6464
#include "common/LLVMWarningsPush.hpp"
65-
#include "llvm/Config/llvm-config.h"
66-
#include <llvmWrapper/ADT/APInt.h>
6765
#include "llvmWrapper/IR/IntrinsicInst.h"
6866
#include <llvmWrapper/IR/DIBuilder.h>
6967
#include <llvmWrapper/IR/DerivedTypes.h>
@@ -73,6 +71,7 @@ cmp+sel to avoid expensive VxH mov.
7371
#include <llvm/ADT/Statistic.h>
7472
#include <llvm/ADT/SetVector.h>
7573
#include <llvm/Analysis/ConstantFolding.h>
74+
#include <llvm/Analysis/InstructionSimplify.h>
7675
#include <llvm/IR/Constants.h>
7776
#include "llvm/IR/DebugInfo.h"
7877
#include <llvm/IR/Function.h>
@@ -2737,50 +2736,80 @@ void GenSpecificPattern::visitMul(llvm::BinaryOperator& I)
27372736
Value* ValOp = nullptr;
27382737
const APInt* ConstOp = nullptr;
27392738
using namespace llvm::PatternMatch;
2740-
if (match(&I, m_c_Mul(m_Value(ValOp), m_APInt(ConstOp))))
2741-
{
2742-
IRBuilder<> builder(&I);
2743-
if (ConstOp->isPowerOf2())
2744-
{
2745-
I.replaceAllUsesWith(
2746-
builder.CreateShl(ValOp, (uint64_t)ConstOp->exactLogBase2()));
2747-
I.eraseFromParent();
2748-
return;
2749-
}
2750-
else if (!IGCLLVM::isNegatedPowerOf2(*ConstOp))
2751-
return;
2739+
if (!match(&I, m_c_Mul(m_Value(ValOp), m_APInt(ConstOp))))
2740+
return;
2741+
2742+
const bool HasNUW = I.hasNoUnsignedWrap();
2743+
const bool HasNSW = I.hasNoSignedWrap();
2744+
IRBuilder<> builder(&I);
2745+
// 0. Skip the optimization for mul(x, {1; -1}) - in most cases, such
2746+
// instances are generated within i64 emulation sequences, and the emitter
2747+
// is geared toward such patterns.
2748+
if (ConstOp->isOne() || ConstOp->isAllOnes())
2749+
return;
27522750

2753-
APInt ConstOpAbs = ConstOp->abs();
2751+
// 1. 2^n case
2752+
if (ConstOp->isPowerOf2())
2753+
{
27542754
Value* Shl = builder.CreateShl(
2755-
ValOp, (uint64_t)ConstOpAbs.exactLogBase2());
2756-
for (User* UI : I.users())
2757-
{
2758-
// We're going a little further and making sure that we merge the
2759-
// shift result's negation with any subsequent adds:
2760-
// '%shift = shl i64 %x, n'
2761-
// '%res' = sub %var, %shift
2762-
// preventing the additional negation in between the shift and the
2763-
// `var` addition. This kind of a peephole optimization may not be
2764-
// available later down the pipeline.
2765-
// TODO: Consider moving this logic to add/sub visitors once the
2766-
// whole GenSpecificPattern iteration logic is guaranteed to allow
2767-
// deferred instructions.
2768-
Value* Addend = nullptr;
2769-
if (match(UI, m_c_Add(m_Specific(&I), m_Value(Addend))))
2770-
{
2771-
// Propagate the 'shl' <- 'sub' results instead. No way to
2772-
// erase the original adds within this pass just yet, as we'd
2773-
// be invalidating the InstVisitor iteration, but subsequent
2774-
// DCE instances will handle the orphaned instructions anyway.
2775-
builder.SetInsertPoint(cast<Instruction>(UI));
2776-
UI->replaceAllUsesWith(builder.CreateSub(Addend, Shl));
2777-
}
2778-
}
2779-
// Make sure to reset the insertion point for the shl negation after
2780-
// the possible transformations above
2781-
builder.SetInsertPoint(&I);
2782-
I.replaceAllUsesWith(builder.CreateNeg(Shl));
2755+
ValOp, (uint64_t)ConstOp->exactLogBase2(), "",
2756+
/*bool NUW=*/HasNUW, /*bool NSW=*/HasNSW);
2757+
I.replaceAllUsesWith(Shl);
27832758
I.eraseFromParent();
2759+
return;
2760+
}
2761+
2762+
// 2. -2^n case
2763+
if (!ConstOp->isNegatedPowerOf2())
2764+
return;
2765+
APInt ConstOpAbs = ConstOp->abs();
2766+
Value* Shl = builder.CreateShl(
2767+
ValOp, (uint64_t)ConstOpAbs.exactLogBase2(), "",
2768+
/*bool NUW=*/HasNUW, /*bool NSW=*/HasNSW);
2769+
// NB: We should retain NUW infromation in our case, so a simpler
2770+
// IRBuilder<>::CreateNeg doesn't cut it.
2771+
Value* Sub = builder.CreateSub(Constant::getNullValue(Shl->getType()), Shl,
2772+
"", /*bool NUW=*/HasNUW,/*bool NSW=*/HasNSW);
2773+
I.replaceAllUsesWith(Sub);
2774+
I.eraseFromParent();
2775+
2776+
// Go a little further and make sure to merge the shift result's negation
2777+
// with any subsequent adds:
2778+
// '%shift = shl i64 %x, n'
2779+
// '%res' = sub %var, %shift
2780+
// preventing the additional negation in between the shift and `var`
2781+
// addition.
2782+
//
2783+
// TODO: Consider also folding sdiv/srem uses with a constant operand by
2784+
// negating said operand and using the 'shl' result directly. No reason to
2785+
// consider 'mul's though - in case of matching overflow flags, we'd expect
2786+
// these to be constant-folded with the original 'mul' earlier in the
2787+
// pipeline.
2788+
//
2789+
// TODO: Consider re-using an LLVM routine of some sort for this
2790+
// optimization. That said, on LLVM 14, llvm::SimplifyBinOp() and
2791+
// higher-level IRBuilder<InstSimplifyFolder>::CreateAdd()/CreateBinOp()
2792+
// replacements fail to work out of the box.
2793+
// Alternatively, move this logic to add/sub visitors once the whole
2794+
// GenSpecificPattern iteration logic allows for deferred instructions.
2795+
for (User* UI : Sub->users())
2796+
{
2797+
Value* Addend = nullptr;
2798+
if (!match(UI, m_c_Add(m_Specific(Sub), m_Value(Addend))))
2799+
continue;
2800+
// Propagate the 'shl' <- 'sub' results instead. No way to erase the
2801+
// original adds within this pass just yet, as we'd invalidate the
2802+
// ongoing InstVisitor iteration, however subsequent DCE instances will
2803+
// handle the orphaned instructions anyway.
2804+
auto* AddI = cast<Instruction>(UI);
2805+
if (AddI->hasNoUnsignedWrap() != HasNUW ||
2806+
AddI->hasNoSignedWrap() != HasNSW)
2807+
continue;
2808+
2809+
builder.SetInsertPoint(AddI);
2810+
Value* Sub = builder.CreateSub(
2811+
Addend, Shl, "", /*bool NUW=*/HasNUW, /*bool NSW=*/HasNSW);
2812+
AddI->replaceAllUsesWith(Sub);
27842813
}
27852814
}
27862815

IGC/Compiler/tests/GenSpecificPattern/binary-typed-pointers.ll

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -102,50 +102,5 @@ define void @test_and64(double %src1) {
102102
ret void
103103
}
104104

105-
define void @test_simple_mul_pow2(i32 %arg) {
106-
%1 = mul i32 %arg, 16
107-
call void @use.i32(i32 %1)
108-
ret void
109-
}
110-
; CHECK-LABEL: @test_simple_mul_pow2(i32 %arg
111-
; CHECK: %[[SHL:.+]] = shl i32 %arg, 4
112-
; CHECK: call void @use.i32(i32 %[[SHL]])
113-
; CHECK: ret void
114-
115-
define void @test_mul_neg_pow2_with_adds(i32 %arg0, i32 %arg1, i32 %arg2) {
116-
entry:
117-
; COM: The mul itself + simple use
118-
%0 = mul i32 %arg0, -1024
119-
call void @use.i32(i32 %0)
120-
br label %.add1
121-
.add1:
122-
; COM: mul + addend
123-
%1 = add i32 %0, %arg1
124-
call void @use.i32(i32 %1)
125-
br label %.add2
126-
.add2:
127-
; COM: addend + mul
128-
%2 = add i32 %arg2, %0
129-
call void @use.i32(i32 %2)
130-
ret void
131-
}
132-
; CHECK-LABEL: @test_mul_neg_pow2_with_adds(i32 %arg0, i32 %arg1, i32 %arg2
133-
; COM: The mul itself + simple use
134-
; CHECK: entry:
135-
; CHECK: %[[SHL:.+]] = shl i32 %arg0, 10
136-
; CHECK: %[[NEG:.+]] = sub i32 0, %[[SHL]]
137-
; CHECK: call void @use.i32(i32 %[[NEG]])
138-
; CHECK: br label %.add1
139-
; COM: mul + addend -> addend - shl
140-
; CHECK: .add1:
141-
; CHECK: %[[SUB1:.+]] = sub i32 %arg1, %[[SHL]]
142-
; CHECK: call void @use.i32(i32 %[[SUB1]])
143-
; CHECK: br label %.add2
144-
; COM: addend + mul -> addend - shl
145-
; CHECK: .add2:
146-
; CHECK: %[[SUB2:.+]] = sub i32 %arg2, %[[SHL]]
147-
; CHECK: call void @use.i32(i32 %[[SUB2]])
148-
; CHECK: ret void
149-
150105
declare void @use.i64(i64)
151106
declare void @use.i32(i32)

IGC/Compiler/tests/GenSpecificPattern/binary.ll

Lines changed: 0 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -104,50 +104,5 @@ define void @test_and64(double %src1) {
104104
ret void
105105
}
106106

107-
define void @test_simple_mul_pow2(i32 %arg) {
108-
%1 = mul i32 %arg, 16
109-
call void @use.i32(i32 %1)
110-
ret void
111-
}
112-
; CHECK-LABEL: @test_simple_mul_pow2(i32 %arg
113-
; CHECK: %[[SHL:.+]] = shl i32 %arg, 4
114-
; CHECK: call void @use.i32(i32 %[[SHL]])
115-
; CHECK: ret void
116-
117-
define void @test_mul_neg_pow2_with_adds(i32 %arg0, i32 %arg1, i32 %arg2) {
118-
entry:
119-
; COM: The mul itself + simple use
120-
%0 = mul i32 %arg0, -1024
121-
call void @use.i32(i32 %0)
122-
br label %.add1
123-
.add1:
124-
; COM: mul + addend
125-
%1 = add i32 %0, %arg1
126-
call void @use.i32(i32 %1)
127-
br label %.add2
128-
.add2:
129-
; COM: addend + mul
130-
%2 = add i32 %arg2, %0
131-
call void @use.i32(i32 %2)
132-
ret void
133-
}
134-
; CHECK-LABEL: @test_mul_neg_pow2_with_adds(i32 %arg0, i32 %arg1, i32 %arg2
135-
; COM: The mul itself + simple use
136-
; CHECK: entry:
137-
; CHECK: %[[SHL:.+]] = shl i32 %arg0, 10
138-
; CHECK: %[[NEG:.+]] = sub i32 0, %[[SHL]]
139-
; CHECK: call void @use.i32(i32 %[[NEG]])
140-
; CHECK: br label %.add1
141-
; COM: mul + addend -> addend - shl
142-
; CHECK: .add1:
143-
; CHECK: %[[SUB1:.+]] = sub i32 %arg1, %[[SHL]]
144-
; CHECK: call void @use.i32(i32 %[[SUB1]])
145-
; CHECK: br label %.add2
146-
; COM: addend + mul -> addend - shl
147-
; CHECK: .add2:
148-
; CHECK: %[[SUB2:.+]] = sub i32 %arg2, %[[SHL]]
149-
; CHECK: call void @use.i32(i32 %[[SUB2]])
150-
; CHECK: ret void
151-
152107
declare void @use.i64(i64)
153108
declare void @use.i32(i32)

0 commit comments

Comments
 (0)