
Commit 8c63a11

Reduce shl64 to shl32 if shift range is [63-32]
Signed-off-by: John Lu <[email protected]>
1 parent 0c70a26 commit 8c63a11
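
The transform: when the shift amount of a 64-bit shl is provably in [32, 63], the low 32 bits of the result are always zero, so the shift can be performed in 32 bits and placed into the high word of the result. A before/after sketch in IR (value names illustrative; the exact emitted sequence is pinned down by the new test below):

    ; before: %amt known to lie in [32, 63]
    %shl = shl i64 %x, %amt

    ; after: 32-bit shift by (%amt - 32), placed in the high 32 bits
    %x.lo    = trunc i64 %x to i32
    %amt.lo  = trunc i64 %amt to i32
    %amt.adj = add i32 %amt.lo, -32
    %hi      = shl i32 %x.lo, %amt.adj
    %vec     = insertelement <2 x i32> <i32 0, i32 poison>, i32 %hi, i64 1
    %res     = bitcast <2 x i32> %vec to i64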

File tree: 6 files changed, 80 additions & 10 deletions

llvm/include/llvm/Transforms/InstCombine/InstCombiner.h

Lines changed: 2 additions & 0 deletions
@@ -521,6 +521,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner {
                                  bool AllowMultipleUsers = false) = 0;
 
   bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const;
+
+  bool shouldReduceShl64ToShl32();
 };
 
 } // namespace llvm

llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp

Lines changed: 29 additions & 0 deletions
@@ -1032,6 +1032,31 @@ static bool setShiftFlags(BinaryOperator &I, const SimplifyQuery &Q) {
   return Changed;
 }
 
+static Instruction *transformClampedShift64(BinaryOperator &I,
+                                            const SimplifyQuery &Q,
+                                            InstCombiner::BuilderTy &Builder) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  Type *I32Type = Type::getInt32Ty(I.getContext());
+  Type *I64Type = Type::getInt64Ty(I.getContext());
+
+  if (I.getType() == I64Type) {
+    KnownBits KnownAmt = computeKnownBits(Op1, /* Depth */ 0, Q);
+    if (KnownAmt.getMinValue().uge(32)) {
+      Value *TruncVal = Builder.CreateTrunc(Op0, I32Type);
+      Value *TruncShiftAmt = Builder.CreateTrunc(Op1, I32Type);
+      Value *AdjustedShiftAmt = Builder.CreateSub(TruncShiftAmt,
+                                                  ConstantInt::get(I32Type, 32));
+      Value *Shl32 = Builder.CreateShl(TruncVal, AdjustedShiftAmt);
+      Value *VResult = Builder.CreateVectorSplat(2, ConstantInt::get(I32Type, 0));
+
+      VResult = Builder.CreateInsertElement(VResult, Shl32,
+                                            ConstantInt::get(I32Type, 1));
+      return CastInst::Create(Instruction::BitCast, VResult, I64Type);
+    }
+  }
+  return nullptr;
+}
+
 Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
   const SimplifyQuery Q = SQ.getWithInstruction(&I);
 
@@ -1266,6 +1291,10 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
     }
   }
 
+  if (this->shouldReduceShl64ToShl32())
+    if (Instruction *V = transformClampedShift64(I, Q, Builder))
+      return V;
+
   return nullptr;
 }
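
Note how the result is rebuilt: a <2 x i32> vector with zero in element 0 and the 32-bit shift in element 1, then a bitcast to i64. On a little-endian target such as amdgcn, element 0 maps to the low 32 bits. A quick constant check of that layout assumption:

    ; little-endian: element 0 = low word, element 1 = high word
    %v = insertelement <2 x i32> <i32 0, i32 poison>, i32 1, i64 1
    %r = bitcast <2 x i32> %v to i64   ; %r == 1 << 32 == 4294967296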

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

Lines changed: 7 additions & 0 deletions
@@ -194,6 +194,13 @@ bool InstCombiner::isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
   return TTIForTargetIntrinsicsOnly.isValidAddrSpaceCast(FromAS, ToAS);
 }
 
+bool InstCombiner::shouldReduceShl64ToShl32() {
+  InstructionCost costShl32 = TTIForTargetIntrinsicsOnly.getArithmeticInstrCost(Instruction::Shl, Builder.getInt32Ty(), TTI::TCK_Latency);
+  InstructionCost costShl64 = TTIForTargetIntrinsicsOnly.getArithmeticInstrCost(Instruction::Shl, Builder.getInt64Ty(), TTI::TCK_Latency);
+
+  return costShl32 < costShl64;
+}
+
 Value *InstCombinerImpl::EmitGEPOffset(GEPOperator *GEP, bool RewriteGEP) {
   if (!RewriteGEP)
     return llvm::emitGEPOffset(&Builder, DL, GEP);
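
shouldReduceShl64ToShl32 gates the rewrite on the target's latency cost model rather than a dedicated hook: it fires only where the reported shl i32 latency is strictly lower than shl i64, which is presumably why the new test below pins an amdgcn triple. A hedged way to probe this by hand (shl64.ll is a hypothetical input file; the behavior on other triples is an assumption based on their reported costs):

    ; opt -S -passes=instcombine -mtriple=amdgcn-amd-amdhsa shl64.ll        => reduced to shl i32
    ; opt -S -passes=instcombine -mtriple=x86_64-unknown-linux-gnu shl64.ll => presumably unchanged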

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll

Lines changed: 2 additions & 2 deletions
@@ -174,7 +174,7 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) {
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
-; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
+; CHECK-NEXT:    v_and_b32_e32 v2, v42, v2
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
@@ -458,7 +458,7 @@ define double @test_pown_fast_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    s_waitcnt lgkmcnt(0)
 ; CHECK-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 31, v41
-; CHECK-NEXT:    v_and_b32_e32 v2, v2, v42
+; CHECK-NEXT:    v_and_b32_e32 v2, v42, v2
 ; CHECK-NEXT:    buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
 ; CHECK-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload

llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll

Lines changed: 9 additions & 8 deletions
@@ -720,14 +720,15 @@ define double @test_pown_afn_nnan_ninf_f64(double %x, i32 %y) {
 ; CHECK-NEXT:    [[POWNI2F:%.*]] = sitofp i32 [[Y]] to double
 ; CHECK-NEXT:    [[__YLOGX:%.*]] = fmul nnan ninf afn double [[__LOG2]], [[POWNI2F]]
 ; CHECK-NEXT:    [[__EXP2:%.*]] = call nnan ninf afn double @_Z4exp2d(double [[__YLOGX]])
-; CHECK-NEXT:    [[__YTOU:%.*]] = zext i32 [[Y]] to i64
-; CHECK-NEXT:    [[__YEVEN:%.*]] = shl i64 [[__YTOU]], 63
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast double [[X]] to i64
-; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i64 [[__YEVEN]], [[TMP0]]
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double [[__EXP2]] to i64
-; CHECK-NEXT:    [[TMP2:%.*]] = or i64 [[__POW_SIGN]], [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64 [[TMP2]] to double
-; CHECK-NEXT:    ret double [[TMP3]]
+; CHECK-NEXT:    [[TMP0:%.*]] = shl i32 [[Y]], 31
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> <i32 0, i32 poison>, i32 [[TMP0]], i64 1
+; CHECK-NEXT:    [[__YEVEN:%.*]] = bitcast <2 x i32> [[TMP1]] to i64
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast double [[X]] to i64
+; CHECK-NEXT:    [[__POW_SIGN:%.*]] = and i64 [[TMP2]], [[__YEVEN]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast double [[__EXP2]] to i64
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[__POW_SIGN]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i64 [[TMP4]] to double
+; CHECK-NEXT:    ret double [[TMP5]]
 ;
 entry:
   %call = tail call nnan ninf afn double @_Z4powndi(double %x, i32 %y)
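
The delta above is the new transform firing inside the pown sign-selection sequence: the shift amount is the constant 63, trivially within [32, 63], and the trunc of zext i32 %y to i64 folds back to %y, so the 64-bit shl by 63 collapses to a 32-bit shift by 63 - 32 = 31 whose result lands in the high word. A minimal sketch of the folding chain (names illustrative, not the test's captures):

    ; shl i64 (zext i32 %y to i64), 63  becomes:
    %t = shl i32 %y, 31                                              ; 63 - 32 = 31
    %v = insertelement <2 x i32> <i32 0, i32 poison>, i32 %t, i64 1
    %s = bitcast <2 x i32> %v to i64                                 ; only the sign bit survives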
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+
+target triple = "amdgcn-amd-amdhsa"
+
+define i64 @func_range(i64 noundef %arg0, ptr %arg1.ptr) {
+  %shift.amt = load i64, ptr %arg1.ptr, !range !0
+  %shl = shl i64 %arg0, %shift.amt
+  ret i64 %shl
+
+; CHECK: define i64 @func_range(i64 noundef %arg0, ptr %arg1.ptr) {
+; CHECK: %shift.amt = load i64, ptr %arg1.ptr, align 8, !range !0
+; CHECK: %1 = trunc i64 %arg0 to i32
+; CHECK: %2 = trunc nuw nsw i64 %shift.amt to i32
+; CHECK: %3 = add nsw i32 %2, -32
+; CHECK: %4 = shl i32 %1, %3
+; CHECK: %5 = insertelement <2 x i32> <i32 0, i32 poison>, i32 %4, i64 1
+; CHECK: %shl = bitcast <2 x i32> %5 to i64
+; CHECK: ret i64 %shl
+
+}
+!0 = !{i64 32, i64 64}
+
+define i64 @func_max(i64 noundef %arg0, i64 noundef %arg1) {
+  %max = call i64 @llvm.umax.i64(i64 %arg1, i64 32)
+  %min = call i64 @llvm.umin.i64(i64 %max, i64 63)
+  %shl = shl i64 %arg0, %min
+  ret i64 %shl
+}
+
