Skip to content

Commit 7a7d735

Browse files
Artem Gindinson authored and igcbot committed
Explicitly handle LLVM min/max intrinsics in i64 emulation
The existing "default" logic for emulating i64 intrinsic calls is not suitable for LLVM 12+ min/max intrinsics - upon the LLVM 14 switch, any such occurrence out of LLVM's instruction simplification gets lowered into invalid assembly, resulting in GPU hangs. Rework such calls into `cmp` instructions with corresponding predicates, deferring the resulting `cmp`/`select` sequence to the dedicated `InstExpander::` visitors. Additionally, add a LIT case for `llvm.abs.i64` emulation as a follow-up on commit 8863ed6.
1 parent 2791a2f commit 7a7d735

File tree

2 files changed

+239
-16
lines changed

2 files changed

+239
-16
lines changed

IGC/Compiler/CISACodeGen/Emu64OpsPass.cpp

Lines changed: 64 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1924,8 +1924,21 @@ bool InstExpander::visitCall(CallInst& Call) {
19241924
IGC_ASSERT(nullptr != Emu);
19251925

19261926
const Function* F = Call.getCalledFunction();
1927+
bool doInt64BitCall = Emu->isInt64(&Call);
1928+
if (!doInt64BitCall) {
1929+
for (auto& Op : Call.operands()) {
1930+
if (Emu->isInt64(Op.get())) {
1931+
doInt64BitCall = true;
1932+
break;
1933+
}
1934+
}
1935+
}
1936+
if (!doInt64BitCall) {
1937+
return false;
1938+
}
19271939
if (F && F->isDeclaration()) {
1928-
switch (F->getIntrinsicID()) {
1940+
Intrinsic::ID IntrID = F->getIntrinsicID();
1941+
switch (IntrID) {
19291942
default:
19301943
break;
19311944
// Ignore the following intrinsics in CG.
@@ -1945,9 +1958,6 @@ bool InstExpander::visitCall(CallInst& Call) {
19451958
// emulate @llvm.abs.i64
19461959
case Intrinsic::abs:
19471960
{
1948-
if (!Emu->isInt64(&Call))
1949-
return false;
1950-
19511961
Value* OldVal = Call.getArgOperand(0);
19521962
Value* Lo = nullptr, * Hi = nullptr;
19531963
std::tie(Lo, Hi) = Emu->getExpandedValues(OldVal);
@@ -1967,23 +1977,61 @@ bool InstExpander::visitCall(CallInst& Call) {
19671977
Emu->setExpandedValues(&Call, SelectLo, SelectHo);
19681978
return true;
19691979
}
1970-
#endif
1980+
// emulate LLVM min/max intrinsics
1981+
case Intrinsic::smax:
1982+
case Intrinsic::smin:
1983+
case Intrinsic::umax:
1984+
case Intrinsic::umin:
1985+
{
1986+
// The least significant halves' comparison is dependent on that
1987+
// for the most significant halves, so we gain nothing by lowering
1988+
// this into i32 min/max calls. Basic cmp/sel sequence should
1989+
// suffice
1990+
const DenseMap<Intrinsic::ID, CmpInst::Predicate> CmpPredMap {
1991+
{Intrinsic::smax, CmpInst::Predicate::ICMP_SGT},
1992+
{Intrinsic::smin, CmpInst::Predicate::ICMP_SLT},
1993+
{Intrinsic::umax, CmpInst::Predicate::ICMP_UGT},
1994+
{Intrinsic::umin, CmpInst::Predicate::ICMP_ULT}
1995+
};
1996+
Value* LHS = Call.getArgOperand(0), * RHS = Call.getArgOperand(1);
1997+
// FIXME: Note that we aren't producing expanded/emulated values
1998+
// here, but rather replacing the call uses with the result of a
1999+
// newly generated i64 instruction. To make that work, 2 criteria
2000+
// should be satisfied from the perspective of Emu64Ops::expandInsts
2001+
// algorithm:
2002+
// 1. Inst-over-BB iterators cannot be invalidated
2003+
// 2. Due to averse inst-over-BB iteration order, the cmp/sel
2004+
// sequence must be inserted after the current min/max call,
2005+
// before its first use - regardless of the fact that the call
2006+
// itself will be unlinked from those uses and marked for
2007+
// deletion.
2008+
// For 1, we're entirely relying on IRBuilder's internal validation
2009+
// of instruction numbering within the BB. For 2, we're basically
2010+
// exploiting the knowledge that the inst-over-BB iteration in the
2011+
// parent method strictly heeds the averse order.
2012+
// TODO: Instead of hacking the iteration logic from within the
2013+
// helper InstExpander method, we should encapsulate this use-case
2014+
// (inserting new i64 insts into the emulation queue) at the
2015+
// Emu64Ops class level. One of the options is implementing a util
2016+
// akin to LLVM's InstructionWorklist, which would support averse
2017+
// iteration order and handle the deferred instructions upon their
2018+
// creation. Such a worklist class might have its use in a broader
2019+
// set of IGC passes, hence implementing this a "global" IGC util
2020+
// could be an idea.
2021+
IRB->SetInsertPoint(&*std::next(BasicBlock::iterator(Call)));
2022+
auto* Cmp = cast<Instruction>(
2023+
IRB->CreateICmp(CmpPredMap.lookup(IntrID), LHS, RHS));
2024+
Call.replaceAllUsesWith(IRB->CreateSelect(Cmp, LHS, RHS));
2025+
return true;
19712026
}
1972-
}
1973-
bool doInt64BitCall = Emu->isInt64(&Call);
1974-
if (!doInt64BitCall) {
1975-
for (auto& Op : Call.operands()) {
1976-
if (Emu->isInt64(Op.get())) {
1977-
doInt64BitCall = true;
1978-
break;
1979-
}
2027+
#endif
19802028
}
19812029
}
1982-
if (!doInt64BitCall) {
1983-
return false;
1984-
}
19852030

19862031
// Recreate Call with its operands/result emulated
2032+
// TODO: Investigate whether we should replace the call with two
2033+
// i32-operating calls for Lo and Hi instead (at least for certain
2034+
// intrinsics)
19872035
auto* CallCopy = Call.clone();
19882036
IGC_ASSERT(nullptr != CallCopy);
19892037
CallCopy->insertBefore(&Call);

IGC/Compiler/tests/Emu64Ops/calls.ll

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2023 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
;
9+
; REQUIRES: llvm-14-plus
10+
;
11+
; RUN: igc_opt --platformdg2 --enable-debugify --igc-emu64ops -S < %s 2>&1 | FileCheck %s
12+
; ------------------------------------------------
13+
; Emu64Ops
14+
; ------------------------------------------------
15+
16+
; Debug-info related check
17+
; CHECK-NOT: WARNING
18+
; CHECK: CheckModuleDebugify: PASS
19+
20+
; CHECK-LABEL: @test_abs(
21+
; CHECK: %[[CAST:.+]] = bitcast i64 %arg to <2 x i32>
22+
; CHECK: %[[ARG_LO:.+]] = extractelement <2 x i32> %[[CAST]], i32 0
23+
; CHECK: %[[ARG_HI:.+]] = extractelement <2 x i32> %[[CAST]], i32 1
24+
;
25+
; CHECK: %[[COND_NEG:.+]] = icmp slt i32 %[[ARG_HI]], 0
26+
; CHECK: %[[NEGATE:.+]] = call { i32, i32 } @llvm.genx.GenISA.sub.pair(
27+
; CHECK-SAME: i32 0, i32 0, i32 %[[ARG_LO]], i32 %[[ARG_HI]])
28+
; CHECK: %[[NEG_LO:.+]] = extractvalue { i32, i32 } %[[NEGATE]], 0
29+
; CHECK: %[[NEG_HI:.+]] = extractvalue { i32, i32 } %[[NEGATE]], 1
30+
;
31+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[COND_NEG]], i32 %[[NEG_LO]], i32 %[[ARG_LO]]
32+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[COND_NEG]], i32 %[[NEG_HI]], i32 %[[ARG_HI]]
33+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
34+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
35+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
36+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
37+
; CHECK: ret void
38+
define void @test_abs(i64 %arg) {
39+
%1 = call i64 @llvm.abs.i64(i64 %arg, i1 false)
40+
call void @use.i64(i64 %1)
41+
ret void
42+
}
43+
44+
; CHECK-LABEL: @test_smax(
45+
; CHECK: %[[CAST_LHS:.+]] = bitcast i64 %argL to <2 x i32>
46+
; CHECK: %[[LHS_LO:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 0
47+
; CHECK: %[[LHS_HI:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 1
48+
; CHECK: %[[CAST_RHS:.+]] = bitcast i64 %argR to <2 x i32>
49+
; CHECK: %[[RHS_LO:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 0
50+
; CHECK: %[[RHS_HI:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 1
51+
;
52+
; COM: Comparing LSBs in case MSB halves are equal
53+
; CHECK: %[[CMP_LO:.+]] = icmp ugt i32 %[[LHS_LO]], %[[RHS_LO]]
54+
; CHECK: %[[CMP_EQ_HI:.+]] = icmp eq i32 %[[LHS_HI]], %[[RHS_HI]]
55+
; CHECK: %[[COND_LO:.+]] = and i1 %[[CMP_EQ_HI]], %[[CMP_LO]]
56+
; COM: Comparing signed MSBs - sgt
57+
; CHECK: %[[COND_HI:.+]] = icmp sgt i32 %[[LHS_HI]], %[[RHS_HI]]
58+
; CHECK: %[[RES_COND:.+]] = or i1 %[[COND_LO]], %[[COND_HI]]
59+
;
60+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_LO]], i32 %[[RHS_LO]]
61+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_HI]], i32 %[[RHS_HI]]
62+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
63+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
64+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
65+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
66+
; CHECK: ret void
67+
define void @test_smax(i64 %argL, i64 %argR) {
68+
%1 = call i64 @llvm.smax.i64(i64 %argL, i64 %argR)
69+
call void @use.i64(i64 %1)
70+
ret void
71+
}
72+
73+
; CHECK-LABEL: @test_smin(
74+
; CHECK: %[[CAST_LHS:.+]] = bitcast i64 %argL to <2 x i32>
75+
; CHECK: %[[LHS_LO:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 0
76+
; CHECK: %[[LHS_HI:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 1
77+
; CHECK: %[[CAST_RHS:.+]] = bitcast i64 %argR to <2 x i32>
78+
; CHECK: %[[RHS_LO:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 0
79+
; CHECK: %[[RHS_HI:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 1
80+
;
81+
; COM: Comparing LSBs in case MSB halves are equal
82+
; CHECK: %[[CMP_LO:.+]] = icmp ult i32 %[[LHS_LO]], %[[RHS_LO]]
83+
; CHECK: %[[CMP_EQ_HI:.+]] = icmp eq i32 %[[LHS_HI]], %[[RHS_HI]]
84+
; CHECK: %[[COND_LO:.+]] = and i1 %[[CMP_EQ_HI]], %[[CMP_LO]]
85+
; COM: Comparing signed MSBs - slt
86+
; CHECK: %[[COND_HI:.+]] = icmp slt i32 %[[LHS_HI]], %[[RHS_HI]]
87+
; CHECK: %[[RES_COND:.+]] = or i1 %[[COND_LO]], %[[COND_HI]]
88+
;
89+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_LO]], i32 %[[RHS_LO]]
90+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_HI]], i32 %[[RHS_HI]]
91+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
92+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
93+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
94+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
95+
; CHECK: ret void
96+
define void @test_smin(i64 %argL, i64 %argR) {
97+
%1 = call i64 @llvm.smin.i64(i64 %argL, i64 %argR)
98+
call void @use.i64(i64 %1)
99+
ret void
100+
}
101+
102+
; CHECK-LABEL: @test_umax(
103+
; CHECK: %[[CAST_LHS:.+]] = bitcast i64 %argL to <2 x i32>
104+
; CHECK: %[[LHS_LO:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 0
105+
; CHECK: %[[LHS_HI:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 1
106+
; CHECK: %[[CAST_RHS:.+]] = bitcast i64 %argR to <2 x i32>
107+
; CHECK: %[[RHS_LO:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 0
108+
; CHECK: %[[RHS_HI:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 1
109+
;
110+
; COM: Comparing LSBs in case MSB halves are equal
111+
; CHECK: %[[CMP_LO:.+]] = icmp ugt i32 %[[LHS_LO]], %[[RHS_LO]]
112+
; CHECK: %[[CMP_EQ_HI:.+]] = icmp eq i32 %[[LHS_HI]], %[[RHS_HI]]
113+
; CHECK: %[[COND_LO:.+]] = and i1 %[[CMP_EQ_HI]], %[[CMP_LO]]
114+
; COM: Comparing unsigned MSBs - ugt
115+
; CHECK: %[[COND_HI:.+]] = icmp ugt i32 %[[LHS_HI]], %[[RHS_HI]]
116+
; CHECK: %[[RES_COND:.+]] = or i1 %[[COND_LO]], %[[COND_HI]]
117+
;
118+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_LO]], i32 %[[RHS_LO]]
119+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_HI]], i32 %[[RHS_HI]]
120+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
121+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
122+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
123+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
124+
; CHECK: ret void
125+
define void @test_umax(i64 %argL, i64 %argR) {
126+
%1 = call i64 @llvm.umax.i64(i64 %argL, i64 %argR)
127+
call void @use.i64(i64 %1)
128+
ret void
129+
}
130+
131+
; CHECK-LABEL: @test_umin(
132+
; CHECK: %[[CAST_LHS:.+]] = bitcast i64 %argL to <2 x i32>
133+
; CHECK: %[[LHS_LO:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 0
134+
; CHECK: %[[LHS_HI:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 1
135+
; CHECK: %[[CAST_RHS:.+]] = bitcast i64 %argR to <2 x i32>
136+
; CHECK: %[[RHS_LO:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 0
137+
; CHECK: %[[RHS_HI:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 1
138+
;
139+
; COM: Comparing LSBs in case MSB halves are equal
140+
; CHECK: %[[CMP_LO:.+]] = icmp ult i32 %[[LHS_LO]], %[[RHS_LO]]
141+
; CHECK: %[[CMP_EQ_HI:.+]] = icmp eq i32 %[[LHS_HI]], %[[RHS_HI]]
142+
; CHECK: %[[COND_LO:.+]] = and i1 %[[CMP_EQ_HI]], %[[CMP_LO]]
143+
; COM: Comparing unsigned MSBs - ult
144+
; CHECK: %[[COND_HI:.+]] = icmp ult i32 %[[LHS_HI]], %[[RHS_HI]]
145+
; CHECK: %[[RES_COND:.+]] = or i1 %[[COND_LO]], %[[COND_HI]]
146+
;
147+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_LO]], i32 %[[RHS_LO]]
148+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_HI]], i32 %[[RHS_HI]]
149+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
150+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
151+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
152+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
153+
; CHECK: ret void
154+
define void @test_umin(i64 %argL, i64 %argR) {
155+
%1 = call i64 @llvm.umin.i64(i64 %argL, i64 %argR)
156+
call void @use.i64(i64 %1)
157+
ret void
158+
}
159+
160+
declare i64 @llvm.abs.i64(i64, i1)
161+
declare i64 @llvm.smax.i64(i64, i64)
162+
declare i64 @llvm.smin.i64(i64, i64)
163+
declare i64 @llvm.umax.i64(i64, i64)
164+
declare i64 @llvm.umin.i64(i64, i64)
165+
declare void @use.i64(i64)
166+
167+
!igc.functions = !{!0, !3, !4, !5, !6}
168+
169+
!0 = !{void (i64)* @test_abs, !1}
170+
!1 = !{!2}
171+
!2 = !{!"function_type", i32 0}
172+
!3 = !{void (i64, i64)* @test_smax, !1}
173+
!4 = !{void (i64, i64)* @test_smin, !1}
174+
!5 = !{void (i64, i64)* @test_umax, !1}
175+
!6 = !{void (i64, i64)* @test_umin, !1}

0 commit comments

Comments
 (0)