Skip to content

Commit 7a7d735

Browse files
Artem Gindinson authored and igcbot committed
Explicitly handle LLVM min/max intrinsics in i64 emulation
The existing "default" logic for emulating i64 intrinsic calls is not suitable for LLVM 12+ min/max intrinsics - upon the LLVM 14 switch, any such occurrence out of LLVM's instruction simplification gets lowered into invalid assembly, resulting in GPU hangs. Rework such calls into `cmp` instructions with corresponding predicates, deferring the resulting `cmp`/`select` sequence to the dedicated `InstExpander::` visitors. Additionally, add a LIT case for `llvm.abs.i64` emulation as a follow-up on commit 8863ed6.
1 parent 2791a2f commit 7a7d735

File tree

2 files changed

+239
-16
lines changed

2 files changed

+239
-16
lines changed

IGC/Compiler/CISACodeGen/Emu64OpsPass.cpp

Lines changed: 64 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1924,8 +1924,21 @@ bool InstExpander::visitCall(CallInst& Call) {
19241924
IGC_ASSERT(nullptr != Emu);
19251925

19261926
const Function* F = Call.getCalledFunction();
1927+
bool doInt64BitCall = Emu->isInt64(&Call);
1928+
if (!doInt64BitCall) {
1929+
for (auto& Op : Call.operands()) {
1930+
if (Emu->isInt64(Op.get())) {
1931+
doInt64BitCall = true;
1932+
break;
1933+
}
1934+
}
1935+
}
1936+
if (!doInt64BitCall) {
1937+
return false;
1938+
}
19271939
if (F && F->isDeclaration()) {
1928-
switch (F->getIntrinsicID()) {
1940+
Intrinsic::ID IntrID = F->getIntrinsicID();
1941+
switch (IntrID) {
19291942
default:
19301943
break;
19311944
// Ignore the following intrinsics in CG.
@@ -1945,9 +1958,6 @@ bool InstExpander::visitCall(CallInst& Call) {
19451958
// emulate @llvm.abs.i64
19461959
case Intrinsic::abs:
19471960
{
1948-
if (!Emu->isInt64(&Call))
1949-
return false;
1950-
19511961
Value* OldVal = Call.getArgOperand(0);
19521962
Value* Lo = nullptr, * Hi = nullptr;
19531963
std::tie(Lo, Hi) = Emu->getExpandedValues(OldVal);
@@ -1967,23 +1977,61 @@ bool InstExpander::visitCall(CallInst& Call) {
19671977
Emu->setExpandedValues(&Call, SelectLo, SelectHo);
19681978
return true;
19691979
}
1970-
#endif
1980+
// emulate LLVM min/max intrinsics
1981+
case Intrinsic::smax:
1982+
case Intrinsic::smin:
1983+
case Intrinsic::umax:
1984+
case Intrinsic::umin:
1985+
{
1986+
// The least significant halves' comparison is dependent on that
1987+
// for the most significant halves, so we gain nothing by lowering
1988+
// this into i32 min/max calls. Basic cmp/sel sequence should
1989+
// suffice
1990+
const DenseMap<Intrinsic::ID, CmpInst::Predicate> CmpPredMap {
1991+
{Intrinsic::smax, CmpInst::Predicate::ICMP_SGT},
1992+
{Intrinsic::smin, CmpInst::Predicate::ICMP_SLT},
1993+
{Intrinsic::umax, CmpInst::Predicate::ICMP_UGT},
1994+
{Intrinsic::umin, CmpInst::Predicate::ICMP_ULT}
1995+
};
1996+
Value* LHS = Call.getArgOperand(0), * RHS = Call.getArgOperand(1);
1997+
// FIXME: Note that we aren't producing expanded/emulated values
1998+
// here, but rather replacing the call uses with the result of a
1999+
// newly generated i64 instruction. To make that work, 2 criteria
2000+
// should be satisfied from the perspective of Emu64Ops::expandInsts
2001+
// algorithm:
2002+
// 1. Inst-over-BB iterators cannot be invalidated
2003+
// 2. Due to averse inst-over-BB iteration order, the cmp/sel
2004+
// sequence must be inserted after the current min/max call,
2005+
// before its first use - regardless of the fact that the call
2006+
// itself will be unlinked from those uses and marked for
2007+
// deletion.
2008+
// For 1, we're entirely relying on IRBuilder's internal validation
2009+
// of instruction numbering within the BB. For 2, we're basically
2010+
// exploiting the knowledge that the inst-over-BB iteration in the
2011+
// parent method strictly heeds the averse order.
2012+
// TODO: Instead of hacking the iteration logic from within the
2013+
// helper InstExpander method, we should encapsulate this use-case
2014+
// (inserting new i64 insts into the emulation queue) at the
2015+
// Emu64Ops class level. One of the options is implementing a util
2016+
// akin to LLVM's InstructionWorklist, which would support averse
2017+
// iteration order and handle the deferred instructions upon their
2018+
// creation. Such a worklist class might have its use in a broader
2019+
// set of IGC passes, hence implementing this a "global" IGC util
2020+
// could be an idea.
2021+
IRB->SetInsertPoint(&*std::next(BasicBlock::iterator(Call)));
2022+
auto* Cmp = cast<Instruction>(
2023+
IRB->CreateICmp(CmpPredMap.lookup(IntrID), LHS, RHS));
2024+
Call.replaceAllUsesWith(IRB->CreateSelect(Cmp, LHS, RHS));
2025+
return true;
19712026
}
1972-
}
1973-
bool doInt64BitCall = Emu->isInt64(&Call);
1974-
if (!doInt64BitCall) {
1975-
for (auto& Op : Call.operands()) {
1976-
if (Emu->isInt64(Op.get())) {
1977-
doInt64BitCall = true;
1978-
break;
1979-
}
2027+
#endif
19802028
}
19812029
}
1982-
if (!doInt64BitCall) {
1983-
return false;
1984-
}
19852030

19862031
// Recreate Call with its operands/result emulated
2032+
// TODO: Investigate whether we should replace the call with two
2033+
// i32-operating calls for Lo and Hi instead (at least for certain
2034+
// intrinsics)
19872035
auto* CallCopy = Call.clone();
19882036
IGC_ASSERT(nullptr != CallCopy);
19892037
CallCopy->insertBefore(&Call);

IGC/Compiler/tests/Emu64Ops/calls.ll

Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2023 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
;
9+
; REQUIRES: llvm-14-plus
10+
;
11+
; RUN: igc_opt --platformdg2 --enable-debugify --igc-emu64ops -S < %s 2>&1 | FileCheck %s
12+
; ------------------------------------------------
13+
; Emu64Ops
14+
; ------------------------------------------------
15+
16+
; Debug-info related check
17+
; CHECK-NOT: WARNING
18+
; CHECK: CheckModuleDebugify: PASS
19+
20+
; CHECK-LABEL: @test_abs(
21+
; CHECK: %[[CAST:.+]] = bitcast i64 %arg to <2 x i32>
22+
; CHECK: %[[ARG_LO:.+]] = extractelement <2 x i32> %[[CAST]], i32 0
23+
; CHECK: %[[ARG_HI:.+]] = extractelement <2 x i32> %[[CAST]], i32 1
24+
;
25+
; CHECK: %[[COND_NEG:.+]] = icmp slt i32 %[[ARG_HI]], 0
26+
; CHECK: %[[NEGATE:.+]] = call { i32, i32 } @llvm.genx.GenISA.sub.pair(
27+
; CHECK-SAME: i32 0, i32 0, i32 %[[ARG_LO]], i32 %[[ARG_HI]])
28+
; CHECK: %[[NEG_LO:.+]] = extractvalue { i32, i32 } %[[NEGATE]], 0
29+
; CHECK: %[[NEG_HI:.+]] = extractvalue { i32, i32 } %[[NEGATE]], 1
30+
;
31+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[COND_NEG]], i32 %[[NEG_LO]], i32 %[[ARG_LO]]
32+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[COND_NEG]], i32 %[[NEG_HI]], i32 %[[ARG_HI]]
33+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
34+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
35+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
36+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
37+
; CHECK: ret void
38+
define void @test_abs(i64 %arg) {
39+
%1 = call i64 @llvm.abs.i64(i64 %arg, i1 false)
40+
call void @use.i64(i64 %1)
41+
ret void
42+
}
43+
44+
; CHECK-LABEL: @test_smax(
45+
; CHECK: %[[CAST_LHS:.+]] = bitcast i64 %argL to <2 x i32>
46+
; CHECK: %[[LHS_LO:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 0
47+
; CHECK: %[[LHS_HI:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 1
48+
; CHECK: %[[CAST_RHS:.+]] = bitcast i64 %argR to <2 x i32>
49+
; CHECK: %[[RHS_LO:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 0
50+
; CHECK: %[[RHS_HI:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 1
51+
;
52+
; COM: Comparing LSBs in case MSB halves are equal
53+
; CHECK: %[[CMP_LO:.+]] = icmp ugt i32 %[[LHS_LO]], %[[RHS_LO]]
54+
; CHECK: %[[CMP_EQ_HI:.+]] = icmp eq i32 %[[LHS_HI]], %[[RHS_HI]]
55+
; CHECK: %[[COND_LO:.+]] = and i1 %[[CMP_EQ_HI]], %[[CMP_LO]]
56+
; COM: Comparing signed MSBs - sgt
57+
; CHECK: %[[COND_HI:.+]] = icmp sgt i32 %[[LHS_HI]], %[[RHS_HI]]
58+
; CHECK: %[[RES_COND:.+]] = or i1 %[[COND_LO]], %[[COND_HI]]
59+
;
60+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_LO]], i32 %[[RHS_LO]]
61+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_HI]], i32 %[[RHS_HI]]
62+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
63+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
64+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
65+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
66+
; CHECK: ret void
67+
define void @test_smax(i64 %argL, i64 %argR) {
68+
%1 = call i64 @llvm.smax.i64(i64 %argL, i64 %argR)
69+
call void @use.i64(i64 %1)
70+
ret void
71+
}
72+
73+
; CHECK-LABEL: @test_smin(
74+
; CHECK: %[[CAST_LHS:.+]] = bitcast i64 %argL to <2 x i32>
75+
; CHECK: %[[LHS_LO:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 0
76+
; CHECK: %[[LHS_HI:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 1
77+
; CHECK: %[[CAST_RHS:.+]] = bitcast i64 %argR to <2 x i32>
78+
; CHECK: %[[RHS_LO:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 0
79+
; CHECK: %[[RHS_HI:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 1
80+
;
81+
; COM: Comparing LSBs in case MSB halves are equal
82+
; CHECK: %[[CMP_LO:.+]] = icmp ult i32 %[[LHS_LO]], %[[RHS_LO]]
83+
; CHECK: %[[CMP_EQ_HI:.+]] = icmp eq i32 %[[LHS_HI]], %[[RHS_HI]]
84+
; CHECK: %[[COND_LO:.+]] = and i1 %[[CMP_EQ_HI]], %[[CMP_LO]]
85+
; COM: Comparing signed MSBs - slt
86+
; CHECK: %[[COND_HI:.+]] = icmp slt i32 %[[LHS_HI]], %[[RHS_HI]]
87+
; CHECK: %[[RES_COND:.+]] = or i1 %[[COND_LO]], %[[COND_HI]]
88+
;
89+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_LO]], i32 %[[RHS_LO]]
90+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_HI]], i32 %[[RHS_HI]]
91+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
92+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
93+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
94+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
95+
; CHECK: ret void
96+
define void @test_smin(i64 %argL, i64 %argR) {
97+
%1 = call i64 @llvm.smin.i64(i64 %argL, i64 %argR)
98+
call void @use.i64(i64 %1)
99+
ret void
100+
}
101+
102+
; CHECK-LABEL: @test_umax(
103+
; CHECK: %[[CAST_LHS:.+]] = bitcast i64 %argL to <2 x i32>
104+
; CHECK: %[[LHS_LO:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 0
105+
; CHECK: %[[LHS_HI:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 1
106+
; CHECK: %[[CAST_RHS:.+]] = bitcast i64 %argR to <2 x i32>
107+
; CHECK: %[[RHS_LO:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 0
108+
; CHECK: %[[RHS_HI:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 1
109+
;
110+
; COM: Comparing LSBs in case MSB halves are equal
111+
; CHECK: %[[CMP_LO:.+]] = icmp ugt i32 %[[LHS_LO]], %[[RHS_LO]]
112+
; CHECK: %[[CMP_EQ_HI:.+]] = icmp eq i32 %[[LHS_HI]], %[[RHS_HI]]
113+
; CHECK: %[[COND_LO:.+]] = and i1 %[[CMP_EQ_HI]], %[[CMP_LO]]
114+
; COM: Comparing unsigned MSBs - ugt
115+
; CHECK: %[[COND_HI:.+]] = icmp ugt i32 %[[LHS_HI]], %[[RHS_HI]]
116+
; CHECK: %[[RES_COND:.+]] = or i1 %[[COND_LO]], %[[COND_HI]]
117+
;
118+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_LO]], i32 %[[RHS_LO]]
119+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_HI]], i32 %[[RHS_HI]]
120+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
121+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
122+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
123+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
124+
; CHECK: ret void
125+
define void @test_umax(i64 %argL, i64 %argR) {
126+
%1 = call i64 @llvm.umax.i64(i64 %argL, i64 %argR)
127+
call void @use.i64(i64 %1)
128+
ret void
129+
}
130+
131+
; CHECK-LABEL: @test_umin(
132+
; CHECK: %[[CAST_LHS:.+]] = bitcast i64 %argL to <2 x i32>
133+
; CHECK: %[[LHS_LO:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 0
134+
; CHECK: %[[LHS_HI:.+]] = extractelement <2 x i32> %[[CAST_LHS]], i32 1
135+
; CHECK: %[[CAST_RHS:.+]] = bitcast i64 %argR to <2 x i32>
136+
; CHECK: %[[RHS_LO:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 0
137+
; CHECK: %[[RHS_HI:.+]] = extractelement <2 x i32> %[[CAST_RHS]], i32 1
138+
;
139+
; COM: Comparing LSBs in case MSB halves are equal
140+
; CHECK: %[[CMP_LO:.+]] = icmp ult i32 %[[LHS_LO]], %[[RHS_LO]]
141+
; CHECK: %[[CMP_EQ_HI:.+]] = icmp eq i32 %[[LHS_HI]], %[[RHS_HI]]
142+
; CHECK: %[[COND_LO:.+]] = and i1 %[[CMP_EQ_HI]], %[[CMP_LO]]
143+
; COM: Comparing unsigned MSBs - ult
144+
; CHECK: %[[COND_HI:.+]] = icmp ult i32 %[[LHS_HI]], %[[RHS_HI]]
145+
; CHECK: %[[RES_COND:.+]] = or i1 %[[COND_LO]], %[[COND_HI]]
146+
;
147+
; CHECK: %[[SEL_LO:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_LO]], i32 %[[RHS_LO]]
148+
; CHECK: %[[SEL_HI:.+]] = select i1 %[[RES_COND]], i32 %[[LHS_HI]], i32 %[[RHS_HI]]
149+
; CHECK: %[[RES_LO:.+]] = insertelement <2 x i32> undef, i32 %[[SEL_LO]], i32 0
150+
; CHECK: %[[RES_VEC:.+]] = insertelement <2 x i32> %[[RES_LO]], i32 %[[SEL_HI]], i32 1
151+
; CHECK: %[[RES_CAST:.+]] = bitcast <2 x i32> %[[RES_VEC]] to i64
152+
; CHECK: call void @use.i64(i64 %[[RES_CAST]])
153+
; CHECK: ret void
154+
define void @test_umin(i64 %argL, i64 %argR) {
155+
%1 = call i64 @llvm.umin.i64(i64 %argL, i64 %argR)
156+
call void @use.i64(i64 %1)
157+
ret void
158+
}
159+
160+
declare i64 @llvm.abs.i64(i64, i1)
161+
declare i64 @llvm.smax.i64(i64, i64)
162+
declare i64 @llvm.smin.i64(i64, i64)
163+
declare i64 @llvm.umax.i64(i64, i64)
164+
declare i64 @llvm.umin.i64(i64, i64)
165+
declare void @use.i64(i64)
166+
167+
!igc.functions = !{!0, !3, !4, !5, !6}
168+
169+
!0 = !{void (i64)* @test_abs, !1}
170+
!1 = !{!2}
171+
!2 = !{!"function_type", i32 0}
172+
!3 = !{void (i64, i64)* @test_smax, !1}
173+
!4 = !{void (i64, i64)* @test_smin, !1}
174+
!5 = !{void (i64, i64)* @test_umax, !1}
175+
!6 = !{void (i64, i64)* @test_umin, !1}

0 commit comments

Comments
 (0)