Skip to content

Commit 3333cff

Browse files
vmustyaigcbot
authored and committed
Predicate promotion into scalar when all and any are not used
VC should promote predicate operations into scalar ones even for fused EU targets, as long as the `genx.any` and `genx.all` intrinsics are not among the consumers of those predicate operations.
1 parent 1b69857 commit 3333cff

File tree

2 files changed

+86
-6
lines changed

2 files changed

+86
-6
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXPromotePredicate.cpp

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -227,8 +227,8 @@ static void foldBitcast(BitCastInst *Cast) {
227227
class PredicateWeb {
228228
public:
229229
template <class InputIt>
230-
PredicateWeb(InputIt First, InputIt Last, bool AllowScalar)
231-
: Web(First, Last), AllowScalarPromotion(AllowScalar) {}
230+
PredicateWeb(InputIt First, InputIt Last, bool AllowScalarAllAny)
231+
: Web(First, Last), AllowScalarAllAny(AllowScalarAllAny) {}
232232
void print(llvm::raw_ostream &O) const {
233233
for (auto Inst : Web)
234234
O << *Inst << '\n';
@@ -241,10 +241,20 @@ class PredicateWeb {
241241
return NumBinaryOps >= LogicOpsThreshold;
242242
}
243243
void doPromotion() const {
244+
auto AllowScalar = true;
245+
if (!AllowScalarAllAny)
246+
AllowScalar = llvm::none_of(Web, [](auto *Inst) {
247+
return llvm::any_of(Inst->users(), [](auto *U) {
248+
auto IID = vc::getAnyIntrinsicID(U);
249+
return IID == GenXIntrinsic::genx_any ||
250+
IID == GenXIntrinsic::genx_all;
251+
});
252+
});
253+
244254
// Do promotion.
245255
SmallVector<Instruction *, 8> Worklist;
246256
for (auto *Inst : Web) {
247-
auto *PromotedInst = promoteInst(Inst, AllowScalarPromotion);
257+
auto *PromotedInst = promoteInst(Inst, AllowScalar);
248258

249259
if (isa<TruncInst>(PromotedInst) || isa<BitCastInst>(PromotedInst))
250260
Worklist.push_back(cast<Instruction>(PromotedInst));
@@ -262,7 +272,7 @@ class PredicateWeb {
262272

263273
private:
264274
SmallPtrSet<Instruction *, 16> Web;
265-
bool AllowScalarPromotion;
275+
bool AllowScalarAllAny;
266276
};
267277

268278
constexpr const char IdxMDName[] = "pred.index";
@@ -285,7 +295,7 @@ bool GenXPromotePredicate::runOnFunction(Function &F) {
285295
auto &ST = getAnalysis<TargetPassConfig>()
286296
.getTM<GenXTargetMachine>()
287297
.getGenXSubtarget();
288-
bool AllowScalarPromotion = !ST.hasFusedEU();
298+
bool AllowScalarAllAny = !ST.hasFusedEU();
289299

290300
// Put every predicate instruction into its own equivalence class.
291301
long Idx = 0;
@@ -318,7 +328,7 @@ bool GenXPromotePredicate::runOnFunction(Function &F) {
318328
if (!I->isLeader())
319329
continue;
320330
PredicateWeb Web(PredicateWebs.member_begin(I), PredicateWebs.member_end(),
321-
AllowScalarPromotion);
331+
AllowScalarAllAny);
322332
LLVM_DEBUG(dbgs() << "Predicate web:\n"; Web.dump());
323333
++NumCollectedPredicateWebs;
324334
if (!Web.isBeneficialToPromote())
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; RUN: %opt %use_old_pass_manager% -GenXPromotePredicate -march=genx64 -mtriple=spir64-unknown-unknown \
10+
; RUN: -mcpu=Gen9 -logical-ops-threshold=2 -S < %s | FileCheck %s
11+
12+
; RUN: %opt %use_old_pass_manager% -GenXPromotePredicate -march=genx64 -mtriple=spir64-unknown-unknown \
13+
; RUN: -mcpu=XeHPG -logical-ops-threshold=2 -S < %s | FileCheck %s
14+
15+
; RUN: %opt %use_old_pass_manager% -GenXPromotePredicate -march=genx64 -mtriple=spir64-unknown-unknown \
16+
; RUN: -mcpu=XeHPC -logical-ops-threshold=2 -S < %s | FileCheck %s
17+
18+
; CHECK-LABEL: f_f
19+
; CHECK-DAG: [[LESSEQUAL_A_LOAD_widened:%.*]] = bitcast <8 x i1> %lessequal_a_load_ to i8
20+
; CHECK-DAG: [[EQUAL_A_LOAD5_widened:%.*]] = bitcast <8 x i1> %equal_a_load5_ to i8
21+
; CHECK-DAG: [[LOGICAL_AND_promoted:%.*]] = and i8 [[LESSEQUAL_A_LOAD_widened]], [[EQUAL_A_LOAD5_widened]]
22+
; CHECK-DAG: [[LOGICAL_AND:%.*]] = bitcast i8 [[LOGICAL_AND_promoted]] to <8 x i1>
23+
; CHECK-DAG: call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[LOGICAL_AND]])
24+
; CHECK-DAG: [[RETURNED_LANES_MEMORY_0_promoted:%.*]] = phi i8 [ [[LOGICAL_AND_promoted]], %safe_if_run_true.safe_if_after_true_crit_edge ], [ 0, %allocas.safe_if_after_true_crit_edge ]
25+
; CHECK-DAG: [[NEG_RETURNED_LANES_promoted:%.*]] = xor i8 [[RETURNED_LANES_MEMORY_0_promoted]], -1
26+
; CHECK-DAG: [[NEG_RETURNED_LANES:%.*]] = bitcast i8 [[NEG_RETURNED_LANES_promoted]] to <8 x i1>
27+
; CHECK-DAG: call void @llvm.genx.svm.scatter.v8i1.v8i64.v8f32(<8 x i1> [[NEG_RETURNED_LANES]], i32 0, <8 x i64> %new_offsets.i.i34, <8 x float> zeroinitializer)
28+
; CHECK-DAG: icmp eq i8 [[LOGICAL_AND_promoted]], -1
29+
30+
declare void @llvm.genx.svm.scatter.v8i1.v8i64.v8f32(<8 x i1>, i32, <8 x i64>, <8 x float>)
31+
declare <8 x float> @llvm.genx.svm.block.ld.unaligned.v8f32.i64(i64)
32+
declare void @llvm.genx.svm.block.st.i64.v8f32(i64, <8 x float>)
33+
34+
declare i1 @llvm.vector.reduce.or.v8i1(<8 x i1>)
35+
36+
define dllexport spir_kernel void @f_f(float* nocapture %RET, float* %aFOO, i64 %privBase) {
37+
allocas:
38+
%svm_ld_ptrtoint = ptrtoint float* %aFOO to i64
39+
%aFOO_load_ptr2int_2void2021_masked_load22 = call <8 x float> @llvm.genx.svm.block.ld.unaligned.v8f32.i64(i64 %svm_ld_ptrtoint)
40+
%lessequal_a_load_ = fcmp ole <8 x float> %aFOO_load_ptr2int_2void2021_masked_load22, <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
41+
%equal_a_load5_ = fcmp oeq <8 x float> %aFOO_load_ptr2int_2void2021_masked_load22, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
42+
%logical_and = and <8 x i1> %lessequal_a_load_, %equal_a_load5_
43+
%v.i = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> %logical_and)
44+
%ptr_to_int.i.i31 = ptrtoint float* %RET to i64
45+
%base.i.i32 = insertelement <8 x i64> undef, i64 %ptr_to_int.i.i31, i32 0
46+
%shuffle.i.i33 = shufflevector <8 x i64> %base.i.i32, <8 x i64> undef, <8 x i32> zeroinitializer
47+
%new_offsets.i.i34 = add <8 x i64> %shuffle.i.i33, <i64 0, i64 4, i64 8, i64 12, i64 16, i64 20, i64 24, i64 28>
48+
br i1 %v.i, label %safe_if_run_true, label %allocas.safe_if_after_true_crit_edge
49+
50+
allocas.safe_if_after_true_crit_edge:
51+
br label %safe_if_after_true
52+
53+
safe_if_after_true:
54+
%returned_lanes_memory.0 = phi <8 x i1> [ %logical_and, %safe_if_run_true.safe_if_after_true_crit_edge ], [ zeroinitializer, %allocas.safe_if_after_true_crit_edge ]
55+
%"~returned_lanes" = xor <8 x i1> %returned_lanes_memory.0, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
56+
call void @llvm.genx.svm.scatter.v8i1.v8i64.v8f32(<8 x i1> %"~returned_lanes", i32 0, <8 x i64> %new_offsets.i.i34, <8 x float> zeroinitializer)
57+
ret void
58+
59+
safe_if_run_true:
60+
call void @llvm.genx.svm.scatter.v8i1.v8i64.v8f32(<8 x i1> %logical_and, i32 0, <8 x i64> %new_offsets.i.i34, <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>)
61+
%v.i30 = bitcast <8 x i1> %logical_and to i8
62+
%"equal__old_mask|returned_lanes" = icmp eq i8 %v.i30, -1
63+
br i1 %"equal__old_mask|returned_lanes", label %do_return, label %safe_if_run_true.safe_if_after_true_crit_edge
64+
65+
safe_if_run_true.safe_if_after_true_crit_edge:
66+
br label %safe_if_after_true
67+
68+
do_return:
69+
ret void
70+
}

0 commit comments

Comments
 (0)