Skip to content

Commit 5741f80

Browse files
mnaczkigcbot
authored and committed
Implement support for LLVM intrinsic CTLZ
Implement support for LLVM intrinsic CTLZ with source other than i32
1 parent c839e26 commit 5741f80

File tree

2 files changed

+257
-1
lines changed

2 files changed

+257
-1
lines changed

IGC/Compiler/Optimizer/OpenCLPasses/ReplaceUnsupportedIntrinsics/ReplaceUnsupportedIntrinsics.cpp

Lines changed: 113 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,9 @@ namespace
9696
void generalGroupI8Stream(
9797
LLVMContext& C, uint32_t NumI8, uint32_t Align,
9898
uint32_t& NumI32, Type** Vecs, uint32_t& L, uint32_t BaseTypeSize);
99+
// support function for replaceCountTheLeadingZeros
100+
Value* evaluateCtlzUpto32bit(IGCLLVM::IRBuilder<>* Builder, Value* inVal, Type* singleElementType, Value* canBePoison);
101+
Value* evaluateCtlz64bit(IGCLLVM::IRBuilder<>* Builder, Value* inVal, Type* singleElementType, Value* canBePoison);
99102

100103
/// replace member function
101104
void replaceMemcpy(IntrinsicInst* I);
@@ -104,6 +107,7 @@ namespace
104107
void replaceExpect(IntrinsicInst* I);
105108
void replaceFunnelShift(IntrinsicInst* I);
106109
void replaceLRound(IntrinsicInst* I);
110+
void replaceCountTheLeadingZeros(IntrinsicInst* I);
107111

108112
static const std::map< Intrinsic::ID, MemFuncPtr_t > m_intrinsicToFunc;
109113
};
@@ -129,7 +133,8 @@ const std::map< Intrinsic::ID, ReplaceUnsupportedIntrinsics::MemFuncPtr_t > Repl
129133
{ Intrinsic::memmove, &ReplaceUnsupportedIntrinsics::replaceMemMove },
130134
{ Intrinsic::expect, &ReplaceUnsupportedIntrinsics::replaceExpect },
131135
{ Intrinsic::lround, &ReplaceUnsupportedIntrinsics::replaceLRound },
132-
{ Intrinsic::llround, &ReplaceUnsupportedIntrinsics::replaceLRound }
136+
{ Intrinsic::llround, &ReplaceUnsupportedIntrinsics::replaceLRound },
137+
{ Intrinsic::ctlz, &ReplaceUnsupportedIntrinsics::replaceCountTheLeadingZeros }
133138
};
134139

135140
ReplaceUnsupportedIntrinsics::ReplaceUnsupportedIntrinsics() : FunctionPass(ID)
@@ -985,6 +990,113 @@ void ReplaceUnsupportedIntrinsics::replaceLRound(IntrinsicInst* I) {
985990
I->eraseFromParent();
986991
}
987992

993+
/*
994+
Replaces llvm.ctlz.* intrinsics (count the leading zeros)
995+
to llvm.ctlz.i32 because we support llvm.ctlz intrinsic
996+
only with source type i32.
997+
998+
E.g.
999+
%1 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %0, i1 false)
1000+
ret <2 x i8> %1
1001+
=>
1002+
%1 = extractelement <2 x i8> %0, i64 0
1003+
%2 = zext i8 %1 to i32
1004+
%3 = call i32 @llvm.ctlz.i32(i32 %2, i1 false)
1005+
%4 = trunc i32 %3 to i8
1006+
%5 = add nsw i8 %4, -24
1007+
%6 = insertelement <2 x i8> undef, i8 %5, i32 0
1008+
%7 = extractelement <2 x i8> %0, i64 1
1009+
%8 = zext i8 %7 to i32
1010+
%9 = call i32 @llvm.ctlz.i32(i32 %8, i1 false)
1011+
%10 = trunc i32 %9 to i8
1012+
%11 = add nsw i8 %10, -24
1013+
%12 = insertelement <2 x i8> %6, i8 %11, i32 1
1014+
%13 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %0, i1 false)
1015+
ret <2 x i8> %12
1016+
*/
1017+
void ReplaceUnsupportedIntrinsics::replaceCountTheLeadingZeros(IntrinsicInst* I) {
1018+
IGC_ASSERT(I->getIntrinsicID() == Intrinsic::ctlz);
1019+
1020+
Type* oldIntrinsicDstType = I->getType();
1021+
Type* singleElementType = oldIntrinsicDstType;
1022+
uint32_t numOfElements = 1;
1023+
bool isVector = oldIntrinsicDstType->isVectorTy();
1024+
1025+
if (isVector)
1026+
{
1027+
auto oldIntrinsicDstTypeFVT = dyn_cast<IGCLLVM::FixedVectorType>(oldIntrinsicDstType);
1028+
numOfElements = (uint32_t)oldIntrinsicDstTypeFVT->getNumElements();
1029+
singleElementType = oldIntrinsicDstTypeFVT->getElementType();
1030+
}
1031+
1032+
int singleElementSizeInBits = singleElementType->getScalarSizeInBits();
1033+
1034+
IGC_ASSERT_MESSAGE(singleElementSizeInBits == 8 || singleElementSizeInBits == 16 ||
1035+
singleElementSizeInBits == 32 || singleElementSizeInBits == 64,
1036+
"Currently for Intrinsic::ctlz we support source bit size: 8,16,32,64");
1037+
1038+
// noting to replace, early return
1039+
if (!isVector && singleElementSizeInBits == 32) return;
1040+
1041+
bool bitSizeLowerThan32 = singleElementSizeInBits < 32;
1042+
bool bitSizeEqual64 = singleElementSizeInBits == 64;
1043+
1044+
IGCLLVM::IRBuilder<> Builder(I);
1045+
1046+
Value* inputVal = I->getArgOperand(0);
1047+
Value* canBePoison = I->getArgOperand(1);
1048+
Value* outputVal = llvm::UndefValue::get(oldIntrinsicDstType); // Will be overwritten in scalar case.
1049+
Value* retVal = inputVal;
1050+
1051+
for (uint32_t i = 0; i < numOfElements; i++)
1052+
{
1053+
if (isVector) retVal = Builder.CreateExtractElement(inputVal, i);
1054+
1055+
if (bitSizeLowerThan32)
1056+
retVal = evaluateCtlzUpto32bit(&Builder, retVal, singleElementType, canBePoison);
1057+
else if (bitSizeEqual64)
1058+
retVal = evaluateCtlz64bit(&Builder, retVal, singleElementType, canBePoison);
1059+
1060+
if (singleElementSizeInBits == 32)
1061+
retVal = Builder.CreateIntrinsic(Intrinsic::ctlz, { Builder.getInt32Ty() }, { retVal, canBePoison });
1062+
1063+
if (isVector)
1064+
outputVal = Builder.CreateInsertElement(outputVal, retVal, Builder.getInt32(i));
1065+
else // for scalar type
1066+
outputVal = retVal;
1067+
}
1068+
I->replaceAllUsesWith(outputVal);
1069+
}
1070+
1071+
/*
  Emits the leading-zero count of a scalar integer narrower than 32 bits
  using llvm.ctlz.i32.

  Builder           - builder positioned where the instructions are inserted
  inVal             - scalar value of type singleElementType
  singleElementType - integer type of inVal and of the returned value
  canBePoison       - second operand of the original llvm.ctlz call,
                      forwarded unchanged to llvm.ctlz.i32

  The value is zero-extended to i32, counted, truncated back to
  singleElementType and corrected by (bit width - 32), i.e. the number of
  zero bits introduced by the extension is subtracted.

  Returns the corrected count as a value of type singleElementType.
*/
Value* ReplaceUnsupportedIntrinsics::evaluateCtlzUpto32bit(IGCLLVM::IRBuilder<>* Builder, Value* inVal, Type* singleElementType, Value* canBePoison) {
    const int sizeInBits = static_cast<int>(singleElementType->getScalarSizeInBits());

    Value* retVal = Builder->CreateZExt(inVal, Builder->getInt32Ty());
    retVal = Builder->CreateIntrinsic(Intrinsic::ctlz, { Builder->getInt32Ty() }, { retVal, canBePoison });
    retVal = Builder->CreateTrunc(retVal, singleElementType);

    // The correction is negative (e.g. -24 for i8). Build it as a signed
    // constant so it does not depend on a negative int being converted to
    // uint64_t and then truncated to the element width.
    auto* constInt = llvm::ConstantInt::getSigned(singleElementType, static_cast<int64_t>(sizeInBits) - 32);
    retVal = Builder->CreateNSWAdd(retVal, constInt);
    return retVal;
}
1080+
1081+
/*
  Emits the leading-zero count of a scalar 64-bit integer using two
  llvm.ctlz.i32 calls, one per 32-bit half.

  Builder           - builder positioned where the instructions are inserted
  inVal             - scalar 64-bit value
  singleElementType - type of the returned value (the i32 result is
                      zero-extended to it)
  canBePoison       - second operand of the original llvm.ctlz call,
                      forwarded unchanged to both llvm.ctlz.i32 calls

  When inVal is below 2^32 (upper half is zero) the result is
  ctlz(lower half) + 32, otherwise it is ctlz(upper half).
*/
Value* ReplaceUnsupportedIntrinsics::evaluateCtlz64bit(IGCLLVM::IRBuilder<>* Builder, Value* inVal, Type* singleElementType, Value* canBePoison) {
    // Leading zeros of the lower 32 bits.
    Value* const lowerHalf = Builder->CreateTrunc(inVal, Builder->getInt32Ty());
    Value* const lowerCount = Builder->CreateIntrinsic(Intrinsic::ctlz, { Builder->getInt32Ty() }, { lowerHalf, canBePoison });

    // Leading zeros of the upper 32 bits.
    Value* const shiftedDown = Builder->CreateLShr(inVal, 32);
    Value* const upperHalf = Builder->CreateTrunc(shiftedDown, Builder->getInt32Ty());
    Value* const upperCount = Builder->CreateIntrinsic(Intrinsic::ctlz, { Builder->getInt32Ty() }, { upperHalf, canBePoison });

    // inVal < 2^32 holds exactly when the upper half is zero.
    auto* const firstValueAbove32Bits = Builder->getInt64(uint64_t{ 1 } << 32);
    Value* const upperHalfIsZero = Builder->CreateICmp(CmpInst::Predicate::ICMP_ULT, inVal, firstValueAbove32Bits);

    // With an all-zero upper half its 32 zero bits are added to the lower count.
    Value* const lowerCountPlus32 = Builder->CreateAdd(lowerCount, Builder->getInt32(32));

    Value* const selected = Builder->CreateSelect(upperHalfIsZero, lowerCountPlus32, upperCount);
    return Builder->CreateZExt(selected, singleElementType);
}
1099+
9881100
void ReplaceUnsupportedIntrinsics::visitIntrinsicInst(IntrinsicInst& I) {
9891101
if (m_intrinsicToFunc.find(I.getIntrinsicID()) != m_intrinsicToFunc.end()) {
9901102
m_instsToReplace.push_back(&I);
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2022 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; RUN: igc_opt -igc-replace-unsupported-intrinsics -verify -S %s -o %t
10+
; RUN: FileCheck %s < %t
11+
12+
define i8 @A0(i8) {
13+
entry:
14+
; CHECK-LABEL: entry:
15+
; CHECK: [[CONV_0:%[a-zA-Z0-9]+]] = zext i8 %0 to i32
16+
; CHECK: [[CALL:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_0]], i1 false)
17+
; CHECK: [[CONV_1:%[a-zA-Z0-9]+]] = trunc i32 [[CALL]] to i8
18+
; CHECK: [[SUB:%[a-zA-Z0-9]+]] = add nsw i8 [[CONV_1]], -24
19+
; CHECK: ret i8 [[SUB]]
20+
%1 = call i8 @llvm.ctlz.i8(i8 %0, i1 false)
21+
ret i8 %1
22+
}
23+
24+
25+
define i16 @A1(i16) {
26+
entry:
27+
; CHECK-LABEL: entry:
28+
; CHECK: [[CONV_0:%[a-zA-Z0-9]+]] = zext i16 %0 to i32
29+
; CHECK: [[CALL:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_0]], i1 false)
30+
; CHECK: [[CONV_1:%[a-zA-Z0-9]+]] = trunc i32 [[CALL]] to i16
31+
; CHECK: [[SUB:%[a-zA-Z0-9]+]] = add nsw i16 [[CONV_1]], -16
32+
; CHECK: ret i16 [[SUB]]
33+
%1 = call i16 @llvm.ctlz.i16(i16 %0, i1 false)
34+
ret i16 %1
35+
}
36+
37+
38+
define i32 @A2(i32) {
39+
entry:
40+
; CHECK-LABEL: entry:
41+
; CHECK: [[CALL:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
42+
; CHECK: ret i32 [[CALL]]
43+
%1 = call i32 @llvm.ctlz.i32(i32 %0, i1 false)
44+
ret i32 %1
45+
}
46+
47+
define <2 x i8> @A3(<2 x i8>) {
48+
entry:
49+
; CHECK-LABEL: entry:
50+
; CHECK: [[EXTRACT_0:%[a-zA-Z0-9]+]] = extractelement <2 x i8> %0, [[INDEX_TYPE_E:i(16|32|64)]] 0
51+
; CHECK: [[CONV_0_0:%[a-zA-Z0-9]+]] = zext i8 [[EXTRACT_0]] to i32
52+
; CHECK: [[CALL_0:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_0_0]], i1 false)
53+
; CHECK: [[CONV_0_1:%[a-zA-Z0-9]+]] = trunc i32 [[CALL_0]] to i8
54+
; CHECK: [[SUB_0:%[a-zA-Z0-9]+]] = add nsw i8 [[CONV_0_1]], -24
55+
; CHECK: [[INSERT_0:%[a-zA-Z0-9]+]] = insertelement <2 x i8> undef, i8 [[SUB_0]], [[INDEX_TYPE_I:i(16|32|64)]] 0
56+
57+
; CHECK: [[EXTRACT_1:%[a-zA-Z0-9]+]] = extractelement <2 x i8> %0, [[INDEX_TYPE_E]] 1
58+
; CHECK: [[CONV_1_0:%[a-zA-Z0-9]+]] = zext i8 [[EXTRACT_1]] to i32
59+
; CHECK: [[CALL_1:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_1_0]], i1 false)
60+
; CHECK: [[CONV_1_1:%[a-zA-Z0-9]+]] = trunc i32 [[CALL_1]] to i8
61+
; CHECK: [[SUB_1:%[a-zA-Z0-9]+]] = add nsw i8 [[CONV_1_1]], -24
62+
; CHECK: [[INSERT_1:%[a-zA-Z0-9]+]] = insertelement <2 x i8> [[INSERT_0]], i8 [[SUB_1]], [[INDEX_TYPE_I]] 1
63+
64+
; CHECK: ret <2 x i8> [[INSERT_1]]
65+
%1 = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %0, i1 false)
66+
ret <2 x i8> %1
67+
}
68+
69+
70+
define <2 x i32> @A4(<2 x i32>) {
71+
entry:
72+
; CHECK-LABEL: entry:
73+
; CHECK: [[EXTRACT_0:%[a-zA-Z0-9]+]] = extractelement <2 x i32> %0, [[INDEX_TYPE_E:i(16|32|64)]] 0
74+
; CHECK: [[CALL_0:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[EXTRACT_0]], i1 false)
75+
; CHECK: [[INSERT_0:%[a-zA-Z0-9]+]] = insertelement <2 x i32> undef, i32 [[CALL_0]], [[INDEX_TYPE_I:i(16|32|64)]] 0
76+
77+
; CHECK: [[EXTRACT_1:%[a-zA-Z0-9]+]] = extractelement <2 x i32> %0, [[INDEX_TYPE_E]] 1
78+
; CHECK: [[CALL_1:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[EXTRACT_1]], i1 false)
79+
; CHECK: [[INSERT_1:%[a-zA-Z0-9]+]] = insertelement <2 x i32> [[INSERT_0]], i32 [[CALL_1]], [[INDEX_TYPE_I]] 1
80+
81+
; CHECK: ret <2 x i32> [[INSERT_1]]
82+
%1 = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %0, i1 false)
83+
ret <2 x i32> %1
84+
}
85+
86+
define i64 @A5(i64) {
87+
entry:
88+
; CHECK-LABEL: entry:
89+
; CHECK: [[CONV_0:%[a-zA-Z0-9]+]] = trunc i64 %0 to i32
90+
; CHECK: [[CALL_0:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_0]], i1 false)
91+
; CHECK: [[SHR:%[a-zA-Z0-9]+]] = lshr i64 %0, 32
92+
; CHECK: [[CONV_1:%[a-zA-Z0-9]+]] = trunc i64 [[SHR]] to i32
93+
; CHECK: [[CALL_1:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_1]], i1 false)
94+
; CHECK: [[CMP:%[a-zA-Z0-9]+]] = icmp ult i64 %0, 4294967296
95+
; CHECK: [[ADD:%[a-zA-Z0-9]+]] = add i32 [[CALL_0]], 32
96+
; CHECK: [[SELECT:%[a-zA-Z0-9]+]] = select i1 [[CMP]], i32 [[ADD]], i32 [[CALL_1]]
97+
; CHECK: [[CONV_3:%[a-zA-Z0-9]+]] = zext i32 [[SELECT]] to i64
98+
; CHECK: ret i64 [[CONV_3]]
99+
%1 = call i64 @llvm.ctlz.i64(i64 %0, i1 false)
100+
ret i64 %1
101+
}
102+
103+
104+
define <2 x i64> @A6(<2 x i64>) {
105+
entry:
106+
; CHECK-LABEL: entry:
107+
; CHECK: [[EXTRACT_0:%[a-zA-Z0-9]+]] = extractelement <2 x i64> %0, [[INDEX_TYPE_E:i(16|32|64)]] 0
108+
; CHECK: [[CONV_0_0:%[a-zA-Z0-9]+]] = trunc i64 [[EXTRACT_0]] to i32
109+
; CHECK: [[CALL_0_0:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_0_0]], i1 false)
110+
; CHECK: [[SHR_0:%[a-zA-Z0-9]+]] = lshr i64 [[EXTRACT_0]], 32
111+
; CHECK: [[CONV_0_1:%[a-zA-Z0-9]+]] = trunc i64 [[SHR_0]] to i32
112+
; CHECK: [[CALL_0_1:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_0_1]], i1 false)
113+
; CHECK: [[CMP_0:%[a-zA-Z0-9]+]] = icmp ult i64 [[EXTRACT_0]], 4294967296
114+
; CHECK: [[ADD_0:%[a-zA-Z0-9]+]] = add i32 [[CALL_0_0]], 32
115+
; CHECK: [[SELECT_0:%[a-zA-Z0-9]+]] = select i1 [[CMP_0]], i32 [[ADD_0]], i32 [[CALL_0_1]]
116+
; CHECK: [[CONV_0_3:%[a-zA-Z0-9]+]] = zext i32 [[SELECT_0]] to i64
117+
; CHECK: [[INSERT_0:%[a-zA-Z0-9]+]] = insertelement <2 x i64> undef, i64 [[CONV_0_3]], [[INDEX_TYPE_I:i(16|32|64)]] 0
118+
119+
; CHECK: [[EXTRACT_1:%[a-zA-Z0-9]+]] = extractelement <2 x i64> %0, [[INDEX_TYPE_E]] 1
120+
; CHECK: [[CONV_1_0:%[a-zA-Z0-9]+]] = trunc i64 [[EXTRACT_1]] to i32
121+
; CHECK: [[CALL_1_0:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_1_0]], i1 false)
122+
; CHECK: [[SHR_1:%[a-zA-Z0-9]+]] = lshr i64 [[EXTRACT_1]], 32
123+
; CHECK: [[CONV_1_1:%[a-zA-Z0-9]+]] = trunc i64 [[SHR_1]] to i32
124+
; CHECK: [[CALL_1_1:%[a-zA-Z0-9]+]] = call i32 @llvm.ctlz.i32(i32 [[CONV_1_1]], i1 false)
125+
; CHECK: [[CMP_1:%[a-zA-Z0-9]+]] = icmp ult i64 [[EXTRACT_1]], 4294967296
126+
; CHECK: [[ADD_1:%[a-zA-Z0-9]+]] = add i32 [[CALL_1_0]], 32
127+
; CHECK: [[SELECT_1:%[a-zA-Z0-9]+]] = select i1 [[CMP_1]], i32 [[ADD_1]], i32 [[CALL_1_1]]
128+
; CHECK: [[CONV_1_3:%[a-zA-Z0-9]+]] = zext i32 [[SELECT_1]] to i64
129+
; CHECK: [[INSERT_1:%[a-zA-Z0-9]+]] = insertelement <2 x i64> [[INSERT_0]], i64 [[CONV_1_3]], [[INDEX_TYPE_I]] 1
130+
131+
; CHECK: ret <2 x i64> [[INSERT_1]]
132+
%1 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %0, i1 false)
133+
ret <2 x i64> %1
134+
}
135+
136+
137+
138+
declare i8 @llvm.ctlz.i8(i8, i1)
139+
declare i16 @llvm.ctlz.i16(i16, i1)
140+
declare i32 @llvm.ctlz.i32(i32, i1)
141+
declare i64 @llvm.ctlz.i64(i64, i1)
142+
declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1)
143+
declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1)
144+
declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1)

0 commit comments

Comments
 (0)