Skip to content

Commit 8d2757f

Browse files
vsemenov368 authored and igcbot committed
Enable double and half precision UGM atomic emulation in VC
Emulate double- and half-precision UGM atomic instructions (FADD, FSUB, FCAS, FMIN, FMAX) in VC by leveraging the integer compare-exchange instruction.
1 parent 104f49f commit 8d2757f

File tree

7 files changed

+379
-38
lines changed

7 files changed

+379
-38
lines changed

IGC/VectorCompiler/lib/BiF/Library/Atomics/Local/binop.cpp

Lines changed: 189 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -15,37 +15,39 @@ namespace {
1515
// Value are taken from LSC_OP enum
1616
// Source/visa/include/visa_igc_common_header.h
1717
enum class AtomicOp : char {
18+
Inc = 0x08,
19+
Dec = 0x09,
20+
Load = 0x0A,
21+
Xchg = 0x0B,
1822
Add = 0x0C,
1923
Sub = 0x0D,
20-
Xchg = 0x0B,
21-
And = 0x18,
22-
Or = 0x19,
23-
Xor = 0x1A,
2424
SMin = 0x0E,
2525
SMax = 0x0F,
2626
UMin = 0x10,
2727
UMax = 0x11,
28-
Dec = 0x09,
29-
Inc = 0x08,
30-
Load = 0x0A,
28+
Cas = 0x12,
29+
Fadd = 0x13,
30+
Fsub = 0x14,
31+
Fmin = 0x15,
32+
Fmax = 0x16,
33+
Fcas = 0x17,
34+
And = 0x18,
35+
Or = 0x19,
36+
Xor = 0x1A,
3137
};
3238

3339
template <int N>
3440
CM_NODEBUG CM_INLINE vector<uint64_t, N>
35-
__impl_atomic_local_binop(mask<N> pred, AtomicOp op, char l1cachecontrol,
36-
char l3cachecontrol, int base, vector<int, N> index,
37-
short scale, int offset, vector<uint64_t, N> src,
38-
vector<uint64_t, N> passthru) {
41+
__impl_atomic_local(mask<N> pred, AtomicOp op, char l1cachecontrol,
42+
char l3cachecontrol, int base, vector<int, N> index,
43+
short scale, int offset, vector<uint64_t, N> src1,
44+
vector<uint64_t, N> src2, vector<uint64_t, N> passthru) {
3945
vector<int, N> addr = base + index * scale + offset;
4046
vector<uint64_t, N> laddr = addr;
4147
vector<uint64_t, N> orig =
4248
detail::__cm_cl_gather(3, laddr.cl_vector(), sizeof(uint64_t),
4349
pred.cl_vector(), passthru.cl_vector());
4450

45-
// Value should be equal to LSC_ATOMIC_ICAS from
46-
// Source/visa/include/visa_igc_common_header.h
47-
constexpr char OpcodeICAS = 0x12;
48-
4951
// Value should be equal to LSC_ADDR_SIZE_32b from
5052
// Source/visa/include/visa_igc_common_header.h
5153
constexpr char AddrSize = 2;
@@ -58,38 +60,38 @@ __impl_atomic_local_binop(mask<N> pred, AtomicOp op, char l1cachecontrol,
5860
vector<uint64_t, N> newval = orig;
5961
switch (op) {
6062
case AtomicOp::Add:
61-
newval += src;
63+
newval += src1;
6264
break;
6365
case AtomicOp::Sub:
64-
newval -= src;
66+
newval -= src1;
6567
break;
6668
case AtomicOp::And:
67-
newval &= src;
69+
newval &= src1;
6870
break;
6971
case AtomicOp::Or:
70-
newval |= src;
72+
newval |= src1;
7173
break;
7274
case AtomicOp::Xor:
73-
newval ^= src;
75+
newval ^= src1;
7476
break;
7577
case AtomicOp::Xchg:
76-
newval = src;
78+
newval = src1;
7779
break;
7880
case AtomicOp::SMin: {
79-
vector<int64_t, N> ssrc = src.template format<int64_t>();
81+
vector<int64_t, N> ssrc1 = src1.template format<int64_t>();
8082
vector<int64_t, N> snewval = newval.template format<int64_t>();
81-
newval.merge(src, ssrc < snewval);
83+
newval.merge(src1, ssrc1 < snewval);
8284
} break;
8385
case AtomicOp::SMax: {
84-
vector<int64_t, N> ssrc = src.template format<int64_t>();
86+
vector<int64_t, N> ssrc1 = src1.template format<int64_t>();
8587
vector<int64_t, N> snewval = newval.template format<int64_t>();
86-
newval.merge(src, ssrc > snewval);
88+
newval.merge(src1, ssrc1 > snewval);
8789
} break;
8890
case AtomicOp::UMin:
89-
newval.merge(src, src < newval);
91+
newval.merge(src1, src1 < newval);
9092
break;
9193
case AtomicOp::UMax:
92-
newval.merge(src, src > newval);
94+
newval.merge(src1, src1 > newval);
9395
break;
9496
case AtomicOp::Inc:
9597
newval = newval + 1;
@@ -104,32 +106,184 @@ __impl_atomic_local_binop(mask<N> pred, AtomicOp op, char l1cachecontrol,
104106
}
105107

106108
vector<uint64_t, N> res = detail::__cm_cl_vector_atomic_slm(
107-
pred.cl_vector(), OpcodeICAS, AddrSize, DataSize, l1cachecontrol,
108-
l3cachecontrol, 0, addr.cl_vector(), 1, 0, orig.cl_vector(),
109-
newval.cl_vector(), orig.cl_vector());
109+
pred.cl_vector(), static_cast<char>(AtomicOp::Cas), AddrSize, DataSize,
110+
l1cachecontrol, l3cachecontrol, 0, addr.cl_vector(), 1, 0,
111+
orig.cl_vector(), newval.cl_vector(), orig.cl_vector());
110112
pred &= res != orig;
111113
orig = res;
112114
} while (pred.any());
113115

114116
return orig;
115117
}
116118

119+
template <int N>
120+
CM_NODEBUG CM_INLINE vector<double, N>
121+
__impl_atomic_global(mask<N> pred, AtomicOp op, char l1cachecontrol,
122+
char l3cachecontrol, long base, vector<long, N> index,
123+
short scale, int offset, vector<double, N> src1,
124+
vector<double, N> src2, vector<double, N> passthru) {
125+
vector<long, N> addr = base + index * scale + offset;
126+
vector<uint64_t, N> laddr = addr;
127+
vector<double, N> orig =
128+
detail::__cm_cl_gather(1, laddr.cl_vector(), sizeof(double),
129+
pred.cl_vector(), passthru.cl_vector());
130+
131+
// Value should be equal to LSC_ADDR_SIZE_64b from
132+
// Source/visa/include/visa_igc_common_header.h
133+
constexpr char AddrSize = 3;
134+
135+
// Value should be equal to LSC_DATA_SIZE_64b from
136+
// Source/visa/include/visa_igc_common_header.h
137+
constexpr char DataSize = 4;
138+
139+
do {
140+
vector<double, N> newval = orig;
141+
switch (op) {
142+
case AtomicOp::Fadd:
143+
newval += src1;
144+
break;
145+
case AtomicOp::Fsub:
146+
newval -= src1;
147+
break;
148+
case AtomicOp::Fmin:
149+
newval.merge(src1, src1 < newval);
150+
break;
151+
case AtomicOp::Fmax:
152+
newval.merge(src1, src1 > newval);
153+
break;
154+
case AtomicOp::Fcas:
155+
newval.merge(src2, src1 == newval);
156+
break;
157+
default:
158+
break;
159+
}
160+
161+
vector<uint64_t, N> iorig = orig.template format<uint64_t>();
162+
vector<uint64_t, N> inewval = newval.template format<uint64_t>();
163+
164+
vector<uint64_t, N> res = detail::__cm_cl_vector_atomic_ugm(
165+
pred.cl_vector(), static_cast<char>(AtomicOp::Cas), AddrSize, DataSize,
166+
l1cachecontrol, l3cachecontrol, 0, addr.cl_vector(), 1, 0,
167+
iorig.cl_vector(), inewval.cl_vector(), iorig.cl_vector());
168+
vector<double, N> fres = res.template format<double>();
169+
pred &= fres != orig;
170+
orig = fres;
171+
} while (pred.any());
172+
173+
return orig;
174+
}
175+
176+
template <int N>
177+
CM_NODEBUG CM_INLINE vector<uint32_t, N>
178+
__impl_atomic_global(mask<N> pred, AtomicOp op, char l1cachecontrol,
179+
char l3cachecontrol, long base, vector<long, N> index,
180+
short scale, int offset, vector<uint32_t, N> src1,
181+
vector<uint32_t, N> src2, vector<uint32_t, N> passthru) {
182+
vector<long, N> addr = base + index * scale + offset;
183+
vector<uint64_t, N> laddr = addr;
184+
185+
vector<half, N> hpassthru =
186+
passthru.template format<half>().template select<N, 2>(0);
187+
188+
vector<half, N> orig =
189+
detail::__cm_cl_gather(1, laddr.cl_vector(), sizeof(half),
190+
pred.cl_vector(), hpassthru.cl_vector());
191+
192+
vector<uint32_t, N> iorig;
193+
194+
// Value should be equal to LSC_ADDR_SIZE_64b from
195+
// Source/visa/include/visa_igc_common_header.h
196+
constexpr char AddrSize = 3;
197+
198+
// Value should be equal to LSC_DATA_SIZE_16c32b from
199+
// Source/visa/include/visa_igc_common_header.h
200+
constexpr char DataSize = 6;
201+
202+
vector<half, N> hsrc = src1.template format<half>().template select<N, 2>(0);
203+
204+
do {
205+
vector<half, N> newval = orig;
206+
switch (op) {
207+
case AtomicOp::Fadd:
208+
newval += hsrc;
209+
break;
210+
case AtomicOp::Fsub:
211+
newval -= hsrc;
212+
break;
213+
default:
214+
break;
215+
}
216+
217+
iorig.template format<half>().template select<N, 2>(0) = orig;
218+
219+
vector<uint32_t, N> inewval;
220+
inewval.template format<half>().template select<N, 2>(0) = newval;
221+
222+
vector<uint32_t, N> res = detail::__cm_cl_vector_atomic_ugm(
223+
pred.cl_vector(), static_cast<char>(AtomicOp::Cas), AddrSize, DataSize,
224+
l1cachecontrol, l3cachecontrol, 0, addr.cl_vector(), 1, 0,
225+
iorig.cl_vector(), inewval.cl_vector(), iorig.cl_vector());
226+
227+
vector<half, N> hres = res.template format<half>().template select<N, 2>(0);
228+
229+
pred &= hres != orig;
230+
orig = hres;
231+
} while (pred.any());
232+
233+
return iorig;
234+
}
235+
117236
} // namespace
118237

119238
#define ATOMIC(WIDTH) \
120239
CM_NODEBUG CM_INLINE extern "C" cl_vector<uint64_t, WIDTH> \
121240
__vc_builtin_atomic_slm_v##WIDTH##i64( \
122241
cl_vector<char, WIDTH> pred, AtomicOp op, char l1cachecontrol, \
123242
char l3cachecontrol, int base, cl_vector<int, WIDTH> index, \
124-
short scale, int offset, cl_vector<uint64_t, WIDTH> src, \
243+
short scale, int offset, cl_vector<uint64_t, WIDTH> src1, \
244+
cl_vector<uint64_t, WIDTH> src2, \
125245
cl_vector<uint64_t, WIDTH> passthru) { \
126246
mask<WIDTH> vpred{pred}; \
127247
vector<int, WIDTH> vindex{index}; \
128-
vector<uint64_t, WIDTH> vsrc{src}; \
248+
vector<uint64_t, WIDTH> vsrc1{src1}; \
249+
vector<uint64_t, WIDTH> vsrc2{src2}; \
129250
vector<uint64_t, WIDTH> vpassthru{passthru}; \
130-
return __impl_atomic_local_binop<WIDTH>(vpred, op, l1cachecontrol, \
131-
l3cachecontrol, base, vindex, \
132-
scale, offset, vsrc, vpassthru) \
251+
return __impl_atomic_local<WIDTH>(vpred, op, l1cachecontrol, \
252+
l3cachecontrol, base, vindex, scale, \
253+
offset, vsrc1, vsrc2, vpassthru) \
254+
.cl_vector(); \
255+
} \
256+
CM_NODEBUG CM_INLINE extern "C" cl_vector<double, WIDTH> \
257+
__vc_builtin_atomic_ugm_v##WIDTH##f64( \
258+
cl_vector<char, WIDTH> pred, AtomicOp op, char l1cachecontrol, \
259+
char l3cachecontrol, long base, cl_vector<long, WIDTH> index, \
260+
short scale, int offset, cl_vector<double, WIDTH> src1, \
261+
cl_vector<double, WIDTH> src2, cl_vector<double, WIDTH> passthru) { \
262+
mask<WIDTH> vpred{pred}; \
263+
vector<long, WIDTH> vindex{index}; \
264+
vector<double, WIDTH> vsrc1{src1}; \
265+
vector<double, WIDTH> vsrc2{src2}; \
266+
vector<double, WIDTH> vpassthru{passthru}; \
267+
return __impl_atomic_global<WIDTH>(vpred, op, l1cachecontrol, \
268+
l3cachecontrol, base, vindex, scale, \
269+
offset, vsrc1, vsrc2, vpassthru) \
270+
.cl_vector(); \
271+
} \
272+
CM_NODEBUG CM_INLINE extern "C" cl_vector<uint32_t, WIDTH> \
273+
__vc_builtin_atomic_ugm_v##WIDTH##i32( \
274+
cl_vector<char, WIDTH> pred, AtomicOp op, char l1cachecontrol, \
275+
char l3cachecontrol, long base, cl_vector<long, WIDTH> index, \
276+
short scale, int offset, cl_vector<uint32_t, WIDTH> src1, \
277+
cl_vector<uint32_t, WIDTH> src2, \
278+
cl_vector<uint32_t, WIDTH> passthru) { \
279+
mask<WIDTH> vpred{pred}; \
280+
vector<long, WIDTH> vindex{index}; \
281+
vector<uint32_t, WIDTH> vsrc1{src1}; \
282+
vector<uint32_t, WIDTH> vsrc2{src2}; \
283+
vector<uint32_t, WIDTH> vpassthru{passthru}; \
284+
return __impl_atomic_global<WIDTH>(vpred, op, l1cachecontrol, \
285+
l3cachecontrol, base, vindex, scale, \
286+
offset, vsrc1, vsrc2, vpassthru) \
133287
.cl_vector(); \
134288
}
135289

IGC/VectorCompiler/lib/GenXCodeGen/GenX.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,11 @@ def FeatureInstrLocalIntegerCas64: SubtargetFeature<"localintegercas64",
105105
"true",
106106
"support of local 64-bit integer compare exchange instruction">;
107107

108+
def FeatureInstrGlobalAtomicAddF64: SubtargetFeature<"globaldoubleaddsub",
109+
"HasGlobalAtomicAddF64",
110+
"true",
111+
"support of global double precision atomic add/sub instructions">;
112+
108113
def FeatureHWTIDFromPredef: SubtargetFeature<"hwtidfrompredef",
109114
"GetsHWTIDFromPredef",
110115
"true",
@@ -406,6 +411,7 @@ def : Proc<"XeHPC", [
406411
FeatureIndirectGRFCrossing,
407412
FeatureInstr64BitRotate,
408413
FeatureInstrAdd64,
414+
FeatureInstrGlobalAtomicAddF64,
409415
FeatureInstrLocalIntegerCas64,
410416
FeatureLSCMaxWidth32,
411417
FeatureLongLong,

IGC/VectorCompiler/lib/GenXCodeGen/GenXBuiltinFunctions.cpp

Lines changed: 43 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ class GenXBuiltinFunctions : public ModulePass,
7474
Value *createLibraryCall(Instruction &I, Function *Func,
7575
ArrayRef<Value *> Args);
7676

77+
bool isHandleUgmAtomics(const CallInst &II) const;
78+
7779
const GenXSubtarget *ST = nullptr;
7880
BuiltinFunctionKind Kind;
7981
};
@@ -277,10 +279,31 @@ Value *GenXBuiltinFunctions::visitURem(BinaryOperator &I) {
277279
return createLibraryCall(I, Func, {I.getOperand(0), I.getOperand(1)});
278280
}
279281

282+
bool GenXBuiltinFunctions::isHandleUgmAtomics(const CallInst &II) const {
283+
auto *Ty = II.getType();
284+
auto *Opcode = cast<ConstantInt>(II.getArgOperand(1));
285+
auto *VTy = cast<IGCLLVM::FixedVectorType>(Ty);
286+
auto *ETy = VTy->getElementType();
287+
switch (Opcode->getZExtValue()) {
288+
case LSC_ATOMIC_FADD:
289+
case LSC_ATOMIC_FSUB:
290+
return (ETy->isDoubleTy() && !ST->hasGlobalAtomicAddF64()) ||
291+
cast<ConstantInt>(II.getArgOperand(3))->getZExtValue() ==
292+
LSC_DATA_SIZE_16c32b;
293+
case LSC_ATOMIC_FMIN:
294+
case LSC_ATOMIC_FMAX:
295+
case LSC_ATOMIC_FCAS:
296+
return ETy->isDoubleTy();
297+
default:
298+
return false;
299+
}
300+
}
301+
280302
Value *GenXBuiltinFunctions::visitCallInst(CallInst &II) {
281303
auto IID = vc::getAnyIntrinsicID(&II);
282304
auto *Ty = II.getType();
283305
auto &M = *II.getModule();
306+
IRBuilder<> Builder(&II);
284307
Function *Func = nullptr;
285308
SmallVector<Value *, 2> Args(II.args());
286309

@@ -315,7 +338,6 @@ Value *GenXBuiltinFunctions::visitCallInst(CallInst &II) {
315338
} break;
316339

317340
case vc::InternalIntrinsic::lsc_atomic_slm: {
318-
IRBuilder<> Builder(&II);
319341
auto *Opcode = cast<ConstantInt>(II.getArgOperand(1));
320342
if (Opcode->getZExtValue() == LSC_ATOMIC_ICAS)
321343
return nullptr;
@@ -332,8 +354,26 @@ Value *GenXBuiltinFunctions::visitCallInst(CallInst &II) {
332354
Args.clear();
333355
Args.push_back(Mask);
334356
Args.push_back(Opcode);
335-
std::copy(II.arg_begin() + 4, II.arg_end() - 2, std::back_inserter(Args));
336-
Args.push_back(II.getArgOperand(12));
357+
std::copy(II.arg_begin() + 4, II.arg_end(), std::back_inserter(Args));
358+
} break;
359+
360+
case vc::InternalIntrinsic::lsc_atomic_ugm: {
361+
if (!isHandleUgmAtomics(II))
362+
return nullptr;
363+
364+
auto *Opcode = cast<ConstantInt>(II.getArgOperand(1));
365+
auto *VTy = cast<IGCLLVM::FixedVectorType>(Ty);
366+
367+
Func = getBuiltinDeclaration(M, "atomic_ugm", false, {VTy});
368+
369+
auto *MaskVTy = IGCLLVM::FixedVectorType::get(Builder.getInt8Ty(),
370+
VTy->getNumElements());
371+
auto *Mask = Builder.CreateZExt(II.getArgOperand(0), MaskVTy);
372+
373+
Args.clear();
374+
Args.push_back(Mask);
375+
Args.push_back(Opcode);
376+
std::copy(II.arg_begin() + 4, II.arg_end(), std::back_inserter(Args));
337377
} break;
338378

339379
default:

0 commit comments

Comments
 (0)