Skip to content

Commit 7f50139

Browse files
authored
[SYCL][ESIMD] Update ESIMD support for split barrier, slm load and dp4a. (#2917)
1 parent f621a20 commit 7f50139

File tree

7 files changed

+258
-15
lines changed

7 files changed

+258
-15
lines changed

llvm/lib/SYCLLowerIR/LowerESIMD.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -289,10 +289,10 @@ class ESIMDIntrinDescTable {
289289
{"media.st", {a(0), aSI(1), a(2), a(3), a(4), a(5), a(6)}}},
290290
{"slm_fence", {"fence", {a(0)}}},
291291
{"barrier", {"barrier", {}}},
292+
{"sbarrier", {"sbarrier", {a(0)}}},
292293
{"block_read", {"oword.ld.unaligned", {c32(0), aSI(0), a(1)}}},
293294
{"block_write", {"oword.st", {aSI(0), a(1), a(2)}}},
294-
{"slm_block_read",
295-
{"oword.ld.unaligned", {c32(0), c32(SLM_BTI), a(0)}}},
295+
{"slm_block_read", {"oword.ld", {c32(0), c32(SLM_BTI), a(0)}}},
296296
{"slm_block_write", {"oword.st", {c32(SLM_BTI), a(0), a(1)}}},
297297
{"slm_read",
298298
{"gather.scaled",
@@ -372,7 +372,14 @@ class ESIMDIntrinDescTable {
372372
{"cos", {"cos", {a(0)}}},
373373
{"pow", {"pow", {a(0), a(1)}}},
374374
{"div_ieee", {"ieee.div", {a(0), a(1)}}},
375-
{"dp4a", {"dp4a", {a(0), a(1), a(2)}}},
375+
{"uudp4a", {"uudp4a", {a(0), a(1), a(2)}}},
376+
{"usdp4a", {"usdp4a", {a(0), a(1), a(2)}}},
377+
{"sudp4a", {"sudp4a", {a(0), a(1), a(2)}}},
378+
{"ssdp4a", {"ssdp4a", {a(0), a(1), a(2)}}},
379+
{"uudp4a_sat", {"uudp4a.sat", {a(0), a(1), a(2)}}},
380+
{"usdp4a_sat", {"usdp4a.sat", {a(0), a(1), a(2)}}},
381+
{"sudp4a_sat", {"sudp4a.sat", {a(0), a(1), a(2)}}},
382+
{"ssdp4a_sat", {"ssdp4a.sat", {a(0), a(1), a(2)}}},
376383
{"any", {"any", {ai1(0)}}},
377384
{"all", {"all", {ai1(0)}}},
378385
};

llvm/test/SYCLLowerIR/esimd_lower_intrins.ll

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,120 @@ define dso_local spir_kernel void @FUNC_31() !sycl_explicit_simd !1 {
180180
; CHECK-NEXT: ret void
181181
}
182182

183+
define dso_local spir_func <16 x i32> @FUNC_32() !sycl_explicit_simd !1 {
184+
%a_1 = alloca <16 x i32>
185+
%1 = load <16 x i32>, <16 x i32>* %a_1
186+
%a_2 = alloca <16 x i32>
187+
%2 = load <16 x i32>, <16 x i32>* %a_2
188+
%a_3 = alloca <16 x i32>
189+
%3 = load <16 x i32>, <16 x i32>* %a_3
190+
%ret_val = call spir_func <16 x i32> @_Z14__esimd_uudp4aIjjjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %1, <16 x i32> %2, <16 x i32> %3)
191+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.uudp4a.v16i32.v16i32.v16i32.v16i32(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}})
192+
ret <16 x i32> %ret_val
193+
}
194+
195+
define dso_local spir_func <16 x i32> @FUNC_33() !sycl_explicit_simd !1 {
196+
%a_1 = alloca <16 x i32>
197+
%1 = load <16 x i32>, <16 x i32>* %a_1
198+
%a_2 = alloca <16 x i32>
199+
%2 = load <16 x i32>, <16 x i32>* %a_2
200+
%a_3 = alloca <16 x i32>
201+
%3 = load <16 x i32>, <16 x i32>* %a_3
202+
%ret_val = call spir_func <16 x i32> @_Z14__esimd_usdp4aIjiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %1, <16 x i32> %2, <16 x i32> %3)
203+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.usdp4a.v16i32.v16i32.v16i32.v16i32(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}})
204+
ret <16 x i32> %ret_val
205+
}
206+
207+
define dso_local spir_func <16 x i32> @FUNC_34() !sycl_explicit_simd !1 {
208+
%a_1 = alloca <16 x i32>
209+
%1 = load <16 x i32>, <16 x i32>* %a_1
210+
%a_2 = alloca <16 x i32>
211+
%2 = load <16 x i32>, <16 x i32>* %a_2
212+
%a_3 = alloca <16 x i32>
213+
%3 = load <16 x i32>, <16 x i32>* %a_3
214+
%ret_val = call spir_func <16 x i32> @_Z14__esimd_sudp4aIijjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %1, <16 x i32> %2, <16 x i32> %3)
215+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.sudp4a.v16i32.v16i32.v16i32.v16i32(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}})
216+
ret <16 x i32> %ret_val
217+
}
218+
219+
define dso_local spir_func <16 x i32> @FUNC_35() !sycl_explicit_simd !1 {
220+
%a_1 = alloca <16 x i32>
221+
%1 = load <16 x i32>, <16 x i32>* %a_1
222+
%a_2 = alloca <16 x i32>
223+
%2 = load <16 x i32>, <16 x i32>* %a_2
224+
%a_3 = alloca <16 x i32>
225+
%3 = load <16 x i32>, <16 x i32>* %a_3
226+
%ret_val = call spir_func <16 x i32> @_Z14__esimd_ssdp4aIiiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %1, <16 x i32> %2, <16 x i32> %3)
227+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.ssdp4a.v16i32.v16i32.v16i32.v16i32(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}})
228+
ret <16 x i32> %ret_val
229+
}
230+
231+
define dso_local spir_func <16 x i32> @FUNC_36() !sycl_explicit_simd !1 {
232+
%a_1 = alloca <16 x i32>
233+
%1 = load <16 x i32>, <16 x i32>* %a_1
234+
%a_2 = alloca <16 x i32>
235+
%2 = load <16 x i32>, <16 x i32>* %a_2
236+
%a_3 = alloca <16 x i32>
237+
%3 = load <16 x i32>, <16 x i32>* %a_3
238+
%ret_val = call spir_func <16 x i32> @_Z18__esimd_uudp4a_satIjjjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %1, <16 x i32> %2, <16 x i32> %3)
239+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.uudp4a.sat.v16i32.v16i32.v16i32.v16i32(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}})
240+
ret <16 x i32> %ret_val
241+
}
242+
243+
define dso_local spir_func <16 x i32> @FUNC_37() !sycl_explicit_simd !1 {
244+
%a_1 = alloca <16 x i32>
245+
%1 = load <16 x i32>, <16 x i32>* %a_1
246+
%a_2 = alloca <16 x i32>
247+
%2 = load <16 x i32>, <16 x i32>* %a_2
248+
%a_3 = alloca <16 x i32>
249+
%3 = load <16 x i32>, <16 x i32>* %a_3
250+
%ret_val = call spir_func <16 x i32> @_Z18__esimd_usdp4a_satIjiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %1, <16 x i32> %2, <16 x i32> %3)
251+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.usdp4a.sat.v16i32.v16i32.v16i32.v16i32(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}})
252+
ret <16 x i32> %ret_val
253+
}
254+
255+
define dso_local spir_func <16 x i32> @FUNC_38() !sycl_explicit_simd !1 {
256+
%a_1 = alloca <16 x i32>
257+
%1 = load <16 x i32>, <16 x i32>* %a_1
258+
%a_2 = alloca <16 x i32>
259+
%2 = load <16 x i32>, <16 x i32>* %a_2
260+
%a_3 = alloca <16 x i32>
261+
%3 = load <16 x i32>, <16 x i32>* %a_3
262+
%ret_val = call spir_func <16 x i32> @_Z18__esimd_sudp4a_satIijjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %1, <16 x i32> %2, <16 x i32> %3)
263+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.sudp4a.sat.v16i32.v16i32.v16i32.v16i32(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}})
264+
ret <16 x i32> %ret_val
265+
}
266+
267+
define dso_local spir_func <16 x i32> @FUNC_39() !sycl_explicit_simd !1 {
268+
%a_1 = alloca <16 x i32>
269+
%1 = load <16 x i32>, <16 x i32>* %a_1
270+
%a_2 = alloca <16 x i32>
271+
%2 = load <16 x i32>, <16 x i32>* %a_2
272+
%a_3 = alloca <16 x i32>
273+
%3 = load <16 x i32>, <16 x i32>* %a_3
274+
%ret_val = call spir_func <16 x i32> @_Z18__esimd_ssdp4a_satIiiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %1, <16 x i32> %2, <16 x i32> %3)
275+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <16 x i32> @llvm.genx.ssdp4a.sat.v16i32.v16i32.v16i32.v16i32(<16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}}, <16 x i32> %{{[0-9a-zA-Z_.]+}})
276+
ret <16 x i32> %ret_val
277+
}
278+
279+
define dso_local spir_func <8 x i32> @FUNC_40() !sycl_explicit_simd !1 {
280+
%ret_val = call spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeEj(i32 0)
281+
; CHECK: %{{[0-9a-zA-Z_.]+}} = call <8 x i32> @llvm.genx.oword.ld.v8i32(i32 0, i32 254, i32 0)
282+
ret <8 x i32> %ret_val
283+
}
284+
285+
define dso_local spir_func void @FUNC_41() !sycl_explicit_simd !1 {
286+
call spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 zeroext 1)
287+
; CHECK: call void @llvm.genx.sbarrier(i8 1)
288+
ret void
289+
}
290+
291+
define dso_local spir_func void @FUNC_42() !sycl_explicit_simd !1 {
292+
call spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 zeroext 0)
293+
; CHECK: call void @llvm.genx.sbarrier(i8 0)
294+
ret void
295+
}
296+
183297
declare dso_local spir_func i64 @_Z27__spirv_LocalInvocationId_xv()
184298
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic0ILN2cm3gen14CmAtomicOpTypeE2EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeENS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i16> %1)
185299
declare dso_local spir_func <32 x i32> @_Z20__esimd_flat_atomic1ILN2cm3gen14CmAtomicOpTypeE0EjLi32ELNS1_9CacheHintE0ELS3_0EENS1_13__vector_typeIT0_XT1_EE4typeENS4_IyXT1_EE4typeES7_NS4_ItXT1_EE4typeE(<32 x i64> %0, <32 x i32> %1, <32 x i16> %2)
@@ -211,6 +325,16 @@ declare dso_local spir_func void @_Z25__esimd_media_block_storeIiLi4ELi8E14ocl_i
211325
declare dso_local spir_func <32 x i32> @_Z13__esimd_vloadIiLi32EEN2cm3gen13__vector_typeIT_XT0_EE4typeEPKS5_(<32 x i32> addrspace(4)* %0)
212326
declare dso_local spir_func void @_Z14__esimd_vstoreIfLi16EEvPN2cm3gen13__vector_typeIT_XT0_EE4typeES5_(<16 x float> addrspace(4)* %0, <16 x float> %1)
213327
declare dso_local spir_func void @_ZN2cl4sycl5INTEL3gpu8slm_initEj(i32)
328+
declare dso_local spir_func <16 x i32> @_Z14__esimd_uudp4aIjjjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
329+
declare dso_local spir_func <16 x i32> @_Z14__esimd_usdp4aIjiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
330+
declare dso_local spir_func <16 x i32> @_Z14__esimd_sudp4aIijjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
331+
declare dso_local spir_func <16 x i32> @_Z14__esimd_ssdp4aIiiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
332+
declare dso_local spir_func <16 x i32> @_Z18__esimd_uudp4a_satIjjjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
333+
declare dso_local spir_func <16 x i32> @_Z18__esimd_usdp4a_satIjiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
334+
declare dso_local spir_func <16 x i32> @_Z18__esimd_sudp4a_satIijjjLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
335+
declare dso_local spir_func <16 x i32> @_Z18__esimd_ssdp4a_satIiiiiLi16EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT3_EE4typeENS4_IT0_XT3_EE4typeENS4_IT1_XT3_EE4typeENS4_IT2_XT3_EE4typeE(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2)
336+
declare dso_local spir_func <8 x i32> @_Z22__esimd_slm_block_readIiLi8EEN2cl4sycl5INTEL3gpu11vector_typeIT_XT0_EE4typeEj(i32 %0)
337+
declare dso_local spir_func void @_Z16__esimd_sbarrierN2cl4sycl5INTEL3gpu17EsimdSbarrierTypeE(i8 %0)
214338

215339
attributes #0 = { "genx_byte_offset"="192" "genx_volatile" }
216340

sycl/include/CL/sycl/INTEL/esimd/detail/esimd_math_intrin.hpp

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,44 @@ template <int N>
204204
SYCL_EXTERNAL vector_type_t<uint16_t, N> __esimd_unpack_mask(uint32_t src0);
205205

206206
template <typename T1, typename T2, typename T3, typename T4, int N>
207-
SYCL_EXTERNAL vector_type_t<T1, N> __esimd_dp4a(vector_type_t<T2, N> src0,
208-
vector_type_t<T3, N> src1,
209-
vector_type_t<T4, N> src2);
207+
SYCL_EXTERNAL vector_type_t<T1, N> __esimd_uudp4a(vector_type_t<T2, N> src0,
208+
vector_type_t<T3, N> src1,
209+
vector_type_t<T4, N> src2);
210+
211+
template <typename T1, typename T2, typename T3, typename T4, int N>
212+
SYCL_EXTERNAL vector_type_t<T1, N> __esimd_usdp4a(vector_type_t<T2, N> src0,
213+
vector_type_t<T3, N> src1,
214+
vector_type_t<T4, N> src2);
215+
216+
template <typename T1, typename T2, typename T3, typename T4, int N>
217+
SYCL_EXTERNAL vector_type_t<T1, N> __esimd_sudp4a(vector_type_t<T2, N> src0,
218+
vector_type_t<T3, N> src1,
219+
vector_type_t<T4, N> src2);
220+
221+
template <typename T1, typename T2, typename T3, typename T4, int N>
222+
SYCL_EXTERNAL vector_type_t<T1, N> __esimd_ssdp4a(vector_type_t<T2, N> src0,
223+
vector_type_t<T3, N> src1,
224+
vector_type_t<T4, N> src2);
225+
226+
template <typename T1, typename T2, typename T3, typename T4, int N>
227+
SYCL_EXTERNAL vector_type_t<T1, N>
228+
__esimd_uudp4a_sat(vector_type_t<T2, N> src0, vector_type_t<T3, N> src1,
229+
vector_type_t<T4, N> src2);
230+
231+
template <typename T1, typename T2, typename T3, typename T4, int N>
232+
SYCL_EXTERNAL vector_type_t<T1, N>
233+
__esimd_usdp4a_sat(vector_type_t<T2, N> src0, vector_type_t<T3, N> src1,
234+
vector_type_t<T4, N> src2);
235+
236+
template <typename T1, typename T2, typename T3, typename T4, int N>
237+
SYCL_EXTERNAL vector_type_t<T1, N>
238+
__esimd_sudp4a_sat(vector_type_t<T2, N> src0, vector_type_t<T3, N> src1,
239+
vector_type_t<T4, N> src2);
240+
241+
template <typename T1, typename T2, typename T3, typename T4, int N>
242+
SYCL_EXTERNAL vector_type_t<T1, N>
243+
__esimd_ssdp4a_sat(vector_type_t<T2, N> src0, vector_type_t<T3, N> src1,
244+
vector_type_t<T4, N> src2);
210245

211246
// Reduction functions
212247
template <typename Ty, int N>
@@ -248,7 +283,7 @@ __esimd_dp4(sycl::INTEL::gpu::vector_type_t<Ty, N> v1,
248283

249284
template <typename T>
250285
T extract(const uint32_t &width, const uint32_t &offset, uint32_t src,
251-
uint32_t &sign_extend) {
286+
const uint32_t &sign_extend) {
252287
uint32_t mask = ((1 << width) - 1) << offset;
253288
T ret = (src & mask) >> offset;
254289
if (sign_extend) {
@@ -1089,7 +1124,7 @@ SYCL_EXTERNAL vector_type_t<T1, N> __esimd_dp4a(vector_type_t<T2, N> src0,
10891124

10901125
ret = src1_a * src2_a + src1_b * src2_b + src1_c * src2_c + src1_d * src2_d;
10911126
reta = ret + src0[i];
1092-
retv(i) = EsimdEmulSys::satur<T1>::saturate(reta, sat1);
1127+
retv[i] = EsimdEmulSys::satur<T1>::saturate(reta, sat1);
10931128
}
10941129

10951130
return retv;

sycl/include/CL/sycl/INTEL/esimd/detail/esimd_memory_intrin.hpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,9 @@ __esimd_flat_atomic2(sycl::INTEL::gpu::vector_type_t<uint64_t, N> addrs,
196196
// esimd_barrier, generic group barrier
197197
SYCL_EXTERNAL void __esimd_barrier();
198198

199+
// generic work-group split barrier
200+
SYCL_EXTERNAL void __esimd_sbarrier(sycl::INTEL::gpu::EsimdSbarrierType flag);
201+
199202
// slm_fence sets the SLM read/write order
200203
SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl);
201204

@@ -772,6 +775,8 @@ __esimd_dp4(sycl::INTEL::gpu::vector_type_t<Ty, N> v1,
772775
/// TODO
773776
SYCL_EXTERNAL void __esimd_barrier() {}
774777

778+
SYCL_EXTERNAL void __esimd_sbarrier(sycl::INTEL::gpu::EsimdSbarrierType flag) {}
779+
775780
SYCL_EXTERNAL void __esimd_slm_fence(uint8_t cntl) {}
776781

777782
template <typename Ty, int N>

sycl/include/CL/sycl/INTEL/esimd/esimd_enum.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,14 @@ enum class CacheHint : uint8_t {
104104
ReadInvalidate = 5
105105
};
106106

107+
enum class EsimdSbarrierType : uint8_t {
108+
WAIT = 0, // split barrier wait
109+
SIGNAL = 1 // split barrier signal
110+
};
111+
112+
#define ESIMD_SBARRIER_WAIT EsimdSbarrierType::WAIT
113+
#define ESIMD_SBARRIER_SIGNAL EsimdSbarrierType::SIGNAL
114+
107115
} // namespace gpu
108116

109117
} // namespace INTEL

sycl/include/CL/sycl/INTEL/esimd/esimd_math.hpp

Lines changed: 63 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1314,6 +1314,18 @@ esimd_fbh(T src) {
13141314

13151315
template <typename T = void> simd<uint, 4> esimd_rdtsc();
13161316

1317+
/// \brief DP4A.
1318+
///
1319+
/// @param src0 the first source operand of dp4a operation.
1320+
///
1321+
/// @param src1 the second source operand of dp4a operation.
1322+
///
1323+
/// @param src2 the third source operand of dp4a operation.
1324+
///
1325+
/// @param flag saturation flag, which has default value of GENX_NOSAT.
1326+
///
1327+
/// Returns simd vector of the dp4a operation result.
1328+
///
13171329
template <typename T1, typename T2, typename T3, typename T4, int N>
13181330
ESIMD_NODEBUG ESIMD_INLINE typename sycl::detail::enable_if_t<
13191331
detail::is_dword_type<T1>::value && detail::is_dword_type<T2>::value &&
@@ -1324,10 +1336,57 @@ esimd_dp4a(simd<T2, N> src0, simd<T3, N> src1, simd<T4, N> src2,
13241336
simd<T2, N> Src0 = src0;
13251337
simd<T3, N> Src1 = src1;
13261338
simd<T4, N> Src2 = src2;
1327-
simd<T1, N> Result = __esimd_dp4a<T1>(Src0.data(), Src1.data(), Src2.data());
1328-
if (flag != GENX_SAT)
1329-
return Result;
1330-
return esimd_sat<T1>(Result);
1339+
simd<T1, N> Result;
1340+
1341+
#if defined(__SYCL_DEVICE_ONLY__)
1342+
if (flag == GENX_NOSAT) {
1343+
if constexpr (std::is_unsigned<T1>::value) {
1344+
if constexpr (std::is_unsigned<T2>::value) {
1345+
Result = __esimd_uudp4a<T1, T2, T3, T4, N>(Src0.data(), Src1.data(),
1346+
Src2.data());
1347+
} else {
1348+
Result = __esimd_usdp4a<T1, T2, T3, T4, N>(Src0.data(), Src1.data(),
1349+
Src2.data());
1350+
}
1351+
} else {
1352+
if constexpr (std::is_unsigned<T2>::value) {
1353+
Result = __esimd_sudp4a<T1, T2, T3, T4, N>(Src0.data(), Src1.data(),
1354+
Src2.data());
1355+
} else {
1356+
Result = __esimd_ssdp4a<T1, T2, T3, T4, N>(Src0.data(), Src1.data(),
1357+
Src2.data());
1358+
}
1359+
}
1360+
} else {
1361+
if constexpr (std::is_unsigned<T1>::value) {
1362+
if constexpr (std::is_unsigned<T2>::value) {
1363+
Result = __esimd_uudp4a_sat<T1, T2, T3, T4, N>(Src0.data(), Src1.data(),
1364+
Src2.data());
1365+
} else {
1366+
Result = __esimd_usdp4a_sat<T1, T2, T3, T4, N>(Src0.data(), Src1.data(),
1367+
Src2.data());
1368+
}
1369+
} else {
1370+
if constexpr (std::is_unsigned<T2>::value) {
1371+
Result = __esimd_sudp4a_sat<T1, T2, T3, T4, N>(Src0.data(), Src1.data(),
1372+
Src2.data());
1373+
} else {
1374+
Result = __esimd_ssdp4a_sat<T1, T2, T3, T4, N>(Src0.data(), Src1.data(),
1375+
Src2.data());
1376+
}
1377+
}
1378+
}
1379+
#else
1380+
simd<T2, N> tmp =
1381+
__esimd_dp4a<T1, T2, T3, T4, N>(Src0.data(), Src1.data(), Src2.data());
1382+
1383+
if (flag == GENX_SAT)
1384+
Result = esimd_sat<T1>(tmp);
1385+
else
1386+
Result = convert<T1>(tmp);
1387+
#endif // __SYCL_DEVICE_ONLY__
1388+
1389+
return Result;
13311390
}
13321391

13331392
static auto constexpr ESIMD_CONST_E = 2.71828f;

sycl/include/CL/sycl/INTEL/esimd/esimd_memory.hpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -587,6 +587,11 @@ ESIMD_NODEBUG ESIMD_INLINE
587587
/// \ingroup sycl_esimd
588588
inline ESIMD_NODEBUG void esimd_barrier() { __esimd_barrier(); }
589589

590+
/// Generic work-group split barrier
591+
inline ESIMD_NODEBUG void esimd_sbarrier(EsimdSbarrierType flag) {
592+
__esimd_sbarrier(flag);
593+
}
594+
590595
enum EsimdFenceMask {
591596
ESIMD_GLOBAL_COHERENT_FENCE = 0x1,
592597
ESIMD_L3_FLUSH_INSTRUCTIONS = 0x2,
@@ -660,10 +665,10 @@ ESIMD_INLINE ESIMD_NODEBUG simd<T, n> slm_block_load(uint32_t offset) {
660665
"block size must be whole number of owords");
661666
static_assert(__esimd::isPowerOf2(Sz / __esimd::OWORD),
662667
"block must be 1, 2, 4 or 8 owords long");
663-
static_assert(Sz <= 8 * __esimd::OWORD,
664-
"block size must be at most 8 owords");
668+
static_assert(Sz <= 16 * __esimd::OWORD,
669+
"block size must be at most 16 owords");
665670

666-
return __esimd_slm_block_read<T, n>(offset);
671+
return __esimd_slm_block_read<T, n>(offset >> 4);
667672
}
668673

669674
/// SLM block-store.

0 commit comments

Comments
 (0)