Skip to content

Commit ca39f47

Browse files
committed
The following functions are now available in OCL when using the
build option (cl_intel_64bit_global_atomics_placeholder), for A32/A64 with 64-bit values: atomic_add, atomic_sub, atomic_xchg, atomic_min, atomic_max, atomic_inc, atomic_dec, atomic_and, atomic_or, atomic_xor, atomic_cmpxchg. Change-Id: I44e029f735b3de768bcf584f4fe987e6bdbb2cab
1 parent 07b71c0 commit ca39f47

23 files changed

+859
-583
lines changed

IGC/BiFModule/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@
7979

8080
if(NOT EXISTS ${CCLANG_DIR_BINARIES})
8181
set(CCLANG_DIR_BINARIES "${CMAKE_CURRENT_SOURCE_DIR}/../../../../cclang-prebuilt/${_OS_folder}")
82-
endif()
82+
endif()
8383
if(NOT EXISTS ${CCLANG_DIR_BINARIES})
8484
set(CCLANG_DIR_BINARIES "${IGC_BUILD__GFX_DEV_SRC_DIR}/Clang/Prebuilt/${_OS_folder}")
8585
endif()
@@ -89,7 +89,7 @@
8989
string(TOUPPER "${configName}" upperConfigName)
9090
set(locPropName "IMPORTED_LOCATION_${upperConfigName}")
9191
set_property(TARGET CLANG_7Z
92-
PROPERTY "${locPropName}" "${IGC_BUILD__GFX_DEV_SRC_DIR}/Clang/Prebuilt/${_OS_folder}/Release/${_cpuSuffix}/clang.7z"
92+
PROPERTY "${locPropName}" "${CCLANG_DIR_BINARIES}/Release/${_cpuSuffix}/clang.7z"
9393
)
9494
set_property(TARGET CLANG_7Z_OUTPUT
9595
PROPERTY "${locPropName}" "${IGC_TARGET__TOOLS_CLANG_DIR}"
@@ -898,7 +898,7 @@ set(FLAG "")
898898

899899
igc_bif_find_cl_files(IGC_BUILD__BIF_OCL_COMMON_DEPENDS ${IGC_BUILD__BIF_OCL_COMMON_INC_DIRS} "${IGC_OPTION__BIF_SRC_OCL_DIR}/Implementation")
900900

901-
set(KHR_DEFINES "cl_khr_f16" "cl_khrfp64" "cl_khr_gl_msaa_sharing" "cl_khr_mipmap_image" "cl_khr_depth_images" "cl_intel_subgroups_short" "cl_intel_subgroups_char" "cl_intel_subgroups_long")
901+
set(KHR_DEFINES "cl_khr_f16" "cl_khrfp64" "cl_khr_gl_msaa_sharing" "cl_khr_mipmap_image" "cl_khr_depth_images" "cl_intel_subgroups_short" "cl_intel_subgroups_char" "cl_intel_subgroups_long" "cl_intel_64bit_global_atomics_placeholder")
902902

903903
igc_bif_build_bc(
904904
OUTPUT "${IGC_BUILD__BIF_DIR}/IBiF_Impl_int.bc"

IGC/BiFModule/Implementation/IGCBiF_Intrinsics.cl

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,26 @@ float __builtin_IB_atomic_cmpxchg_global_f32(__global float*, float, float);
243243
int __builtin_IB_atomic_cmpxchg_local_i32(__local int*, int, int);
244244
float __builtin_IB_atomic_cmpxchg_local_f32(__local float*, float, float);
245245

246+
// 64bit Atomic operations
247+
#if defined(cl_intel_64bit_global_atomics_placeholder)
248+
long __builtin_IB_atomic_add_global_i64(__global long*, long);
249+
long __builtin_IB_atomic_sub_global_i64(__global long*, long);
250+
long __builtin_IB_atomic_xchg_global_i64(__global long*, long);
251+
long __builtin_IB_atomic_min_global_i64(__global long*, long);
252+
ulong __builtin_IB_atomic_min_global_u64(__global ulong*, ulong);
253+
double __builtin_IB_atomic_min_global_f64(__global double*, double);
254+
long __builtin_IB_atomic_max_global_i64(__global long*, long);
255+
ulong __builtin_IB_atomic_max_global_u64(__global ulong*, ulong);
256+
double __builtin_IB_atomic_max_global_f64(__global double*, double);
257+
long __builtin_IB_atomic_and_global_i64(__global long*, long);
258+
long __builtin_IB_atomic_or_global_i64(__global long*, long);
259+
long __builtin_IB_atomic_xor_global_i64(__global long*, long);
260+
long __builtin_IB_atomic_inc_global_i64(__global long*);
261+
long __builtin_IB_atomic_dec_global_i64(__global long*);
262+
long __builtin_IB_atomic_cmpxchg_global_i64(__global long*, long, long);
263+
double __builtin_IB_atomic_cmpxchg_global_f64(__global double*, double, double);
264+
#endif // defined(cl_intel_64bit_global_atomics_placeholder)
265+
246266

247267
int __builtin_IB_image_atomic_add_i32(int, int4, int);
248268
int __builtin_IB_image_atomic_sub_i32(int, int4, int);

IGC/BiFModule/Languages/OpenCL/IBiF_Atomics.cl

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,12 @@ INLINE float OVERLOADABLE atomic_xchg(__local volatile float *p, float val) {
5454
return as_float( __builtin_IB_atomic_xchg_local_i32( (__local volatile int *)p, as_int(val) ) );
5555
}
5656

57+
#if defined(cl_intel_64bit_global_atomics_placeholder)
58+
// 64-bit exchange for doubles: the value is reinterpreted as a long for the i64 builtin and the previous contents are returned as a double (not narrowed to float).
INLINE double OVERLOADABLE atomic_xchg(__global volatile double *p, double val) {
59+
return as_double( __builtin_IB_atomic_xchg_global_i64( (__global volatile long *)p, as_long(val) ) );
60+
}
61+
#endif // if defined(cl_intel_64bit_global_atomics_placeholder)
62+
5763

5864
#define DEF_ATOMIC_1SRC(KEY, ADDRSPACE, TYPE, IGC_TYPE, IGC_CL_TYPE) \
5965
INLINE TYPE OVERLOADABLE atomic_##KEY(volatile __##ADDRSPACE TYPE *p) { \
@@ -244,6 +250,98 @@ DEF_ATOM_1SRC(dec, local, int, i32, int)
244250
DEF_ATOM_1SRC(dec, local, uint, i32, int)
245251

246252

253+
#if defined(cl_intel_64bit_global_atomics_placeholder)
254+
255+
// atomic_add_64
256+
DEF_ATOMIC_2SRC(add, global, long, i64, long)
257+
DEF_ATOMIC_2SRC(add, global, ulong, i64, long)
258+
259+
// atomic_sub_64
260+
DEF_ATOMIC_SUB(sub, global, long, i64, long)
261+
DEF_ATOMIC_SUB(sub, global, ulong, i64, long)
262+
263+
// atomic_xchg_64
264+
DEF_ATOMIC_2SRC(xchg, global, long, i64, long)
265+
DEF_ATOMIC_2SRC(xchg, global, ulong, i64, long)
266+
267+
// atomic_min_64
268+
DEF_ATOMIC_2SRC(min, global, long, i64, long)
269+
DEF_ATOMIC_2SRC(min, global, ulong, u64, ulong)
270+
271+
// atomic_max_64
272+
DEF_ATOMIC_2SRC(max, global, long, i64, long)
273+
DEF_ATOMIC_2SRC(max, global, ulong, u64, ulong)
274+
275+
// atomic_and_64
276+
DEF_ATOMIC_2SRC(and, global, long, i64, long)
277+
DEF_ATOMIC_2SRC(and, global, ulong, i64, long)
278+
279+
// atomic_or_64
280+
DEF_ATOMIC_2SRC(or, global, long, i64, long)
281+
DEF_ATOMIC_2SRC(or, global, ulong, i64, long)
282+
283+
// atomic_xor_64
284+
DEF_ATOMIC_2SRC(xor, global, long, i64, long)
285+
DEF_ATOMIC_2SRC(xor, global, ulong, i64, long)
286+
287+
// atomic_inc_64
288+
DEF_ATOMIC_1SRC(inc, global, long, i64, long)
289+
DEF_ATOMIC_1SRC(inc, global, ulong, i64, long)
290+
291+
// atomic_dec_64
292+
DEF_ATOMIC_1SRC(dec, global, long, i64, long)
293+
DEF_ATOMIC_1SRC(dec, global, ulong, i64, long)
294+
295+
// atomic_cmpxchg_64
296+
DEF_ATOMIC_3SRC(cmpxchg, global, long, i64, long)
297+
DEF_ATOMIC_3SRC(cmpxchg, global, ulong, i64, long)
298+
299+
// atom_add
300+
DEF_ATOM_2SRC(add, global, long, i64, long)
301+
DEF_ATOM_2SRC(add, global, ulong, i64, long)
302+
303+
// atom_sub
304+
DEF_ATOM_SUB(sub, global, long, i64, long)
305+
DEF_ATOM_SUB(sub, global, ulong, i64, long)
306+
307+
// atom_xchg
308+
DEF_ATOM_2SRC(xchg, global, long, i64, long)
309+
DEF_ATOM_2SRC(xchg, global, ulong, i64, long)
310+
311+
// atom_min
312+
DEF_ATOM_2SRC(min, global, long, i64, long)
313+
DEF_ATOM_2SRC(min, global, ulong, u64, ulong)
314+
315+
// atom_max
316+
DEF_ATOM_2SRC(max, global, long, i64, long)
317+
DEF_ATOM_2SRC(max, global, ulong, u64, ulong)
318+
319+
// atom_and
320+
DEF_ATOM_2SRC(and, global, long, i64, long)
321+
DEF_ATOM_2SRC(and, global, ulong, i64, long)
322+
323+
// atom_or
324+
DEF_ATOM_2SRC(or, global, long, i64, long)
325+
DEF_ATOM_2SRC(or, global, ulong, i64, long)
326+
327+
// atom_xor
328+
DEF_ATOM_2SRC(xor, global, long, i64, long)
329+
DEF_ATOM_2SRC(xor, global, ulong, i64, long)
330+
331+
// atom_inc
332+
DEF_ATOM_1SRC(inc, global, long, i64, long)
333+
DEF_ATOM_1SRC(inc, global, ulong, i64, long)
334+
335+
// atom_cmpxchg
336+
DEF_ATOM_3SRC(cmpxchg, global, long, i64, long)
337+
DEF_ATOM_3SRC(cmpxchg, global, ulong, i64, long)
338+
339+
// atom_dec
340+
DEF_ATOM_1SRC(dec, global, long, i64, long)
341+
DEF_ATOM_1SRC(dec, global, ulong, i64, long)
342+
343+
#endif // if defined(cl_intel_64bit_global_atomics_placeholder)
344+
247345
// The below functions were added because of the clang 4.0 itanium mangling update
248346
// See http://llvm.org/viewvc/llvm-project?view=revision&revision=262414
249347

IGC/BiFModule/Languages/OpenCL/opencl_cth_released.h

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -604,6 +604,36 @@ atomic_flag_prototype(clear, void)
604604

605605
#endif
606606

607+
#if defined(cl_intel_64bit_global_atomics_placeholder)
608+
long __attribute__((overloadable)) atomic_add(volatile __global long *p, long val);
609+
long __attribute__((overloadable)) atomic_sub(volatile __global long *p, long val);
610+
long __attribute__((overloadable)) atomic_xchg(volatile __global long *p, long val);
611+
long __attribute__((overloadable)) atomic_min(volatile __global long *p, long val);
612+
unsigned long __attribute__((overloadable)) atomic_min(volatile __global unsigned long *p, unsigned long val);
613+
long __attribute__((overloadable)) atomic_max(volatile __global long *p, long val);
614+
unsigned long __attribute__((overloadable)) atomic_max(volatile __global unsigned long *p, unsigned long val);
615+
long __attribute__((overloadable)) atomic_and(volatile __global long *p, long val);
616+
long __attribute__((overloadable)) atomic_or(volatile __global long *p, long val);
617+
long __attribute__((overloadable)) atomic_xor(volatile __global long *p, long val);
618+
long __attribute__((overloadable)) atomic_inc(volatile __global long *p); // takes only the pointer operand, matching DEF_ATOMIC_1SRC
619+
long __attribute__((overloadable)) atomic_dec(volatile __global long *p); // takes only the pointer operand, matching DEF_ATOMIC_1SRC
620+
long __attribute__((overloadable)) atomic_cmpxchg(volatile __global long *p, long cmp, long val); // three operands (p, cmp, val), per the OpenCL atomic_cmpxchg signature
621+
622+
long __attribute__((overloadable)) atom_add(volatile __global long *p, long val);
623+
long __attribute__((overloadable)) atom_sub(volatile __global long *p, long val);
624+
long __attribute__((overloadable)) atom_xchg(volatile __global long *p, long val);
625+
long __attribute__((overloadable)) atom_min(volatile __global long *p, long val);
626+
unsigned long __attribute__((overloadable)) atom_min(volatile __global unsigned long *p, unsigned long val);
627+
long __attribute__((overloadable)) atom_max(volatile __global long *p, long val);
628+
unsigned long __attribute__((overloadable)) atom_max(volatile __global unsigned long *p, unsigned long val);
629+
long __attribute__((overloadable)) atom_and(volatile __global long *p, long val);
630+
long __attribute__((overloadable)) atom_or(volatile __global long *p, long val);
631+
long __attribute__((overloadable)) atom_xor(volatile __global long *p, long val);
632+
long __attribute__((overloadable)) atom_inc(volatile __global long *p); // takes only the pointer operand, matching DEF_ATOM_1SRC usage
633+
long __attribute__((overloadable)) atom_dec(volatile __global long *p); // takes only the pointer operand, matching DEF_ATOM_1SRC usage
634+
long __attribute__((overloadable)) atom_cmpxchg(volatile __global long *p, long cmp, long val); // three operands (p, cmp, val), per the OpenCL atom_cmpxchg signature
635+
#endif // defined(cl_intel_64bit_global_atomics_placeholder)
636+
607637
#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
608638
void __attribute__((overloadable)) work_group_barrier(cl_mem_fence_flags flags, memory_scope scope);
609639
void __attribute__((overloadable)) work_group_barrier(cl_mem_fence_flags flags);

IGC/Compiler/CISACodeGen/CISABuilder.cpp

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,36 +77,52 @@ VISAAtomicOps convertAtomicOpEnumToVisa(AtomicOp op)
7777
switch(op)
7878
{
7979
case EATOMIC_AND:
80+
case EATOMIC_AND64:
8081
return ATOMIC_AND;
8182
case EATOMIC_DEC:
83+
case EATOMIC_DEC64:
8284
return ATOMIC_DEC;
8385
case EATOMIC_IADD:
86+
case EATOMIC_IADD64:
8487
return ATOMIC_ADD;
8588
case EATOMIC_IMAX:
89+
case EATOMIC_IMAX64:
8690
return ATOMIC_IMAX;
8791
case EATOMIC_IMIN:
92+
case EATOMIC_IMIN64:
8893
return ATOMIC_IMIN;
8994
case EATOMIC_INC:
95+
case EATOMIC_INC64:
9096
return ATOMIC_INC;
9197
case EATOMIC_MAX:
98+
case EATOMIC_MAX64:
9299
return ATOMIC_MAX;
93100
case EATOMIC_MIN:
101+
case EATOMIC_MIN64:
94102
return ATOMIC_MIN;
95103
case EATOMIC_OR:
104+
case EATOMIC_OR64:
96105
return ATOMIC_OR;
97106
case EATOMIC_SUB:
107+
case EATOMIC_SUB64:
98108
return ATOMIC_SUB;
99109
case EATOMIC_UMAX:
110+
case EATOMIC_UMAX64:
100111
return ATOMIC_MAX;
101112
case EATOMIC_UMIN:
113+
case EATOMIC_UMIN64:
102114
return ATOMIC_MIN;
103115
case EATOMIC_XOR:
116+
case EATOMIC_XOR64:
104117
return ATOMIC_XOR;
105118
case EATOMIC_XCHG:
119+
case EATOMIC_XCHG64:
106120
return ATOMIC_XCHG;
107121
case EATOMIC_CMPXCHG:
122+
case EATOMIC_CMPXCHG64:
108123
return ATOMIC_CMPXCHG;
109124
case EATOMIC_PREDEC:
125+
case EATOMIC_PREDEC64:
110126
return ATOMIC_PREDEC;
111127
case EATOMIC_FMAX:
112128
return ATOMIC_FMAX;
@@ -4933,7 +4949,7 @@ void CEncoder::AtomicRawA64(AtomicOp atomic_op,
49334949
CVariable* offset,
49344950
CVariable* src0,
49354951
CVariable* src1,
4936-
bool is16Bit)
4952+
unsigned short bitwidth)
49374953
{
49384954
// For cmpxchg, we have to change the order of arguments.
49394955
if (atomic_op == EATOMIC_CMPXCHG) {
@@ -4956,7 +4972,7 @@ void CEncoder::AtomicRawA64(AtomicOp atomic_op,
49564972

49574973
V(vKernel->AppendVISASvmAtomicInst(GetFlagOperand(m_encoderState.m_flag),
49584974
SplitEMask(fromExecSize, toExecSize, thePart, execMask),
4959-
toExecSize, atomicOpcode, is16Bit,
4975+
toExecSize, atomicOpcode, bitwidth,
49604976
addressOpnd, src0Opnd, src1Opnd, dstOpnd));
49614977
}
49624978

@@ -4972,7 +4988,7 @@ void CEncoder::AtomicRawA64(AtomicOp atomic_op,
49724988
ConvertMaskToVisaType(m_encoderState.m_mask, m_encoderState.m_noMask),
49734989
visaExecSize(m_encoderState.m_simdSize),
49744990
atomicOpcode,
4975-
is16Bit,
4991+
bitwidth,
49764992
addressOpnd,
49774993
src0Opnd,
49784994
src1Opnd,

IGC/Compiler/CISACodeGen/CISABuilder.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ class CEncoder
155155
CVariable *src1, bool is16Bit = false);
156156
void AtomicRawA64(AtomicOp atomic_op, CVariable *dst,
157157
CVariable *elem_offset, CVariable *src0, CVariable *src1,
158-
bool is16Bit = false);
158+
unsigned short bitwidth);
159159
void Cmp(e_predicate p, CVariable* dst, CVariable* src0, CVariable* src1);
160160
void Select(CVariable* flag, CVariable* dst, CVariable* src0, CVariable* src1);
161161
void GenericAlu(e_opcode opcode, CVariable* dst, CVariable* src0, CVariable* src1, CVariable* src2 = nullptr);

IGC/Compiler/CISACodeGen/EmitVISAPass.cpp

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10022,7 +10022,7 @@ void EmitPass::emitScalarAtomics(
1002210022

1002310023
if(isA64)
1002410024
{
10025-
m_encoder->AtomicRawA64(uniformAtomicOp, pReturnVal, pDstAddr, pFinalAtomicSrcVal, nullptr, is16Bit);
10025+
m_encoder->AtomicRawA64(uniformAtomicOp, pReturnVal, pDstAddr, pFinalAtomicSrcVal, nullptr, is16Bit ? 16 : 32);
1002610026
}
1002710027
else
1002810028
{
@@ -10090,7 +10090,7 @@ bool EmitPass::IsUniformAtomic(llvm::Instruction* pInst)
1009010090

1009110091
CVariable *EmitPass::UnpackOrBroadcastIfUniform(CVariable *pVar)
1009210092
{
10093-
if (pVar->GetElemSize() == 4)
10093+
if (pVar->GetElemSize() == 4 || pVar->GetElemSize() == 8)
1009410094
return BroadcastIfUniform(pVar);
1009510095

1009610096
assert(pVar->GetElemSize() == 2);
@@ -10149,8 +10149,10 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
1014910149
atomic_op = static_cast<AtomicOp>(llvm::cast<llvm::ConstantInt>(pInsn->getOperand(3))->getZExtValue());
1015010150
}
1015110151

10152+
unsigned short bitwidth = pInsn->getType()->getScalarSizeInBits();
1015210153
const bool is16Bit = (pInsn->getType()->getScalarSizeInBits() == 16);
1015310154

10155+
1015410156
// atomic_inc and atomic_dec don't have both src0 and src1.
1015510157
if(atomic_op != EATOMIC_INC && atomic_op != EATOMIC_DEC)
1015610158
{
@@ -10160,8 +10162,8 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
1016010162
// Dst address in bytes.
1016110163
CVariable* pDstAddr = GetSymbol(pllDstAddr);
1016210164
// If DisableScalarAtomics regkey is enabled or DisableIGCOptimizations regkey is enabled then
10163-
// don't enable scalar atomics
10164-
if (IsUniformAtomic(pInsn))
10165+
// don't enable scalar atomics, also do not enable for 64 bit
10166+
if (IsUniformAtomic(pInsn) && bitwidth != 64)
1016510167
{
1016610168
PointerType *PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
1016710169
bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
@@ -10186,15 +10188,29 @@ void EmitPass::emitAtomicRaw(llvm::GenIntrinsicInst* pInsn)
1018610188
else
1018710189
{
1018810190
CVariable* pDst = returnsImmValue ?
10189-
m_currShader->GetNewVariable(numLanes(m_currShader->m_SIMDSize), ISA_TYPE_UD, EALIGN_GRF) :
10191+
m_currShader->GetNewVariable(numLanes(m_currShader->m_SIMDSize), m_destination->GetType(), EALIGN_GRF) :
1019010192
nullptr;
1019110193

1019210194
PointerType *PtrTy = dyn_cast<PointerType>(pllDstAddr->getType());
1019310195
bool isA64 = PtrTy && isA64Ptr(PtrTy, m_currShader->GetContext());
10194-
if(isA64)
10196+
bool extendPointer = (bitwidth == 64 && !isA64);
10197+
if(isA64 || extendPointer)
1019510198
{
10196-
m_encoder->AtomicRawA64(atomic_op, pDst, pDstAddr, pSrc0, pSrc1, is16Bit);
10197-
m_encoder->Push();
10199+
if (extendPointer)
10200+
{
10201+
pDstAddr = m_currShader->BitCast(pDstAddr, GetUnsignedIntegerType(pDstAddr->GetType()));
10202+
CVariable* pDstAddr2 = m_currShader->GetNewVariable(pDstAddr->GetNumberElement(),
10203+
ISA_TYPE_UQ, EALIGN_GRF);
10204+
m_encoder->Cast(pDstAddr2, pDstAddr);
10205+
m_encoder->AtomicRawA64(atomic_op, pDst, pDstAddr2, pSrc0, pSrc1, bitwidth);
10206+
m_encoder->Push();
10207+
}
10208+
else
10209+
{
10210+
m_encoder->AtomicRawA64(atomic_op, pDst, pDstAddr, pSrc0, pSrc1, bitwidth);
10211+
m_encoder->Push();
10212+
}
10213+
1019810214
if (returnsImmValue)
1019910215
{
1020010216
m_encoder->Cast(

0 commit comments

Comments
 (0)