Skip to content

Commit 92f6edc

Browse files
committed
Merge branch 'sycl' into get_native_br
2 parents e20666b + cee7110 commit 92f6edc

File tree

6 files changed

+89
-39
lines changed

6 files changed

+89
-39
lines changed

clang/test/SemaSYCL/intel-fpga-global-const.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
// RUN: %clang_cc1 -fsycl -fsycl-is-device -fsyntax-only -ast-dump -verify -pedantic %s | FileCheck %s
1+
// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -ast-dump -verify -pedantic %s | FileCheck %s
22

33
// Test that checks global constant variable (which allows the redeclaration) since
44
// IntelFPGAConstVar is one of the subjects listed for [[intel::max_replicates()]] attribute.

sycl/include/CL/sycl/handler.hpp

Lines changed: 32 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -731,14 +731,37 @@ class __SYCL_EXPORT handler {
731731

732732
// FIXME Remove the ESIMD check once rounding of execution range works well
733733
// with ESIMD compilation flow.
734+
// Range rounding can be disabled by the user.
735+
// Range rounding is not done on the host device.
734736
// Range rounding is supported only for newer SYCL standards.
735-
// Range rounding can also be disabled by the user.
736737
#if !defined(__SYCL_EXPLICIT_SIMD__) && \
737738
!defined(SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING) && \
738-
SYCL_LANGUAGE_VERSION >= 202001
739-
// The work group size preferred by this device.
740-
// A reasonable choice for rounding up the range is 32.
741-
constexpr size_t GoodLocalSizeX = 32;
739+
!defined(DPCPP_HOST_DEVICE_OPENMP) && \
740+
!defined(DPCPP_HOST_DEVICE_PERF_NATIVE) && SYCL_LANGUAGE_VERSION >= 202001
741+
// Range should be a multiple of this for reasonable performance.
742+
size_t MinFactorX = 16;
743+
// Range should be a multiple of this for improved performance.
744+
size_t GoodFactorX = 32;
745+
// Range should be at least this to make rounding worthwhile.
746+
size_t MinRangeX = 1024;
747+
748+
// Parse optional parameters of this form:
749+
// MinRound:PreferredRound:MinRange
750+
char *RoundParams = getenv("SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS");
751+
if (RoundParams != nullptr) {
752+
std::string Params(RoundParams);
753+
size_t Pos = Params.find(':');
754+
if (Pos != std::string::npos) {
755+
MinFactorX = std::stoi(Params.substr(0, Pos));
756+
Params.erase(0, Pos + 1);
757+
Pos = Params.find(':');
758+
if (Pos != std::string::npos) {
759+
GoodFactorX = std::stoi(Params.substr(0, Pos));
760+
Params.erase(0, Pos + 1);
761+
MinRangeX = std::stoi(Params);
762+
}
763+
}
764+
}
742765

743766
// Disable the rounding-up optimizations under these conditions:
744767
// 1. The env var SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING is set.
@@ -769,15 +792,16 @@ class __SYCL_EXPORT handler {
769792
(KI::callsThisItem());
770793

771794
// Perform range rounding if rounding-up is enabled
795+
// and there are sufficient work-items to need rounding
772796
// and the user-specified range is not a multiple of a "good" value.
773-
if (!DisableRounding && NumWorkItems[0] % GoodLocalSizeX != 0) {
797+
if (!DisableRounding && (NumWorkItems[0] >= MinRangeX) &&
798+
(NumWorkItems[0] % MinFactorX != 0)) {
774799
// It is sufficient to round up just the first dimension.
775800
// Multiplying the rounded-up value of the first dimension
776801
// by the values of the remaining dimensions (if any)
777802
// will yield a rounded-up value for the total range.
778803
size_t NewValX =
779-
((NumWorkItems[0] + GoodLocalSizeX - 1) / GoodLocalSizeX) *
780-
GoodLocalSizeX;
804+
((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX;
781805
using NameWT = typename detail::get_kernel_wrapper_name_t<NameT>::name;
782806
if (getenv("SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE") != nullptr)
783807
std::cout << "parallel_for range adjusted from " << NumWorkItems[0]

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 44 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1802,19 +1802,26 @@ pi_result piDevicePartition(pi_device Device,
18021802

18031803
PI_ASSERT(Device, PI_INVALID_DEVICE);
18041804

1805+
// Check if Device was already partitioned into the same or bigger size
1806+
// before. If so, we can return immediately without searching the global
1807+
// device cache. Note that L0 driver always returns the same handles in the
1808+
// same order for the given number of sub-devices.
1809+
if (OutDevices && NumDevices <= Device->SubDevices.size()) {
1810+
for (uint32_t I = 0; I < NumDevices; I++) {
1811+
OutDevices[I] = Device->SubDevices[I];
1812+
// reusing the same pi_device needs to increment the reference count
1813+
piDeviceRetain(OutDevices[I]);
1814+
}
1815+
if (OutNumDevices)
1816+
*OutNumDevices = NumDevices;
1817+
return PI_SUCCESS;
1818+
}
1819+
18051820
// Get the number of subdevices available.
18061821
// TODO: maybe add interface to create the specified # of subdevices.
18071822
uint32_t Count = 0;
18081823
ZE_CALL(zeDeviceGetSubDevices(Device->ZeDevice, &Count, nullptr));
18091824

1810-
// Check that the requested/allocated # of sub-devices is the same
1811-
// as was reported by the above call.
1812-
// TODO: we may want to support smaller/larger # devices too.
1813-
if (Count != NumDevices) {
1814-
zePrint("piDevicePartition: unsupported # of sub-devices requested\n");
1815-
return PI_INVALID_OPERATION;
1816-
}
1817-
18181825
if (OutNumDevices) {
18191826
*OutNumDevices = Count;
18201827
}
@@ -1825,17 +1832,29 @@ pi_result piDevicePartition(pi_device Device,
18251832
}
18261833

18271834
try {
1835+
pi_platform Platform = Device->Platform;
18281836
auto ZeSubdevices = new ze_device_handle_t[Count];
18291837
ZE_CALL(zeDeviceGetSubDevices(Device->ZeDevice, &Count, ZeSubdevices));
18301838

18311839
// Wrap the Level Zero sub-devices into PI sub-devices, and write them out.
18321840
for (uint32_t I = 0; I < Count; ++I) {
1833-
OutDevices[I] = new _pi_device(ZeSubdevices[I], Device->Platform,
1834-
true /* isSubDevice */);
1835-
pi_result Result = OutDevices[I]->initialize();
1836-
if (Result != PI_SUCCESS) {
1837-
delete[] ZeSubdevices;
1838-
return Result;
1841+
pi_device Dev = Platform->getDeviceFromNativeHandle(ZeSubdevices[I]);
1842+
if (Dev) {
1843+
OutDevices[I] = Dev;
1844+
// reusing the same pi_device needs to increment the reference count
1845+
piDeviceRetain(OutDevices[I]);
1846+
} else {
1847+
std::unique_ptr<_pi_device> PiSubDevice(
1848+
new _pi_device(ZeSubdevices[I], Platform));
1849+
pi_result Result = PiSubDevice->initialize();
1850+
if (Result != PI_SUCCESS) {
1851+
delete[] ZeSubdevices;
1852+
return Result;
1853+
}
1854+
OutDevices[I] = PiSubDevice.get();
1855+
Platform->PiDevicesCache.push_back(std::move(PiSubDevice));
1856+
// save pointers to sub-devices for quick retrieval in the future.
1857+
Device->SubDevices.push_back(Dev);
18391858
}
18401859
}
18411860
delete[] ZeSubdevices;
@@ -1911,29 +1930,25 @@ pi_result piextDeviceCreateWithNativeHandle(pi_native_handle NativeHandle,
19111930
PI_ASSERT(Device, PI_INVALID_DEVICE);
19121931
PI_ASSERT(NativeHandle, PI_INVALID_VALUE);
19131932
PI_ASSERT(Platform, PI_INVALID_PLATFORM);
1914-
1915-
std::lock_guard<std::mutex> Lock(Platform->PiDevicesCacheMutex);
1916-
pi_result Res = populateDeviceCacheIfNeeded(Platform);
1917-
if (Res != PI_SUCCESS) {
1918-
return Res;
1933+
{
1934+
std::lock_guard<std::mutex> Lock(Platform->PiDevicesCacheMutex);
1935+
pi_result Res = populateDeviceCacheIfNeeded(Platform);
1936+
if (Res != PI_SUCCESS) {
1937+
return Res;
1938+
}
19191939
}
1920-
19211940
auto ZeDevice = pi_cast<ze_device_handle_t>(NativeHandle);
19221941

19231942
// The SYCL spec requires that the set of devices must remain fixed for the
19241943
// duration of the application's execution. We assume that we found all of the
19251944
// Level Zero devices when we initialized the device cache, so the
19261945
// "NativeHandle" must already be in the cache. If it is not, this must not be
19271946
// a valid Level Zero device.
1928-
for (const std::unique_ptr<_pi_device> &CachedDevice :
1929-
Platform->PiDevicesCache) {
1930-
if (CachedDevice->ZeDevice == ZeDevice) {
1931-
*Device = CachedDevice.get();
1932-
return PI_SUCCESS;
1933-
}
1934-
}
1935-
1936-
return PI_INVALID_VALUE;
1947+
pi_device Dev = Platform->getDeviceFromNativeHandle(ZeDevice);
1948+
if (Dev == nullptr)
1949+
return PI_INVALID_VALUE;
1950+
*Device = Dev;
1951+
return PI_SUCCESS;
19371952
}
19381953

19391954
pi_result piContextCreate(const pi_context_properties *Properties,

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,12 @@ struct _pi_device : _pi_object {
156156
// Level Zero device handle.
157157
ze_device_handle_t ZeDevice;
158158

159+
// Keep the subdevices that are partitioned from this pi_device for reuse
160+
// The order of sub-devices in this vector is repeated from the
161+
// ze_device_handle_t array that are returned from zeDeviceGetSubDevices()
162+
// call, which will always return sub-devices in the fixed same order.
163+
std::vector<pi_device> SubDevices;
164+
159165
// PI platform to which this device belongs.
160166
pi_platform Platform;
161167

sycl/source/detail/device_image_impl.hpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ class device_image_impl {
4747

4848
// Collect kernel names for the image
4949
pi_device_binary DevBin =
50-
const_cast<pi_device_binary>(&BinImage->getRawData());
50+
const_cast<pi_device_binary>(&MBinImage->getRawData());
5151
for (_pi_offload_entry EntriesIt = DevBin->EntriesBegin;
5252
EntriesIt != DevBin->EntriesEnd; ++EntriesIt) {
5353

sycl/test/basic_tests/kernel_bundle/kernel_bundle_api.cpp renamed to sycl/test/on-device/basic_tests/kernel_bundle/kernel_bundle_api.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,11 @@ class Kernel2Name;
1313

1414
int main() {
1515
sycl::queue Q;
16+
17+
// No support for host device so far.
18+
if (Q.is_host())
19+
return 0;
20+
1621
const sycl::context Ctx = Q.get_context();
1722
const sycl::device Dev = Q.get_device();
1823

0 commit comments

Comments
 (0)