Skip to content

Commit 3fa65de

Browse files
authored
[mlir] SYCL runtime wrapper: add memcpy support. (#141647)
1 parent 4dbc755 commit 3fa65de

File tree

2 files changed

+60
-3
lines changed

2 files changed

+60
-3
lines changed

mlir/lib/ExecutionEngine/SyclRuntimeWrappers.cpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13-
#include <CL/sycl.hpp>
1413
#include <level_zero/ze_api.h>
1514
#include <sycl/ext/oneapi/backend/level_zero.hpp>
15+
#include <sycl/sycl.hpp>
1616

1717
#ifdef _WIN32
1818
#define SYCL_RUNTIME_EXPORT __declspec(dllexport)
@@ -81,8 +81,7 @@ static void *allocDeviceMemory(sycl::queue *queue, size_t size, bool isShared) {
8181
memPtr = sycl::aligned_alloc_shared(64, size, getDefaultDevice(),
8282
getDefaultContext());
8383
} else {
84-
memPtr = sycl::aligned_alloc_device(64, size, getDefaultDevice(),
85-
getDefaultContext());
84+
memPtr = sycl::aligned_alloc_device(64, size, *queue);
8685
}
8786
if (memPtr == nullptr) {
8887
throw std::runtime_error("mem allocation failed!");
@@ -208,3 +207,8 @@ mgpuModuleUnload(ze_module_handle_t module) {
208207

209208
catchAll([&]() { L0_SAFE_CALL(zeModuleDestroy(module)); });
210209
}
210+
211+
/// Copies `sizeBytes` bytes from `src` to `dst` by submitting a memcpy on
/// `queue`, then blocks until the returned event has completed, so the copy
/// is finished by the time this function returns.
///
/// The work is routed through `catchAll`, the same wrapper the other exported
/// entry points in this file use (see its definition for how failures are
/// reported).
///
/// \param dst       Destination address of the copy.
/// \param src       Source address of the copy.
/// \param sizeBytes Number of bytes to copy.
/// \param queue     Queue the copy is submitted to; dereferenced without a
///                  null check.
extern "C" SYCL_RUNTIME_EXPORT void
mgpuMemcpy(void *dst, void *src, size_t sizeBytes, sycl::queue *queue) {
  catchAll([&]() {
    // Submit the copy first, then wait on its event to make the call
    // synchronous from the caller's point of view.
    auto copyDone = queue->memcpy(dst, src, sizeBytes);
    copyDone.wait();
  });
}
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// RUN: mlir-opt %s -pass-pipeline='builtin.module(func.func(gpu-async-region),spirv-attach-target{ver=v1.0 caps=Addresses,Int64,Kernel},convert-gpu-to-spirv{use-64bit-index=true},gpu.module(spirv.module(spirv-lower-abi-attrs,spirv-update-vce)),func.func(llvm-request-c-wrappers),convert-scf-to-cf,convert-cf-to-llvm,convert-arith-to-llvm,convert-math-to-llvm,convert-func-to-llvm,gpu-to-llvm{use-bare-pointers-for-kernels=true},gpu-module-to-binary,expand-strided-metadata,lower-affine,finalize-memref-to-llvm,reconcile-unrealized-casts)' \
2+
// RUN: | mlir-runner \
3+
// RUN: --shared-libs=%mlir_sycl_runtime \
4+
// RUN: --shared-libs=%mlir_runner_utils \
5+
// RUN: --entry-point-result=void \
6+
// RUN: | FileCheck %s
7+
8+
module @add attributes {gpu.container_module} {
9+
memref.global "private" constant @__constant_2x2x2xf32_0 : memref<2x2x2xf32> = dense<[[[1.1, 2.2], [3.3, 4.4]], [[5.5, 6.6], [7.7, 8.8 ]]]>
10+
memref.global "private" constant @__constant_2x2x2xf32 : memref<2x2x2xf32> = dense<[[[1.2, 2.3], [4.5, 5.8]], [[7.2, 8.3], [10.5, 11.8]]]>
11+
// Test entry point: feeds the two 2x2x2 f32 constant globals to @test, prints
// the returned memref through @printMemrefF32, and releases it.
func.func @main() {
12+
%0 = memref.get_global @__constant_2x2x2xf32 : memref<2x2x2xf32>
13+
%1 = memref.get_global @__constant_2x2x2xf32_0 : memref<2x2x2xf32>
14+
%2 = call @test(%0, %1) : (memref<2x2x2xf32>, memref<2x2x2xf32>) -> memref<2x2x2xf32>
15+
// @printMemrefF32 takes an unranked memref, so the ranked result is cast first.
%cast = memref.cast %2 : memref<2x2x2xf32> to memref<*xf32>
16+
call @printMemrefF32(%cast) : (memref<*xf32>) -> ()
17+
// The buffer returned by @test comes from memref.alloc there; free it here.
memref.dealloc %2 : memref<2x2x2xf32>
18+
return
19+
}
20+
func.func private @printMemrefF32(memref<*xf32>)
21+
// Element-wise add of %arg0 and %arg1 routed through gpu.alloc'd buffers:
// both inputs are copied in with gpu.memcpy, @test_kernel is launched, and the
// result is copied back out with gpu.memcpy into a memref.alloc'd buffer that
// is returned to the caller.
func.func @test(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>) -> memref<2x2x2xf32> {
22+
%c2 = arith.constant 2 : index
23+
%c1 = arith.constant 1 : index
24+
// Buffer for the first operand, filled from %arg0.
%memref = gpu.alloc () : memref<2x2x2xf32>
25+
gpu.memcpy %memref, %arg0 : memref<2x2x2xf32>, memref<2x2x2xf32>
26+
// Buffer for the second operand, filled from %arg1.
%memref_0 = gpu.alloc () : memref<2x2x2xf32>
27+
gpu.memcpy %memref_0, %arg1 : memref<2x2x2xf32>, memref<2x2x2xf32>
28+
// Output buffer written by the kernel.
%memref_1 = gpu.alloc () : memref<2x2x2xf32>
29+
// Grid of 2x2x2 blocks with 1x1x1 threads each: one block per tensor element.
gpu.launch_func @test_kernel::@test_kernel blocks in (%c2, %c2, %c2) threads in (%c1, %c1, %c1) args(%memref : memref<2x2x2xf32>, %memref_0 : memref<2x2x2xf32>, %memref_1 : memref<2x2x2xf32>)
30+
// Copy the kernel output into a memref.alloc'd buffer before freeing the
// gpu.alloc'd ones.
%alloc = memref.alloc() : memref<2x2x2xf32>
31+
gpu.memcpy %alloc, %memref_1 : memref<2x2x2xf32>, memref<2x2x2xf32>
32+
gpu.dealloc %memref_1 : memref<2x2x2xf32>
33+
gpu.dealloc %memref_0 : memref<2x2x2xf32>
34+
gpu.dealloc %memref : memref<2x2x2xf32>
35+
return %alloc : memref<2x2x2xf32>
36+
}
37+
// GPU module holding the add kernel. Its SPIR-V target environment is declared
// inline: version v1.0 with the Addresses, Int64 and Kernel capabilities and
// the OpenCL API.
gpu.module @test_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Int64, Kernel], []>, api=OpenCL, #spirv.resource_limits<>>} {
38+
// Kernel: arg2[x, y, z] = arg0[x, y, z] + arg1[x, y, z], where (x, y, z) are
// the block ids. The attributes record a known block size of 1x1x1 and a known
// grid size of 2x2x2, matching the launch in @test.
gpu.func @test_kernel(%arg0: memref<2x2x2xf32>, %arg1: memref<2x2x2xf32>, %arg2: memref<2x2x2xf32>) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 2, 2, 2>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
39+
// The three block ids are used directly as the element indices.
%0 = gpu.block_id x
40+
%1 = gpu.block_id y
41+
%2 = gpu.block_id z
42+
%3 = memref.load %arg0[%0, %1, %2] : memref<2x2x2xf32>
43+
%4 = memref.load %arg1[%0, %1, %2] : memref<2x2x2xf32>
44+
%5 = arith.addf %3, %4 : f32
45+
memref.store %5, %arg2[%0, %1, %2] : memref<2x2x2xf32>
46+
gpu.return
47+
}
48+
}
49+
// CHECK: [2.3, 4.5]
50+
// CHECK: [7.8, 10.2]
51+
// CHECK: [12.7, 14.9]
52+
// CHECK: [18.2, 20.6]
53+
}

0 commit comments

Comments
 (0)