19
19
#include " dpct/helper.hpp"
20
20
#include " ggml-sycl.h"
21
21
#include " presets.hpp"
22
+ #if GGML_SYCL_DNNL
23
+ #include " dnnl.hpp"
24
+ #include " dnnl_sycl.hpp"
25
+ #endif
22
26
23
27
#define GGML_COMMON_DECL_SYCL
24
28
#define GGML_COMMON_IMPL_SYCL
@@ -59,7 +63,7 @@ static int g_ggml_sycl_debug = 0;
59
63
// define for XMX in Intel GPU
60
64
// TODO: currently, it's not used for XMX really.
61
65
#if !defined(GGML_SYCL_FORCE_MMQ)
62
- #define SYCL_USE_XMX
66
+ #define SYCL_USE_XMX
63
67
#endif
64
68
65
69
// max batch size to use MMQ kernels when tensor cores are available
@@ -80,16 +84,16 @@ static int g_ggml_sycl_debug = 0;
80
84
typedef sycl::queue *queue_ptr;
81
85
82
86
// How many GPUs the SYCL backend drives. Starts out unset; backend
// initialization selects single- or multi-GPU operation.
enum ggml_sycl_backend_gpu_mode {
    SYCL_UNSET_GPU_MODE  = -1,
    SYCL_SINGLE_GPU_MODE = 0,
    SYCL_MUL_GPU_MODE
};
87
91
88
92
static_assert (sizeof (sycl::half) == sizeof(ggml_fp16_t ), "wrong fp16 size");
89
93
90
94
// Deliberately write through a null pointer so the process dies with a
// segfault at the point of failure (yields a core dump / debugger break).
// The pointer is volatile so the compiler cannot optimize away the UB
// store and defeat the intended crash; nullptr is used over NULL (C++).
static void crash() {
    volatile int * ptr = nullptr;
    *ptr = 0;
}
94
98
95
99
[[noreturn]] static void ggml_sycl_error (
@@ -98,9 +102,9 @@ static void crash() {
98
102
const char * file,
99
103
const int line,
100
104
const char * msg) {
101
- fprintf (stderr, " SYCL error: %s: %s\n " , stmt, msg);
102
- fprintf (stderr, " in function %s at %s:%d\n " , func, file, line);
103
- GGML_ABORT (" SYCL error" );
105
+ fprintf (stderr, " SYCL error: %s: %s\n " , stmt, msg);
106
+ fprintf (stderr, " in function %s at %s:%d\n " , func, file, line);
107
+ GGML_ABORT (" SYCL error" );
104
108
}
105
109
106
110
#define SYCL_CHECK (err ) \
@@ -138,40 +142,40 @@ static int g_all_sycl_device_count = -1;
138
142
static bool g_ggml_backend_sycl_buffer_type_initialized = false ;
139
143
140
144
static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
141
- SYCL_UNSET_GPU_MODE;
145
+ SYCL_UNSET_GPU_MODE;
142
146
143
147
static void * g_scratch_buffer = nullptr ;
144
148
static size_t g_scratch_size = 0 ; // disabled by default
145
149
static size_t g_scratch_offset = 0 ;
146
150
147
151
// Reached from kernels compiled without support for the current GPU
// architecture: prints a diagnostic through the device stream and
// terminates the process.
[[noreturn]] static inline void bad_arch(const sycl::stream & stream_ct1) {
    stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
                  "current GPU architecture.\n";

    // __trap();
    std::exit(1);

    (void) bad_arch;  // suppress unused function warning
}
155
159
156
160
int get_current_device_id ();
157
161
158
162
// Make `device` the current SYCL device. Returns 0 when the device is
// already current (no switch performed), otherwise the result of
// dpct::select_device. Any sycl::exception terminates the process.
inline dpct::err0 ggml_sycl_set_device(const int device) try {
    int current_device_id;
    SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));

    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
    // current_device_id=%d\n", device, current_device);
    if (device == current_device_id) {
        return 0;  // already selected — nothing to do
    }

    return CHECK_TRY_ERROR(dpct::select_device(device));
} catch (sycl::exception const & exc) {
    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
              << ", line:" << __LINE__ << std::endl;
    crash();
    std::exit(1);  // unreachable: crash() does not return
}
176
180
177
181
// ////////////////////
@@ -249,10 +253,10 @@ struct ggml_sycl_pool_alloc {
249
253
// backend interface
250
254
251
255
struct ggml_tensor_extra_gpu {
252
- void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
253
- // tensors
254
- dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
255
- [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
256
+ void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
257
+ // tensors
258
+ dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
259
+ [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
256
260
};
257
261
258
262
struct ggml_backend_sycl_context {
@@ -277,6 +281,33 @@ struct ggml_backend_sycl_context {
277
281
return stream (device, 0 );
278
282
}
279
283
284
#if GGML_SYCL_DNNL
    // Build a oneDNN stream bound to the given SYCL queue: the engine is
    // created from the queue's own device and context.
    dnnl::stream make_stream(sycl::queue & q) {
        sycl::device  dev = q.get_device();
        sycl::context ctx = q.get_context();
        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
        return dnnl::sycl_interop::make_stream(eng, q);
    }

    // Cache of oneDNN streams, keyed by the SYCL queue they wrap.
    std::unordered_map<sycl::queue *, dnnl::stream> stream_map;

    dnnl::stream stream_dnnl(int device, int _stream) {
        auto q = stream(device, _stream);
        return stream_dnnl(q);
    }
    dnnl::stream stream_dnnl(sycl::queue * qptr) {
        auto it = stream_map.find(qptr);
        if (it == stream_map.end()) {
            // BUG FIX: the original inserted via stream_map[qptr] and then
            // returned it->second while `it` was still end() — dereferencing
            // an end iterator is undefined behavior on every cache miss.
            // Capture the iterator to the newly inserted entry instead.
            it = stream_map.emplace(qptr, make_stream(*qptr)).first;
        }
        return it->second;
    }
    dnnl::stream stream_dnnl() {
        return stream_dnnl(device, 0);
    }
#endif
310
+
280
311
// pool
281
312
std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
282
313
0 commit comments