 #include "dpct/helper.hpp"
 #include "ggml-sycl.h"
 #include "presets.hpp"
+#if GGML_SYCL_DNNL
+#include "dnnl.hpp"
+#include "dnnl_sycl.hpp"
+#endif
 
 #define GGML_COMMON_DECL_SYCL
 #define GGML_COMMON_IMPL_SYCL
@@ -59,7 +63,7 @@ static int g_ggml_sycl_debug = 0;
 // define for XMX in Intel GPU
 // TODO: currently, it's not used for XMX really.
 #if !defined(GGML_SYCL_FORCE_MMQ)
-#define SYCL_USE_XMX
+    #define SYCL_USE_XMX
 #endif
 
 // max batch size to use MMQ kernels when tensor cores are available
@@ -80,16 +84,16 @@ static int g_ggml_sycl_debug = 0;
 typedef sycl::queue *queue_ptr;
 
 enum ggml_sycl_backend_gpu_mode {
-  SYCL_UNSET_GPU_MODE = -1,
-  SYCL_SINGLE_GPU_MODE = 0,
-  SYCL_MUL_GPU_MODE
+    SYCL_UNSET_GPU_MODE = -1,
+    SYCL_SINGLE_GPU_MODE = 0,
+    SYCL_MUL_GPU_MODE
 };
 
 static_assert(sizeof(sycl::half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 
 static void crash() {
-  int * ptr = NULL;
-  *ptr = 0;
+    int * ptr = NULL;
+    *ptr = 0;
 }
 
 [[noreturn]] static void ggml_sycl_error(
@@ -98,9 +102,9 @@ static void crash() {
     const char * file,
     const int line,
     const char * msg) {
-  fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
-  fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
-  GGML_ABORT("SYCL error");
+    fprintf(stderr, "SYCL error: %s: %s\n", stmt, msg);
+    fprintf(stderr, "  in function %s at %s:%d\n", func, file, line);
+    GGML_ABORT("SYCL error");
 }
 
 #define SYCL_CHECK(err) \
@@ -137,40 +141,40 @@ static int g_all_sycl_device_count = -1;
 static bool g_ggml_backend_sycl_buffer_type_initialized = false;
 
 static ggml_sycl_backend_gpu_mode g_ggml_sycl_backend_gpu_mode =
-  SYCL_UNSET_GPU_MODE;
+    SYCL_UNSET_GPU_MODE;
 
 static void * g_scratch_buffer = nullptr;
 static size_t g_scratch_size = 0; // disabled by default
 static size_t g_scratch_offset = 0;
 
 [[noreturn]] static inline void bad_arch(const sycl::stream & stream_ct1) {
-  stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
-    "current GPU architecture.\n";
-  // __trap();
-  std::exit(1);
+    stream_ct1 << "ERROR: ggml-sycl was compiled without support for the "
+                  "current GPU architecture.\n";
+    // __trap();
+    std::exit(1);
 
-  (void) bad_arch; // suppress unused function warning
+    (void) bad_arch; // suppress unused function warning
 }
 
 int get_current_device_id();
 
 inline dpct::err0 ggml_sycl_set_device(const int device) try {
 
-  int current_device_id;
-  SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
+    int current_device_id;
+    SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id()));
 
-  // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
-  // current_device_id=%d\n", device, current_device);
-  if (device == current_device_id) {
-    return 0;
-  }
+    // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d,
+    // current_device_id=%d\n", device, current_device);
+    if (device == current_device_id) {
+        return 0;
+    }
 
-  return CHECK_TRY_ERROR(dpct::select_device(device));
+    return CHECK_TRY_ERROR(dpct::select_device(device));
 } catch (sycl::exception const & exc) {
-  std::cerr << exc.what() << "Exception caught at file:" << __FILE__
-    << ", line:" << __LINE__ << std::endl;
-  crash();
-  std::exit(1);
+    std::cerr << exc.what() << "Exception caught at file:" << __FILE__
+              << ", line:" << __LINE__ << std::endl;
+    crash();
+    std::exit(1);
 }
 
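For orientation, a minimal usage sketch of the helper above. This is not part of the patch, and `warm_up_all_devices` is a hypothetical name: `ggml_sycl_set_device()` returns early when the requested device is already current, so per-device loops can call it unconditionally.

```cpp
// Hypothetical caller (illustration only): make each device current in turn
// before doing per-device work, using the file's SYCL_CHECK error convention.
static void warm_up_all_devices(int device_count) {
    for (int i = 0; i < device_count; ++i) {
        SYCL_CHECK(ggml_sycl_set_device(i)); // no-op if already current
        // ... per-device initialization would go here ...
    }
}
```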
 //////////////////////
@@ -248,10 +252,10 @@ struct ggml_sycl_pool_alloc {
 // backend interface
 
 struct ggml_tensor_extra_gpu {
-  void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
-  // tensors
-  dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
-    [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
+    void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split
+                                               // tensors
+    dpct::event_ptr events[GGML_SYCL_MAX_DEVICES]
+                          [GGML_SYCL_MAX_STREAMS]; // events for synchronizing multiple GPUs
 };
 
 struct ggml_backend_sycl_context {
@@ -276,6 +280,33 @@ struct ggml_backend_sycl_context {
         return stream(device, 0);
     }
 
+#if GGML_SYCL_DNNL
+    dnnl::stream make_stream(sycl::queue & q) {
+        // Get the device associated with the queue
+        sycl::device dev = q.get_device();
+        // Get the context associated with the queue
+        sycl::context ctx = q.get_context();
+        const dnnl::engine eng = dnnl::sycl_interop::make_engine(dev, ctx);
+        dnnl::stream stream = dnnl::sycl_interop::make_stream(eng, q);
+        return stream;
+    }
+    std::unordered_map<sycl::queue *, dnnl::stream> stream_map;
+    dnnl::stream stream_dnnl(int device, int _stream) {
+        auto q = stream(device, _stream);
+        return stream_dnnl(q);
+    }
+    dnnl::stream stream_dnnl(sycl::queue * qptr) {
+        auto it = stream_map.find(qptr);
+        if (it == stream_map.end()) {
+            it = stream_map.emplace(qptr, make_stream(*qptr)).first; // cache on first use
+        }
+        return it->second;
+    }
+    dnnl::stream stream_dnnl() {
+        return stream_dnnl(device, 0);
+    }
+#endif
+
     // pool
     std::unique_ptr<ggml_sycl_pool> pools[GGML_SYCL_MAX_DEVICES];
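The `stream_map` cache above avoids rebuilding a `dnnl::engine`/`dnnl::stream` pair on every call: the first lookup for a given `sycl::queue*` constructs the stream via `make_stream()`, and later lookups reuse it. A hypothetical call site (`run_dnnl_matmul`, `matmul_prim`, and `args` are illustrative names, not part of this patch):

```cpp
// Illustration only: execute a previously built oneDNN primitive on the
// backend's cached DNNL stream (device 0, stream 0 via stream_dnnl()).
void run_dnnl_matmul(ggml_backend_sycl_context & ctx,
                     dnnl::matmul & matmul_prim,
                     const std::unordered_map<int, dnnl::memory> & args) {
    dnnl::stream strm = ctx.stream_dnnl(); // cached per-queue stream
    matmul_prim.execute(strm, args);       // enqueue on the underlying SYCL queue
    strm.wait();                           // block until the primitive completes
}
```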