Skip to content

Commit 9012eb9

Browse files
authored
sycl: Add more debug prints (#13640)
1 parent fef693d commit 9012eb9

File tree

16 files changed

+249
-163
lines changed

16 files changed

+249
-163
lines changed

ggml/src/ggml-sycl/binbcast.cpp

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -319,32 +319,27 @@ inline void ggml_sycl_op_repeat(ggml_backend_sycl_context & ctx, ggml_tensor *ds
319319

320320

321321
void ggml_sycl_add(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
322-
GGML_SYCL_DEBUG("call %s\n", __func__);
322+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
323323
ggml_sycl_op_add(ctx, dst);
324-
GGML_SYCL_DEBUG("call %s done\n", __func__);
325324
}
326325

327326
void ggml_sycl_sub(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
328-
GGML_SYCL_DEBUG("call %s\n", __func__);
327+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
329328
ggml_sycl_op_sub(ctx, dst);
330-
GGML_SYCL_DEBUG("call %s done\n", __func__);
331329
}
332330

333331
void ggml_sycl_mul(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
334-
GGML_SYCL_DEBUG("call %s\n", __func__);
332+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
335333
ggml_sycl_op_mul(ctx, dst);
336-
GGML_SYCL_DEBUG("call %s done\n", __func__);
337334
}
338335

339336
void ggml_sycl_div(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
340-
GGML_SYCL_DEBUG("call %s\n", __func__);
337+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
341338
ggml_sycl_op_div(ctx, dst);
342-
GGML_SYCL_DEBUG("call %s done\n", __func__);
343339
}
344340

345341
void ggml_sycl_repeat(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
346-
GGML_SYCL_DEBUG("call %s\n", __func__);
342+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
347343
ggml_sycl_op_repeat(ctx, dst);
348-
GGML_SYCL_DEBUG("call %s done\n", __func__);
349344
}
350345

ggml/src/ggml-sycl/common.hpp

Lines changed: 87 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
#include <fstream>
1717
#include <iostream>
18+
#include <string>
1819

1920
#include "dpct/helper.hpp"
2021
#include "ggml-sycl.h"
@@ -44,11 +45,20 @@ extern int g_ggml_sycl_debug;
4445
extern int g_ggml_sycl_disable_optimize;
4546
extern int g_ggml_sycl_prioritize_dmmv;
4647

47-
#define GGML_SYCL_DEBUG(...) \
48-
do { \
49-
if (g_ggml_sycl_debug) \
50-
fprintf(stderr, __VA_ARGS__); \
51-
} while (0)
48+
#if defined(__clang__) && __has_builtin(__builtin_expect)
49+
// Hint the optimizer to pipeline the more likely following instruction in branches
50+
# define LIKELY(expr) __builtin_expect(expr, true)
51+
# define UNLIKELY(expr) __builtin_expect(expr, false)
52+
#else
53+
# define LIKELY(expr) (expr)
54+
# define UNLIKELY(expr) (expr)
55+
#endif
56+
57+
#define GGML_SYCL_DEBUG(...) \
58+
do { \
59+
if (UNLIKELY(g_ggml_sycl_debug)) \
60+
fprintf(stderr, __VA_ARGS__); \
61+
} while (0)
5262

5363
#define CHECK_TRY_ERROR(expr) \
5464
[&]() { \
@@ -490,4 +500,76 @@ constexpr size_t ceil_div(const size_t m, const size_t n) {
490500
}
491501

492502
bool gpu_has_xmx(sycl::device &dev);
503+
504+
template <int N, class T> void debug_print_array(const std::string & prefix, const T array[N]) {
505+
if (LIKELY(!g_ggml_sycl_debug)) {
506+
return;
507+
}
508+
std::stringstream ss;
509+
ss << prefix << "=[";
510+
for (std::size_t i = 0; i < N - 1; ++i) {
511+
ss << array[i] << ", ";
512+
}
513+
if constexpr (N > 0) {
514+
ss << array[N - 1];
515+
}
516+
ss << "]";
517+
GGML_SYCL_DEBUG("%s", ss.str().c_str());
518+
}
519+
520+
inline void debug_print_tensor(const std::string & prefix, const ggml_tensor * tensor,
521+
const std::string & suffix = "") {
522+
if (LIKELY(!g_ggml_sycl_debug)) {
523+
return;
524+
}
525+
GGML_SYCL_DEBUG("%s=", prefix.c_str());
526+
if (tensor) {
527+
GGML_SYCL_DEBUG("'%s':type=%s", tensor->name, ggml_type_name(tensor->type));
528+
debug_print_array<GGML_MAX_DIMS>(";ne", tensor->ne);
529+
debug_print_array<GGML_MAX_DIMS>(";nb", tensor->nb);
530+
if (!ggml_is_contiguous(tensor)) {
531+
GGML_SYCL_DEBUG(";strided");
532+
}
533+
if (ggml_is_permuted(tensor)) {
534+
GGML_SYCL_DEBUG(";permuted");
535+
}
536+
} else {
537+
GGML_SYCL_DEBUG("nullptr");
538+
}
539+
GGML_SYCL_DEBUG("%s", suffix.c_str());
540+
}
541+
542+
// Use scope_op_debug_print to log operations coming from running a model
543+
struct scope_op_debug_print {
544+
// Use string_views to avoid the cost of creating a string and concatenating them
545+
// string_views must be alive for as long as the object is alive
546+
// scope_op_debug_print are used with string literals in practice which are stored in constant space so always accessible
547+
scope_op_debug_print(const std::string_view & func, const std::string_view & func_suffix, const ggml_tensor * dst,
548+
std::size_t num_src, const std::string_view & suffix = "") :
549+
func(func),
550+
func_suffix(func_suffix) {
551+
if (LIKELY(!g_ggml_sycl_debug)) {
552+
return;
553+
}
554+
GGML_SYCL_DEBUG("[SYCL][OP] call %s%s:", func.data(), func_suffix.data());
555+
debug_print_tensor(" dst", dst);
556+
if (dst) {
557+
for (std::size_t i = 0; i < num_src; ++i) {
558+
debug_print_tensor("\tsrc" + std::to_string(i), dst->src[i]);
559+
}
560+
}
561+
GGML_SYCL_DEBUG("%s\n", suffix.data());
562+
}
563+
564+
scope_op_debug_print(const std::string_view & func, const ggml_tensor * dst, std::size_t num_src,
565+
const std::string_view & suffix = "") :
566+
scope_op_debug_print(func, "", dst, num_src, suffix) {}
567+
568+
~scope_op_debug_print() { GGML_SYCL_DEBUG("[SYCL][OP] call %s%s done\n", func.data(), func_suffix.data()); }
569+
570+
private:
571+
std::string_view func;
572+
std::string_view func_suffix;
573+
};
574+
493575
#endif // GGML_SYCL_COMMON_HPP

ggml/src/ggml-sycl/concat.cpp

Lines changed: 31 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -159,39 +159,37 @@ static void concat_f32_sycl_non_cont(
159159
}
160160

161161
void ggml_sycl_op_concat(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
162-
const ggml_tensor *src0 = dst->src[0];
163-
const ggml_tensor *src1 = dst->src[1];
164-
queue_ptr stream = ctx.stream();
165-
166-
const int32_t dim = ((int32_t *)dst->op_params)[0];
167-
168-
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
169-
const float *src0_d = (const float *)src0->data;
170-
const float *src1_d = (const float *)src1->data;
171-
172-
float *dst_d = (float *)dst->data;
173-
174-
if (dim != 3) {
175-
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
176-
concat_f32_sycl(
177-
src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
178-
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1],
179-
src0->ne[2], dst->ne[0], dst->ne[1], dst->ne[2], dim, stream);
180-
}
162+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
163+
const ggml_tensor * src0 = dst->src[0];
164+
const ggml_tensor * src1 = dst->src[1];
165+
queue_ptr stream = ctx.stream();
166+
167+
const int32_t dim = ((int32_t *) dst->op_params)[0];
168+
169+
if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
170+
const float * src0_d = (const float *) src0->data;
171+
const float * src1_d = (const float *) src1->data;
172+
173+
float * dst_d = (float *) dst->data;
174+
175+
if (dim != 3) {
176+
for (int i3 = 0; i3 < dst->ne[3]; i3++) {
177+
concat_f32_sycl(src0_d + i3 * (src0->nb[3] / 4), src1_d + i3 * (src1->nb[3] / 4),
178+
dst_d + i3 * (dst->nb[3] / 4), src0->ne[0], src0->ne[1], src0->ne[2], dst->ne[0],
179+
dst->ne[1], dst->ne[2], dim, stream);
180+
}
181+
} else {
182+
const size_t size0 = ggml_nbytes(src0);
183+
const size_t size1 = ggml_nbytes(src1);
184+
185+
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
186+
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
187+
}
181188
} else {
182-
const size_t size0 = ggml_nbytes(src0);
183-
const size_t size1 = ggml_nbytes(src1);
184-
185-
SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy(dst_d, src0_d, size0).wait()));
186-
SYCL_CHECK(CHECK_TRY_ERROR(
187-
stream->memcpy(dst_d + size0 / 4, src1_d, size1).wait()));
189+
concat_f32_sycl_non_cont(stream, (const char *) src0->data, (const char *) src1->data, (char *) dst->data,
190+
src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], src0->nb[0], src0->nb[1],
191+
src0->nb[2], src0->nb[3], src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
192+
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
193+
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
188194
}
189-
} else
190-
concat_f32_sycl_non_cont(
191-
stream, (const char *)src0->data, (const char *)src1->data,
192-
(char *)dst->data, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
193-
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3], src1->ne[0],
194-
src1->ne[1], src1->ne[2], src1->ne[3], src1->nb[0], src1->nb[1],
195-
src1->nb[2], src1->nb[3], dst->ne[0], dst->ne[1], dst->ne[2],
196-
dst->ne[3], dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3], dim);
197195
}

ggml/src/ggml-sycl/conv.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ static void conv_transpose_1d_f32_f32_sycl(
7272
}
7373

7474
void ggml_sycl_op_conv_transpose_1d(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
75+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2);
7576
const ggml_tensor *src0 = dst->src[0];
7677
const ggml_tensor *src1 = dst->src[1];
7778
const float * src0_d = (const float *)src0->data;

ggml/src/ggml-sycl/cpy.cpp

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,9 @@ static void ggml_cpy_i32_i32_sycl(const char * cx, char * cdst, const int ne, co
616616
}
617617

618618
void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1) try {
619+
// Unlike other operators ggml_sycl_cpy takes 2 distinct tensors instead of a dst ggml_tensor and rely on its src field
620+
scope_op_debug_print scope_dbg_print(__func__, src1, /*num_src=*/0,
621+
std::string(" src0 type=") + ggml_type_name(src0->type));
619622
const int64_t ne = ggml_nelements(src0);
620623
GGML_ASSERT(ne == ggml_nelements(src1));
621624

@@ -629,8 +632,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
629632

630633
char * src0_ddc = (char *) src0->data;
631634
char * src1_ddc = (char *) src1->data;
632-
GGML_SYCL_DEBUG("[SYCL] %s: Tensor supplied: %s to %s\n", __func__, ggml_type_name(src0->type),
633-
ggml_type_name(src1->type));
634635

635636
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
636637
ggml_cpy_f32_f32_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10,
@@ -694,8 +695,6 @@ void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, co
694695
}
695696

696697
void ggml_sycl_dup(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
697-
// TODO: why do we pass dst as src1 here?
698-
GGML_SYCL_DEBUG("[SYCL] call %s\n", __func__);
698+
scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
699699
ggml_sycl_cpy(ctx, dst->src[0], dst);
700-
GGML_SYCL_DEBUG("[SYCL] call %s done\n", __func__);
701700
}

ggml/src/ggml-sycl/dmmv.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1092,6 +1092,8 @@ void ggml_sycl_op_dequantize_mul_mat_vec(
10921092
src0->type == GGML_TYPE_Q8_0 || src0->type == GGML_TYPE_F16;
10931093

10941094
if (src1_convert_f16) {
1095+
scope_op_debug_print scope_dbg_print(__func__, "/to_fp16_sycl", dst, /*num_src=*/2,
1096+
" : converting src1 to fp16");
10951097
src1_dfloat = src1_dfloat_a.alloc(ne00);
10961098
const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type, dst);
10971099
GGML_ASSERT(to_fp16_sycl != nullptr);

0 commit comments

Comments
 (0)