Refactor device management and usage API #2

Merged
merged 6 commits on Aug 1, 2024
2 changes: 2 additions & 0 deletions ggml/include/ggml-sycl.h
@@ -38,6 +38,8 @@ GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int index);
GGML_API GGML_CALL void ggml_sycl_set_single_device(int main_gpu_id);

GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);

// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
63 changes: 32 additions & 31 deletions ggml/src/ggml-sycl.cpp
@@ -38,6 +38,7 @@

#include "ggml-sycl/backend.hpp"
#include "ggml-sycl/presets.hpp"
#include "ggml-sycl/sycl_device.hpp"


void ggml_sycl_free_data(struct ggml_tensor * tensor);
@@ -48,7 +49,7 @@ void ggml_sycl_get_device_description(int device, char * description, size_t d
bool ggml_backend_is_sycl(ggml_backend_t backend);
int ggml_backend_sycl_get_device(ggml_backend_t backend);
static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer);

static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer);

void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
@@ -2279,11 +2280,11 @@ static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_SYC
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
if (tensor_split[i] < (i + 1 < ggml_sycl_info().device_count ? tensor_split[i + 1] : 1.0f)) {
if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
min_compute_capability = ggml_sycl_info().devices[id].cc;
if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
min_compute_capability = ggml_sycl_info().infos[id].cc;
}
if (max_compute_capability < ggml_sycl_info().devices[id].cc) {
max_compute_capability = ggml_sycl_info().devices[id].cc;
if (max_compute_capability < ggml_sycl_info().infos[id].cc) {
max_compute_capability = ggml_sycl_info().infos[id].cc;
}
}
}
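Note: the hunk above renames `devices[id]` to `infos[id]`, and later hunks replace the `device_count` / `ggml_backend_sycl_get_device_id(i)` loops with a range-for over `ggml_sycl_info().ids`. For orientation, a minimal sketch of what the refactored bookkeeping in `ggml-sycl/sycl_device.hpp` presumably looks like; only the names `ids`, `infos`, `cc` and `qptrs` are taken from this diff, while the container types, extra fields and the `main_gpu_id` overload are assumptions:

```cpp
// Sketch only: assumed layout of the refactored device bookkeeping.
#include <sycl/sycl.hpp>
#include <unordered_map>
#include <vector>

using queue_ptr = sycl::queue *;      // as in the existing ggml-sycl headers
#define GGML_SYCL_MAX_STREAMS 8       // real value lives in ggml-sycl/presets.hpp

struct ggml_sycl_device_info {
    int device_count = 0;
    std::vector<int> ids;             // ids of the SYCL devices actually in use

    struct device_record {
        int       cc = 0;             // compute capability, as read above
        queue_ptr qptrs[GGML_SYCL_MAX_STREAMS] = { nullptr };  // per-device streams
        // ... memory sizes, work-group limits, etc.
    };
    std::unordered_map<int, device_record> infos;  // keyed by device id
};

// Assumed accessor: returns the process-wide singleton; passing an id presumably
// restricts the table to that one GPU (see ggml_backend_sycl_set_single_device_mode).
ggml_sycl_device_info & ggml_sycl_info(int main_gpu_id = -1);
```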
@@ -2680,17 +2681,14 @@ static void ggml_sycl_set_peer_access(const int n_tokens, int main_device) {
}

#ifdef NDEBUG
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto &id: ggml_sycl_info().ids) {
SYCL_CHECK(ggml_sycl_set_device(id));
}

for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto &id: ggml_sycl_info().ids) {
SYCL_CHECK(ggml_sycl_set_device(id));

for (int i_other = 0; i_other < ggml_sycl_info().device_count; ++i_other) {
int id_other = ggml_backend_sycl_get_device_id(i_other);
for (auto &id_other: ggml_sycl_info().ids) {
if (id == id_other) {
continue;
}
@@ -2818,8 +2816,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
}
}

for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -2843,7 +2840,6 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
} else {
dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1));
}

if (convert_src1_to_q8_1) {
dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs);

@@ -2885,8 +2881,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_SYCL_MAX_STREAMS : 0;
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;

for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -3028,8 +3023,7 @@ static void ggml_sycl_op_mul_mat(ggml_backend_sycl_context & ctx, const ggml_ten
is_max = is_max <= GGML_SYCL_MAX_STREAMS ? is_max : GGML_SYCL_MAX_STREAMS;

ggml_sycl_set_device(ctx.device);
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
if (dev[id].row_low == dev[id].row_high) {
continue;
}
@@ -3165,8 +3159,13 @@ static void ggml_sycl_pad(ggml_backend_sycl_context & ctx, const ggml_tensor * s

static void ggml_sycl_rms_norm(ggml_backend_sycl_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_SYCL_DEBUG("call %s\n", __func__);
// log_tensor_with_cnt(ctx, "log/src0", src0, -1);
// log_tensor_with_cnt(ctx, "log/src1", src1, -1);
// log_tensor_with_cnt(ctx, "log/dst0", dst, -1);
ggml_sycl_op_flatten(ctx, src0, src1, dst, ggml_sycl_op_rms_norm);
// log_tensor_with_cnt(ctx, "log/dst1", dst, -1);
GGML_SYCL_DEBUG("call %s done\n", __func__);
// exit(1);
}

static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const ggml_tensor *src0,
@@ -3417,12 +3416,12 @@ static void ggml_sycl_mul_mat(ggml_backend_sycl_context & ctx, const ggml_tensor
continue;
}

if (min_compute_capability > ggml_sycl_info().devices[id].cc) {
min_compute_capability = ggml_sycl_info().devices[id].cc;
if (min_compute_capability > ggml_sycl_info().infos[id].cc) {
min_compute_capability = ggml_sycl_info().infos[id].cc;
}
}
} else {
min_compute_capability = ggml_sycl_info().devices[ctx.device].cc;
min_compute_capability = ggml_sycl_info().infos[ctx.device].cc;
}

// check data types and tensor shapes for custom matrix multiplication kernels:
@@ -4332,7 +4331,6 @@ static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);

GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");

check_allow_device_id(device_id);
@@ -4342,10 +4340,9 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device_id) {
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
auto & device = dpct::dev_mgr::instance().get_device(id);
queue_ptr stream = &(device.default_queue());
queue_ptr stream = ggml_sycl_info().infos[id].qptrs[0];
ggml_backend_sycl_buffer_types[id] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), stream},
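Note: in the hunk above the default stream for a buffer type now comes from the queue pointers cached in `ggml_sycl_info().infos[id].qptrs` rather than from `dpct::dev_mgr` at call time. A before/after sketch, assuming `qptrs[0]` holds the device's default queue created while the info table is built (the helper name is hypothetical):

```cpp
// Hypothetical helper illustrating the lookup change; not part of the PR.
static queue_ptr default_stream_for(int id) {
    // before: fetch the default queue from dpct on every lookup
    // queue_ptr stream = &(dpct::dev_mgr::instance().get_device(id).default_queue());

    // after: reuse the queue pointer cached next to the rest of the device info
    return ggml_sycl_info().infos[id].qptrs[0];
}
```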
@@ -4366,8 +4363,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_conte
static bool ggml_backend_sycl_buffer_type_initialized = false;

if (!ggml_backend_sycl_buffer_type_initialized) {
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
ggml_backend_sycl_buffer_types[id] = {
/* .iface = */ ggml_backend_sycl_buffer_type_interface,
/* .context = */ new ggml_backend_sycl_buffer_type_context{id, GGML_SYCL_NAME + std::to_string(id), ctx->stream(id, 0)},
@@ -4396,8 +4392,7 @@ static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tens
struct ggml_backend_sycl_split_buffer_context {
~ggml_backend_sycl_split_buffer_context() try {
for (ggml_tensor_extra_gpu * extra : tensor_extras) {
for (int i = 0; i < ggml_sycl_info().device_count; ++i) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
for (int64_t is = 0; is < GGML_SYCL_MAX_STREAMS; ++is) {
if (extra->events[id][is] != nullptr) {
/*
@@ -5148,6 +5143,13 @@ GGML_CALL int ggml_backend_sycl_get_device_count() {
return ggml_sycl_info().device_count;
}

GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id) {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_set_single_device_mode\n");
fprintf(stderr, "ggml_backend_sycl_set_single_device_mode: use single device: [%d]\n", main_gpu_id);
ggml_sycl_info(main_gpu_id);
}
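For context, a minimal caller-side sketch of how the new entry point is meant to be used together with the existing `ggml_backend_sycl_init(device)`; the wrapper is hypothetical, and calling it before any backend or buffer type exists is an assumption based on `ggml_sycl_info()` being built lazily:

```cpp
// Hypothetical wrapper: pin ggml-sycl to a single GPU.
// Only the two ggml_backend_sycl_* calls are real API; the wrapper itself is illustrative.
static ggml_backend_t init_backend_on_single_gpu(int main_gpu_id) {
    // Assumed to be required first so ggml_sycl_info() is seeded with just this device.
    ggml_backend_sycl_set_single_device_mode(main_gpu_id);
    return ggml_backend_sycl_init(main_gpu_id);
}
```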

GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) {
ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data);
return sycl_backend;
@@ -5159,8 +5161,7 @@ extern "C" int ggml_backend_sycl_reg_devices();

int ggml_backend_sycl_reg_devices() {
assert(ggml_sycl_info().device_count>0);
for (int i = 0; i < ggml_sycl_info().device_count; i++) {
int id = ggml_backend_sycl_get_device_id(i);
for (auto & id: ggml_sycl_info().ids) {
char name[128];
snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id);
ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(id), (void *) (intptr_t) id);