
Commit e13041c

revert qx_k
1 parent 079f16a commit e13041c

File tree

3 files changed, +19 -17 lines changed


ggml/src/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -490,7 +490,7 @@ if (GGML_SYCL)
         set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
         add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
     else()
-        add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
+        add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
     endif()
 
     file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
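
Context (not part of the diff): GGML_SYCL_WARP_SIZE is a compile-time constant the SYCL kernels use as their requested sub-group size, so the value chosen here has to be one the target device supports: 32 on the nvptx64/NVIDIA branch above, and 16 as the common choice for Intel GPUs on the else() path. A minimal standalone sketch of how such a definition is typically consumed on the C++ side, assuming a oneAPI/DPC++ toolchain and a device with USM shared allocations; hypothetical function names, not the ggml sources:

    // sketch.cpp (hypothetical): how a -DGGML_SYCL_WARP_SIZE=... definition flows
    // into launch code; the sub-group size requested at compile time must match
    // whatever the kernel body assumes.
    #include <sycl/sycl.hpp>

    #ifndef GGML_SYCL_WARP_SIZE
    #define GGML_SYCL_WARP_SIZE 16   // assumed fallback when built without the CMake definition
    #endif

    static void scale_all(sycl::queue & q, float * data, size_t n) {
        // n is assumed to be a multiple of GGML_SYCL_WARP_SIZE
        q.parallel_for(
            sycl::nd_range<1>(sycl::range<1>(n), sycl::range<1>(GGML_SYCL_WARP_SIZE)),
            [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(GGML_SYCL_WARP_SIZE)]] {
                data[it.get_global_id(0)] *= 2.0f;
            });
    }

    int main() {
        sycl::queue q;
        const size_t n = 64;                              // divisible by both 16 and 32
        float * data = sycl::malloc_shared<float>(n, q);
        for (size_t i = 0; i < n; ++i) data[i] = 1.0f;
        scale_all(q, data, n);
        q.wait();
        sycl::free(data, q);
        return 0;
    }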

ggml/src/ggml-sycl/dmmv.cpp

Lines changed: 17 additions & 15 deletions
@@ -3,6 +3,8 @@
 #include "dequantize.hpp"
 #include "presets.hpp"
 
+int constexpr QK_WARP_SIZE = 32;
+
 static void convert_f16(const void * vx, const int ib, const int iqs, dfloat2 & v){
     const sycl::half *x = (const sycl::half *)vx;
 
@@ -227,7 +229,7 @@ static void dequantize_mul_mat_vec_q2_k(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -346,7 +348,7 @@ static void dequantize_mul_mat_vec_q3_k(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -499,7 +501,7 @@ static void dequantize_mul_mat_vec_q4_k(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -633,7 +635,7 @@ static void dequantize_mul_mat_vec_q5_k(const void *__restrict__ vx,
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -748,7 +750,7 @@ static void dequantize_mul_mat_vec_q6_k(const void * __restrict__ vx, const floa
 
     // sum up partial sums and write back result
 #pragma unroll
-    for (int mask = WARP_SIZE / 2; mask > 0; mask >>= 1) {
+    for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
         tmp +=
             dpct::permute_sub_group_by_xor(item_ct1.get_sub_group(), tmp, mask);
     }
@@ -873,10 +875,10 @@ static void dequantize_mul_mat_vec_q2_K_sycl(const void *vx, const float *y,
     const int ny = 2; // very slightly faster than 1 even when K_QUANTS_PER_ITERATION = 2
     const int block_num_y = (nrows + ny - 1) / ny;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, WARP_SIZE);
+    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q2_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
@@ -889,10 +891,10 @@ static void dequantize_mul_mat_vec_q3_K_sycl(const void *vx, const float *y,
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, WARP_SIZE);
+    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q3_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
@@ -905,10 +907,10 @@ static void dequantize_mul_mat_vec_q4_K_sycl(const void *vx, const float *y,
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, WARP_SIZE);
+    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q4_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
@@ -918,10 +920,10 @@ static void dequantize_mul_mat_vec_q5_K_sycl(const void *vx, const float *y,
                                               const int nrows,
                                               dpct::queue_ptr stream) {
     GGML_ASSERT(ncols % QK_K == 0);
-    const sycl::range<3> block_dims(1, 1, WARP_SIZE);
+    const sycl::range<3> block_dims(1, 1, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(sycl::range<3>(1, 1, nrows) * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q5_k(vx, y, dst, ncols, item_ct1);
         });
 }
@@ -934,10 +936,10 @@ static void dequantize_mul_mat_vec_q6_K_sycl(const void *vx, const float *y,
     const int ny = 2 / K_QUANTS_PER_ITERATION;
     const int block_num_y = (nrows + ny - 1) / ny;
     const sycl::range<3> block_nums(1, 1, block_num_y);
-    const sycl::range<3> block_dims(1, ny, WARP_SIZE);
+    const sycl::range<3> block_dims(1, ny, QK_WARP_SIZE);
     stream->parallel_for(
         sycl::nd_range<3>(block_nums * block_dims, block_dims),
-        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(QK_WARP_SIZE)]] {
             dequantize_mul_mat_vec_q6_k(vx, y, dst, ncols, nrows, item_ct1);
         });
 }
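
Context for the hunks above (not part of the diff): dpct::permute_sub_group_by_xor is the SYCLomatic helper around the SYCL sub-group XOR shuffle, and the mask loop is a standard butterfly reduction. It only folds in every lane when its starting value matches the sub-group size the kernel was launched with, which is why the loop bound and the reqd_sub_group_size attribute now both use QK_WARP_SIZE. A minimal standalone sketch of the same pattern, using the plain SYCL 2020 sycl::permute_group_by_xor and assuming a device that supports sub-groups of size 32 and USM shared allocations:

    #include <sycl/sycl.hpp>
    #include <cstdio>

    constexpr int QK_WARP_SIZE = 32;   // same value the commit pins in dmmv.cpp

    int main() {
        sycl::queue q;
        float * out = sycl::malloc_shared<float>(1, q);

        q.parallel_for(
             sycl::nd_range<1>(sycl::range<1>(QK_WARP_SIZE), sycl::range<1>(QK_WARP_SIZE)),
             [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(QK_WARP_SIZE)]] {
                 auto sg   = it.get_sub_group();
                 float tmp = 1.0f;   // each lane contributes 1, so the full sum is QK_WARP_SIZE
                 // XOR butterfly: after log2(QK_WARP_SIZE) steps every lane holds the total.
                 // Starting the loop below the launched sub-group size would leave lanes out.
                 for (int mask = QK_WARP_SIZE / 2; mask > 0; mask >>= 1) {
                     tmp += sycl::permute_group_by_xor(sg, tmp, mask);
                 }
                 if (it.get_local_linear_id() == 0) {
                     *out = tmp;
                 }
             })
            .wait();

        std::printf("sub-group sum = %.0f (expected %d)\n", *out, QK_WARP_SIZE);
        sycl::free(out, q);
        return 0;
    }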

tests/test-backend-ops.cpp

Lines changed: 1 addition & 1 deletion
@@ -2164,7 +2164,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
         test_cases.emplace_back(new test_rms_norm(GGML_TYPE_F32, {64, 10, 10, 10}, eps));
     }
 
-    for (ggml_type type_a : base_types) {
+    for (ggml_type type_a : {GGML_TYPE_Q4_K}) {
         for (ggml_type type_b : {GGML_TYPE_F32, GGML_TYPE_F16}) {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, { 1, 1}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 1, 256, {10, 1}, {1, 1}));
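
In terms of coverage, this hunk narrows type_a from base_types (a broader type list defined earlier in the file) to GGML_TYPE_Q4_K only, so far fewer mul_mat cases are generated. A trivial standalone sketch of the enumeration pattern, with hypothetical stand-in enums rather than the real ggml_type values:

    #include <cstdio>
    #include <utility>
    #include <vector>

    enum class t { Q4_K, F32, F16 };   // stand-ins, not the real ggml_type enum

    int main() {
        std::vector<std::pair<t, t>> cases;
        for (t type_a : {t::Q4_K}) {                 // previously: every entry of base_types
            for (t type_b : {t::F32, t::F16}) {
                cases.emplace_back(type_a, type_b);  // one mul_mat test per combination
            }
        }
        std::printf("%zu type combinations\n", cases.size());
        return 0;
    }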
