Handle the case when nb11/nb10 != ne10 (src1 row stride differs from its row length)

lslusarczyk · lslusarczyk · commit f73d0ab516c4 · 2025-05-02T18:13:59.000+02:00
diff --git a/ggml/src/ggml-sycl/gemm.hpp b/ggml/src/ggml-sycl/gemm.hpp
@@ -32,16 +32,30 @@ class DnnlGemmWrapper {
         else static_assert(0);
     }
 
-    static void row_gemm(ggml_backend_sycl_context & ctx, bool a_trans, bool b_trans, int m, int n, int k,
-            const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q,
-            dnnl_dim_t batches = 1) {
+    // matrix A has m rows and k columns
+    // matrix B has k rows and n columns
+    // nra - offset, in elements, from one row of A to the next row
+    // nrb - offset, in elements, from one row of B to the next row
+    // nca - offset, in elements, from one column of A to the next column
+    // ncb - offset, in elements, from one column of B to the next column
+    // stride_a - offset, in elements, from one A matrix to the next A matrix
+    // stride_b - offset, in elements, from one B matrix to the next B matrix
+    // batches - number of A matrices, equal to the number of B matrices
+    static void gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
+        const void * a, dt at, dnnl_dim_t nra, dnnl_dim_t nca, dnnl_dim_t stride_a,
+        const void * b, dt bt, dnnl_dim_t nrb, dnnl_dim_t ncb, dnnl_dim_t stride_b,
+        void * c, dt ct, const queue_ptr & q, dnnl_dim_t batches) {
+
         auto stream = ctx.stream_dnnl(q);
         auto eng = ctx.engine_dnnl(q);
         dnnl::memory::dims a_dims = { batches, m, k };
         dnnl::memory::dims b_dims = { batches, k, n };
         dnnl::memory::dims c_dims = { batches, m, n };
-        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_trans ? tag::acb : tag::abc);
-        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_trans ? tag::acb : tag::abc);
+        dnnl::memory::dims a_strides = { stride_a, nra, nca };
+        dnnl::memory::dims b_strides = { stride_b, nrb, ncb };
+
+        const auto a_in_md = dnnl::memory::desc(a_dims, at, a_strides);
+        const auto b_in_md = dnnl::memory::desc(b_dims, bt, b_strides);
         const auto c_md    = dnnl::memory::desc(c_dims, ct, tag::abc);
 
         dnnl::primitive_attr primitive_attr;
@@ -64,6 +78,15 @@ class DnnlGemmWrapper {
 
         matmul_prim.execute(stream, matmul_args);
     }
+
+    // matrices A and B are column major, both having k rows
+    // matrix A has m columns, matrix B has n columns
+    // output: row major m x n matrix C = A transposed * B
+    static void row_gemm(ggml_backend_sycl_context & ctx, int m, int n, int k,
+        const void * a, dt at, const void * b, dt bt, void * c, dt ct, const queue_ptr & q) {
+
+        gemm(ctx, m, n, k, a, at, k, 1, k * m, b, bt, 1, k, n * k, c, ct, q, 1);
+    }
 };
 
 #endif
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -2043,7 +2043,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
         const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
         to_fp32_sycl(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream);
 #else
-        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ptr,
+        DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ptr,
                                   DnnlGemmWrapper::to_dt<sycl::half>(), src0_ptr, DnnlGemmWrapper::to_dt<sycl::half>(),
                                   dst_f16.get(), DnnlGemmWrapper::to_dt<sycl::half>(), stream);
         const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16, dst);
@@ -2077,7 +2077,7 @@ inline void ggml_sycl_op_mul_mat_sycl(
             src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10,
             dpct::get_value(&beta, *stream), dst_dd_i, ldc)));
 #else
-        DnnlGemmWrapper::row_gemm(ctx, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i,
+        DnnlGemmWrapper::row_gemm(ctx, src1_ncols, row_diff, ne10, src1_ddf1_i,
                                   DnnlGemmWrapper::to_dt<float>(), src0_ddf_i, DnnlGemmWrapper::to_dt<float>(),
                                   dst_dd_i, DnnlGemmWrapper::to_dt<float>(), stream);
 #endif
@@ -2774,14 +2774,11 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx,
 
     if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) {
         // there is no broadcast and src0, src1 are contiguous across dims 2, 3
-#ifdef GGML_SYCL_DNNL
-        // TODO: use strided dnnl::memory::desc ctor in row_gemm to relax below assertions
-        GGML_ASSERT(nb11/nb10 == ne10);
-        GGML_ASSERT(nb01/nb00 == ne00);
-
-        DnnlGemmWrapper::row_gemm(ctx, false, true, ne11, ne01, ne10, src1_f16,
-                                          DnnlGemmWrapper::to_dt<sycl::half>(), src0_as_f16, DnnlGemmWrapper::to_dt<sycl::half>(),
-                                          dst_t, DnnlGemmWrapper::to_dt<float>(), main_stream, ne23);
+#if GGML_SYCL_DNNL
+        DnnlGemmWrapper::gemm(ctx, ne11, ne01, ne10,
+            src1_f16, DnnlGemmWrapper::to_dt<sycl::half>(), nb11/nb10, 1, nb12/nb10,
+            src0_as_f16, DnnlGemmWrapper::to_dt<sycl::half>(), 1, nb01/nb00, nb02/nb00,
+            dst_t, DnnlGemmWrapper::to_dt<float>(), main_stream, ne23);
 #else
         SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch(
             *main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha,
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
@@ -3870,7 +3870,7 @@ static const ggml_type other_types[] = {
 // Test cases for evaluation: should try to cover edge cases while using small input sizes to keep the runtime low
 static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     std::vector<std::unique_ptr<test_case>> test_cases;
-    std::default_random_engine rng(0);
+    [[maybe_unused]] std::default_random_engine rng(0);
 
     // unary ops
     for (ggml_type type : {GGML_TYPE_F16, GGML_TYPE_F32}) {
@@ -4188,6 +4188,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 1, 3, 2}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16, 16, 256, {2, 3}, {1, 1}, {0, 3, 2, 1}));
 
+            test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  16, 256, {2, 1}, {1, 1}, {0, 2, 1, 3}));
+
             // test cases with large ne00/ne10 to cover stream-k fixup
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  1, 1024, {3, 2}, {1, 1}));
             test_cases.emplace_back(new test_mul_mat(type_a, type_b, 16,  8, 1024, {3, 2}, {1, 1}));