Revert "fix memcpy() crash, add missed cmd in guide, fix softmax (ggml-org#6622)"

jart · jart · commit 75ce9f9301d0 · 2024-04-20T12:02:21.000-07:00
This reverts commit de17e3f.
diff --git a/examples/sycl/build.sh b/examples/sycl/build.sh
@@ -20,4 +20,4 @@ cmake .. -DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 #cmake --build . --config Release --target llama-bench
 
 #build all binary
-cmake --build . --config Release -j -v
+cmake --build . --config Release -v
diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh
@@ -12,7 +12,6 @@ if [ $# -gt 0 ]; then
     GGML_SYCL_SINGLE_GPU=1
 else
     GGML_SYCL_DEVICE=0
-    GGML_SYCL_SINGLE_GPU=0
 fi
 
 #export GGML_SYCL_DEBUG=1
diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp
@@ -3154,6 +3154,7 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
 #define SYCL_SCALE_BLOCK_SIZE 256
 #define SYCL_CLAMP_BLOCK_SIZE 256
 #define SYCL_ROPE_BLOCK_SIZE 256
+#define SYCL_SOFT_MAX_BLOCK_SIZE 1024
 #define SYCL_ALIBI_BLOCK_SIZE 32
 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
 #define SYCL_QUANTIZE_BLOCK_SIZE 256
@@ -13079,13 +13080,11 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
                               const int nrows_y, const float scale, const float max_bias,
                               dpct::queue_ptr stream) {
     int nth = WARP_SIZE;
-    int max_block_size = g_work_group_size;
-    while (nth < ncols_x && nth < max_block_size) nth *= 2;
-    if (nth>max_block_size) nth = max_block_size;
-
+    while (nth < ncols_x && nth < SYCL_SOFT_MAX_BLOCK_SIZE) nth *= 2;
     const sycl::range<3> block_dims(1, 1, nth);
     const sycl::range<3> block_nums(1, 1, nrows_x);
     const size_t n_local_scratch = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE);
+    static_assert(SYCL_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
 
     const uint32_t n_head_kv   = nrows_x/nrows_y;
     const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
@@ -13095,12 +13094,6 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
 
     const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
     if (n_local_scratch*sizeof(float) < local_mem_size) {
-        if (ncols_x > max_block_size) {
-            soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
-                                               max_bias, m0, m1, n_head_log2, block_nums,
-                                               block_dims, n_local_scratch, stream);
-            return;
-        }
         switch (ncols_x) {
             case 32:
                 soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
@@ -16825,13 +16818,11 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     const dpct::queue_ptr stream = g_syclStreams[ctx->device][0];
     SYCL_CHECK(
         CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw()));
-    char* host_buf = (char*)malloc(size);
-    memcpy(host_buf, data, size);
+
     SYCL_CHECK(
         CHECK_TRY_ERROR((*stream)
-                             .memcpy((char *)tensor->data + offset, host_buf, size)
+                             .memcpy((char *)tensor->data + offset, data, size)
                              .wait()));
-    free(host_buf);
 }
 catch (sycl::exception const &exc) {
   std::cerr << exc.what() << "Exception caught at file:" << __FILE__