Skip to content

CLBlast support #1164

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Apr 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ endif()
option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
option(LLAMA_OPENBLAS "llama: use OpenBLAS" OFF)
option(LLAMA_CUBLAS "llama: use cuBLAS" OFF)
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)

option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
Expand Down Expand Up @@ -168,6 +169,21 @@ if (LLAMA_CUBLAS)
endif()
endif()

if (LLAMA_CLBLAST)
find_package(CLBlast)
if (CLBlast_FOUND)
message(STATUS "CLBlast found")

set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)

add_compile_definitions(GGML_USE_CLBLAST)

set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
else()
message(WARNING "CLBlast not found")
endif()
endif()

if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
Expand Down Expand Up @@ -307,7 +323,8 @@ endif()
add_library(ggml OBJECT
ggml.c
ggml.h
${GGML_CUDA_SOURCES})
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES})

target_include_directories(ggml PUBLIC .)
target_compile_features(ggml PUBLIC c_std_11) # don't bump
Expand Down
11 changes: 9 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,14 +105,21 @@ ifdef LLAMA_OPENBLAS
LDFLAGS += -lopenblas
endif
ifdef LLAMA_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
OBJS += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif
ifdef LLAMA_CLBLAST
CFLAGS += -DGGML_USE_CLBLAST
LDFLAGS += -lclblast -lOpenCL
OBJS += ggml-opencl.o
ggml-opencl.o: ggml-opencl.c ggml-opencl.h
$(CC) $(CFLAGS) -c $< -o $@
endif
ifdef LLAMA_GPROF
CFLAGS += -pg
CXXFLAGS += -pg
Expand Down
84 changes: 84 additions & 0 deletions ggml-opencl-dequant.cl
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#define MULTILINE_QUOTE(...) #__VA_ARGS__
const char * clblast_dequant = MULTILINE_QUOTE(

struct block_q4_0
{
float d;
uchar qs[16];
};

__kernel void dequantize_row_q4_0(__global struct block_q4_0* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);

const float d = blocks[i].d;

const uchar vi = blocks[i].qs[l];

const uint index = i*32 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}

struct block_q4_1
{
float d;
float m;
uchar qs[16];
};

__kernel void dequantize_row_q4_1(__global struct block_q4_1* blocks, __global float* result) {
const uint i = get_global_id(0) / 32;
const uint l = get_local_id(0);

const float d = blocks[i].d;
const float m = blocks[i].m;

const uchar vi = blocks[i].qs[l];

const uint index = i*32 + l*2;
result[index + 0] = (vi & 0xf) * d + m;
result[index + 1] = (vi >> 4) * d + m;
}

struct block_q4_2
{
ushort d;
uchar qs[8];
};

__kernel void dequantize_row_q4_2(__global struct block_q4_2* blocks, __global float* result) {
const uint i = get_global_id(0) / 16;
const uint l = get_local_id(0);

const float d = vload_half(0, (__global half*) &blocks[i].d);;

const uchar vi = blocks[i].qs[l];

const uint index = i*16 + l*2;
result[index + 0] = ((vi & 0xf) - 8)*d;
result[index + 1] = ((vi >> 4) - 8)*d;
}

struct block_q4_3
{
ushort d;
ushort m;
uchar qs[8];
};

__kernel void dequantize_row_q4_3(__global struct block_q4_3* blocks, __global float* result) {
const uint i = get_global_id(0) / 16;
const uint l = get_local_id(0);

const float d = vload_half(0, (__global half*) &(blocks[i].d));
const float m = vload_half(0, (__global half*) &(blocks[i].m));

const uchar vi = blocks[i].qs[l];

const uint index = i*16 + l*2;
result[index + 0] = (vi & 0xf) * d + m;
result[index + 1] = (vi >> 4) * d + m;
}

);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I prefer to have this inlined in ggml-opencl.c and avoid this extra file. We can do this later - it's not a problem

The way I see it is that at this point of the development we want to have as few files as possible.
It can seem like a weird constraint and requirement, but I really think that we benefit a lot when we have everything in one place. It is more difficult for a new person to understand the structure of the code, but after they get used to it, it becomes a benefit.

In the future, we will split the library in the proper source files and directory structure, but at the start I think it is a better strategy to have everything packed in one place.

216 changes: 216 additions & 0 deletions ggml-opencl.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,216 @@
#include "ggml-opencl.h"

#define CL_TARGET_OPENCL_VERSION 110
#include <clblast_c.h>

#include <stdio.h>
#include <string.h>

#include "ggml.h"

#include "ggml-opencl-dequant.cl"

#define CL_CHECK(err, name) \
do { \
cl_int err_ = (err); \
if (err_ != CL_SUCCESS) { \
fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
exit(1); \
} \
} while (0)

static cl_platform_id platform;
static cl_device_id device;
static cl_context context;
static cl_command_queue queue;
static cl_program program;
static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q4_2, kernel_q4_3;
static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;

static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
cl_program p;
char *program_log;
size_t program_size, log_size;
int err;

program_size = strlen(program_buffer);

p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
if(err < 0) {
fprintf(stderr, "OpenCL error creating program");
exit(1);
}

err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
if(err < 0) {

clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
program_log = (char*) malloc(log_size + 1);
program_log[log_size] = '\0';
clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
printf("%s\n", program_log);
free(program_log);
exit(1);
}

return p;
}

void ggml_cl_init(void) {
cl_int err = 0;
char * GGML_CLBLAST_PLATFORM = getenv("GGML_CLBLAST_PLATFORM");
char * GGML_CLBLAST_DEVICE = getenv("GGML_CLBLAST_DEVICE");
int plat_num = (GGML_CLBLAST_PLATFORM == NULL ? 0 : atoi(GGML_CLBLAST_PLATFORM));
int dev_num = (GGML_CLBLAST_DEVICE == NULL ? 0 : atoi(GGML_CLBLAST_DEVICE));
printf("\nInitializing CLBlast (First Run)...");
printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
cl_uint num_platforms;
clGetPlatformIDs(0, NULL, &num_platforms);
cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
clGetPlatformIDs(num_platforms, platforms, NULL);
platform = platforms[plat_num];
char platform_buffer[1024];
clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), &platform_buffer, NULL);
cl_uint num_devices;
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
device = devices[dev_num];
char device_buffer[1024];
clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), &device_buffer, NULL);
printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It takes 9 seconds to go from Attempting to use to this print statement on my iGPU. Is it expected, and if so, why is it so slow?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It does not take that long for me, but this depends on your OpenCL (graphics) driver.

context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
CL_CHECK(err, "clCreateContext");
queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
CL_CHECK(err, "clCreateCommandQueue");

free(platforms);
free(devices);

program = build_program_from_source(context, device, clblast_dequant);

// Prepare dequantize kernels
kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q4_2 = clCreateKernel(program, "dequantize_row_q4_2", &err);
CL_CHECK(err, "clCreateKernel");
kernel_q4_3 = clCreateKernel(program, "dequantize_row_q4_3", &err);
CL_CHECK(err, "clCreateKernel");
}

static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
if (req_size <= *cur_size) {
return;
}

// Reallocate buffer with enough space
if (*cur_size > 0) {
clReleaseMemObject(*buf);
}
cl_int err;
*buf = clCreateBuffer(context, flags, req_size, NULL, &err);
*cur_size = req_size;
CL_CHECK(err, "clCreateBuffer");
}

void ggml_cl_sgemm_wrapper(
const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
const int m, const int n, const int k,
const float alpha, const void *host_a, const int lda,
const float *host_b, const int ldb, const float beta,
float *host_c, const int ldc, const int btype) {
cl_int err = 0;

cl_kernel kernel;
size_t global = n * k, local, size_qb;
bool dequant;

switch (btype) {
case GGML_TYPE_F32:
dequant = false;
break;
case GGML_TYPE_Q4_0:
dequant = true;
kernel = kernel_q4_0;
local = 16;
size_qb = global * (sizeof(float) + local) / 32;
break;
case GGML_TYPE_Q4_1:
dequant = true;
kernel = kernel_q4_1;
local = 16;
size_qb = global * (sizeof(float) * 2 + local) / 32;
break;
case GGML_TYPE_Q4_2:
dequant = true;
kernel = kernel_q4_2;
local = 8;
size_qb = global * (sizeof(short) + local) / 16;
break;
case GGML_TYPE_Q4_3:
dequant = true;
kernel = kernel_q4_3;
local = 8;
size_qb = global * (sizeof(short) * 2 + local) / 16;
break;
default:
fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
abort();
}

const size_t size_a = m * k * sizeof(float);
const size_t size_b = n * k * sizeof(float);
const size_t size_c = m * n * sizeof(float);

// Prepare buffers
ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
if (dequant) {
ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
}
ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);

cl_event ev_a, ev_qb, ev_b;

if (dequant) {
err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
CL_CHECK(err, "clSetKernelArg");
clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
} else {
clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
}

clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
if (dequant) {
err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
CL_CHECK(err, "clEnqueueNDRangeKernel");
clReleaseEvent(ev_qb);
}
clWaitForEvents(1, &ev_a);
clWaitForEvents(1, &ev_b);
clReleaseEvent(ev_a);
clReleaseEvent(ev_b);

cl_event ev_sgemm;
CLBlastSgemm((CLBlastLayout)order,
(CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
m, n, k,
alpha,
cl_buffer_a, 0, lda,
cl_buffer_b, 0, ldb,
beta,
cl_buffer_c, 0, ldc,
&queue, &ev_sgemm);

cl_event ev_c;
clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);

// Wait for completion
clWaitForEvents(1, &ev_c);
clReleaseEvent(ev_sgemm);
clReleaseEvent(ev_c);
}
24 changes: 24 additions & 0 deletions ggml-opencl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#pragma once

#ifdef __cplusplus
extern "C" {
#endif

void ggml_cl_init(void);

enum ggml_blas_order {
GGML_BLAS_ORDER_ROW_MAJOR = 101,
GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
};

enum ggml_blas_op {
GGML_BLAS_OP_N = 111,
GGML_BLAS_OP_T = 112,
GGML_BLAS_OP_C = 113,
};

void ggml_cl_sgemm_wrapper(const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b, const int m, const int n, const int k, const float alpha, const void *host_a, const int lda, const float *host_b, const int ldb, const float beta, float *host_c, const int ldc, const int btype);

#ifdef __cplusplus
}
#endif
Loading