@@ -444,19 +444,8 @@ static ggml_backend_opencl_context * ggml_cl2_init(ggml_backend_dev_t dev) {
         backend_ctx->gpu_family = GPU_FAMILY::ADRENO;
         backend_ctx->adreno_gen = get_adreno_gpu_gen(default_device->name);
 
-        // Default wave size is 128, A8x uses 64.
-        if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A8X) {
-            backend_ctx->adreno_wave_size = 64;
-        } else if (backend_ctx->adreno_gen == ADRENO_GPU_GEN::A7X ||
-                   backend_ctx->adreno_gen == ADRENO_GPU_GEN::X1E) {
-            backend_ctx->adreno_wave_size = 128;
-        } else {
-            backend_ctx->adreno_wave_size = 128;
-            GGML_LOG_WARN("ggml_opencl: Unsupported Adreno GPU: %s, "
-                "using wave size %d, "
-                "may not work as expected\n",
-                backend_ctx->device_name.c_str(), backend_ctx->adreno_wave_size);
-        }
+        // Use wave size of 64 for all Adreno GPUs.
+        backend_ctx->adreno_wave_size = 64;
     } else if (strstr(default_device->name, "Intel")) {
         backend_ctx->gpu_family = GPU_FAMILY::INTEL;
     } else {
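
Note: a flat wave size of 64 replaces the per-generation table above. If hardcoding the value ever becomes a concern, it could in principle be queried at runtime instead; a minimal sketch, assuming a compiled `kernel` and the selected `device` are in scope (this query is standard OpenCL, not part of this patch):

    size_t preferred = 0;
    cl_int qerr = clGetKernelWorkGroupInfo(kernel, device,
        CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
        sizeof(preferred), &preferred, NULL);
    if (qerr == CL_SUCCESS && preferred > 0) {
        // The preferred multiple is typically the wave/subgroup size.
        backend_ctx->adreno_wave_size = (int) preferred;
    }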
@@ -1376,6 +1365,11 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     int M = tensor->ne[1]; // ne01
     int K = tensor->ne[0]; // ne00
 
+    // For matrix-vector multiplication kernel, we assume K is a multiple of 32
+    GGML_ASSERT(K % 32 == 0);
+    // For transpose kernels, we assume K is a multiple of 4 (satisfied by prior assert), and M is a multiple of 4
+    GGML_ASSERT(M % 4 == 0);
+
     // transpose is out of place, so we need to allocate transposed buffers
     // <----------------------------------------------------------------------------------> //
     // use sub_buffer of max buffer size instead
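
Note on the new asserts: the Adreno matrix-vector kernel consumes Q4_0 data, which packs weights in blocks of 32 (QK4_0 == 32), so K must be a multiple of 32; the fp16 transpose kernels move 4 values at a time along both axes, so M must also be a multiple of 4 (K % 4 == 0 already follows from K % 32 == 0). A hypothetical guard expressing the same constraints:

    // Sketch only: whether a tensor of shape (K, M) satisfies the layout
    // assumptions of the Adreno Q4_0 kernels (QK4_0 == 32).
    static bool adreno_q4_0_layout_ok(int M, int K) {
        return (K % 32 == 0) && (M % 4 == 0);
    }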
@@ -1416,36 +1410,36 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     cl_mem qT_d_image1D;
     cl_mem dT_d_image1D;
 
-    cl_image_format img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    cl_image_format img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     cl_image_desc img_desc_1d;
 
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 8 / 4;
+    img_desc_1d.image_width = M * K / 4 / 4;
     img_desc_1d.buffer = extra->q;
     q_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 8 / 4;
+    img_desc_1d.image_width = M * K / 4 / 4;
     img_desc_1d.buffer = qT_d;
     qT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 32 / 4 / 2;
+    img_desc_1d.image_width = M * K / 32 / 4;
     img_desc_1d.buffer = extra->d;
     d_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
 
-    img_fmt_1d = { CL_RGBA, CL_FLOAT };
+    img_fmt_1d = { CL_RGBA, CL_HALF_FLOAT };
     memset(&img_desc_1d, 0, sizeof(img_desc_1d));
     img_desc_1d.image_type = CL_MEM_OBJECT_IMAGE1D_BUFFER;
-    img_desc_1d.image_width = M * K / 32 / 4 / 2;
+    img_desc_1d.image_width = M * K / 32 / 4;
     img_desc_1d.buffer = dT_d;
     dT_d_image1D = clCreateImage(context, 0, &img_fmt_1d, &img_desc_1d, NULL, &err);
     CL_CHECK(err);
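
Why the formats and widths change: with { CL_RGBA, CL_HALF_FLOAT } each texel holds four fp16 values (8 bytes), whereas { CL_RGBA, CL_FLOAT } held four fp32 values (16 bytes). The element counts are fixed by Q4_0: M*K 4-bit quants occupy M*K/2 bytes (the size of M*K/4 fp16 values), and there is one fp16 scale per 32 weights. A worked check of the texel counts (illustrative arithmetic, not part of the patch):

    // quants: M*K/2 bytes / 8 bytes-per-texel      = M*K/16  = M * K / 4 / 4
    // scales: (M*K/32)*2 bytes / 8 bytes-per-texel = M*K/128 = M * K / 32 / 4
    // e.g. M = K = 4096: 1048576 quant texels, 131072 scale texels
    size_t q_texels = (size_t) M * K / 4 / 4;
    size_t d_texels = (size_t) M * K / 32 / 4;

The extra factor of two in the old widths was compensating for the 16-byte fp32 texel; switching to fp16 texels removes it.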
@@ -1454,8 +1448,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     // set up and call the transpose kernels
     // <----------------------------------------------------------------------------------> //
     // weights
-    int height_q = M / 8;
-    int width_q = K / 8 / 4;
+    int height_q = M / 4;
+    int width_q = K / 4 / 4;
     kernel = backend_ctx->kernel_transpose_16;
 
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &q_d_image1D));
@@ -1469,8 +1463,8 @@ static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer,
     CL_CHECK(clWaitForEvents(1, &evt));
 
     // scales
-    int height_s = M / 8;
-    int width_s = K / 32 / 8;
+    int height_s = M / 4;
+    int width_s = K / 32 / 4;
 
     kernel = backend_ctx->kernel_transpose_16;
     CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_d_image1D));
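
The transpose launch grids shrink in step with the texel change: both grids appear sized so that each work-item of kernel_transpose_16 covers four texels, i.e. a 4x4 block of fp16 values. A quick consistency check against the image sizes above (illustrative):

    // weights: (M/4) * (K/4/4)  work-items * 4 texels each = M*K/16  texels
    // scales:  (M/4) * (K/32/4) work-items * 4 texels each = M*K/128 texels
    // e.g. M = K = 4096: height_q = 1024, width_q = 256 -> 262144 items
    //                    height_s = 1024, width_s = 32  ->  32768 items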
@@ -1864,7 +1858,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
     void * buf_d;
 #endif
 
-#ifdef GGML_USE_OPENCL
     // Make sure everything is done.
     CL_CHECK(clFinish(queue));
 
@@ -1900,7 +1893,6 @@ static void dump_tensor(ggml_backend_t backend, const struct ggml_tensor * tenso
         extra->offset, ggml_nbytes(tensor), buf, 0, NULL, NULL));
     CL_CHECK(clFinish(queue));
 #endif // GGML_OPENCL_SOA_Q
-#endif // GGML_USE_OPENCL
 
     // Open file and dump.
     char fname[512];
@@ -2865,6 +2857,9 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
     CL_CHECK(status);
 
     int height_B = N/4;
+    if (height_B == 0) {
+        height_B = 1;
+    }
     int width_B = K/4;
     int padded_height_B = (N + padding)/4;
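
The new clamp guards small batches: for single-token decode N == 1, so the integer division N/4 yields 0, and clCreateImage would then be asked for a zero-height image, which fails. A hypothetical general form of the same guard:

    // Sketch: never let a computed image dimension reach zero.
    static inline int at_least_one(int d) { return d > 0 ? d : 1; }
    // e.g. int height_B = at_least_one(N / 4);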
@@ -3013,11 +3008,12 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         }
 
         if (N == 1) {
-            local_work_size[0] = backend_ctx->adreno_wave_size; // localsize
+            size_t wavesize = backend_ctx->adreno_wave_size;
+            local_work_size[0] = wavesize; // localsize
             local_work_size[1] = 4; // reduce factor
             local_work_size[2] = 1;
 
-            global_work_size[0] = M / 2;
+            global_work_size[0] = (((M / 2) + wavesize - 1) / wavesize) * wavesize;
             global_work_size[1] = 4; // reduce factor
             global_work_size[2] = 1;
         }
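
Rounding global_work_size[0] up to a multiple of the wave size keeps the global size divisible by the local size, which OpenCL requires when non-uniform work-groups are unavailable; the bare M / 2 could violate that for shapes where M / 2 is not a multiple of the wave size. The idiom, as a hypothetical helper with a worked example:

    static inline size_t round_up(size_t n, size_t m) {
        // Smallest multiple of m that is >= n.
        return ((n + m - 1) / m) * m;
    }
    // e.g. M = 1000: M/2 = 500, round_up(500, 64) = 512.
    // The kernel must bounds-check the padded work-items this launches.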