14
14
#define GGML_SYCL_VECDOTQ_HPP
15
15
16
16
#include " dpct/helper.hpp"
17
- #include " syclcompat/math.hpp"
18
17
19
18
typedef float (*vec_dot_q_sycl_t )(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs);
20
19
@@ -90,14 +89,14 @@ static __dpct_inline__ float vec_dot_q2_K_q8_1_impl_mmvq(
90
89
const int vi = (v >> (2 *i)) & 0x03030303 ;
91
90
92
91
sumf_d +=
93
- d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); // SIMD dot product
92
+ d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * (sc & 0xF )); // SIMD dot product
94
93
95
94
// fill int with 4x m
96
95
int m = sc >> 4 ;
97
96
m |= m << 8 ;
98
97
m |= m << 16 ;
99
98
sumf_m += d8[i] *
100
- syclcompat ::dp4a (
99
+ dpct ::dp4a (
101
100
m, u[i],
102
101
0 ); // multiply constant q2_K part with sum of q8_1 values
103
102
}
@@ -140,7 +139,7 @@ static __dpct_inline__ float vec_dot_q3_K_q8_1_impl_mmvq(
140
139
const int vi =
141
140
dpct::vectorized_binary<sycl::char4>(vil, vih, dpct::sub_sat ());
142
141
143
- sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
142
+ sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
144
143
}
145
144
146
145
return d3 * sumf;
@@ -163,11 +162,11 @@ static __dpct_inline__ float vec_dot_q4_K_q8_1_impl_vmmq(
163
162
const int v1i = (v[1 ] >> (4 *i)) & 0x0F0F0F0F ;
164
163
165
164
const int dot1 =
166
- syclcompat ::dp4a (v1i, u[2 * i + 1 ],
167
- syclcompat ::dp4a (v0i, u[2 * i + 0 ], 0 )); // SIMD dot product
165
+ dpct ::dp4a (v1i, u[2 * i + 1 ],
166
+ dpct ::dp4a (v0i, u[2 * i + 0 ], 0 )); // SIMD dot product
168
167
const int dot2 =
169
- syclcompat ::dp4a (0x01010101 , u[2 * i + 1 ],
170
- syclcompat ::dp4a (0x01010101 , u[2 * i + 0 ], 0 )); // sum of u
168
+ dpct ::dp4a (0x01010101 , u[2 * i + 1 ],
169
+ dpct ::dp4a (0x01010101 , u[2 * i + 0 ], 0 )); // sum of u
171
170
172
171
sumf_d += d8[i] * (dot1 * sc[i]);
173
172
sumf_m += d8[i] * (dot2 * m[i]); // multiply constant part of q4_K with sum of q8_1 values
@@ -204,11 +203,11 @@ static __dpct_inline__ float vec_dot_q5_K_q8_1_impl_vmmq(
204
203
const int v1i = vl1i | vh1i;
205
204
206
205
const int dot1 =
207
- syclcompat ::dp4a (v0i, u[2 * i + 0 ],
208
- syclcompat ::dp4a (v1i, u[2 * i + 1 ], 0 )); // SIMD dot product
206
+ dpct ::dp4a (v0i, u[2 * i + 0 ],
207
+ dpct ::dp4a (v1i, u[2 * i + 1 ], 0 )); // SIMD dot product
209
208
const int dot2 =
210
- syclcompat ::dp4a (0x01010101 , u[2 * i + 0 ],
211
- syclcompat ::dp4a (0x01010101 , u[2 * i + 1 ], 0 )); // sum of u
209
+ dpct ::dp4a (0x01010101 , u[2 * i + 0 ],
210
+ dpct ::dp4a (0x01010101 , u[2 * i + 1 ], 0 )); // sum of u
212
211
213
212
sumf_d += d8[i] * (dot1 * sc[i]);
214
213
sumf_m += d8[i] * (dot2 * m[i]);
@@ -244,7 +243,7 @@ vec_dot_q6_K_q8_1_impl_mmvq(const int &vl, const int &vh,
244
243
const int vi = dpct::vectorized_binary<sycl::char4>(
245
244
(vil | vih), 0x20202020 , dpct::sub_sat ()); // vi = (vil | vih) - 32
246
245
247
- sumf += d8[i] * (syclcompat ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
246
+ sumf += d8[i] * (dpct ::dp4a (vi, u[i], 0 ) * sc); // SIMD dot product
248
247
}
249
248
250
249
return d*sumf;
@@ -267,8 +266,8 @@ static __dpct_inline__ float vec_dot_q4_0_q8_1_impl(const int *v, const int *u,
267
266
const int vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
268
267
269
268
// SIMD dot product of quantized values
270
- sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ], sumi);
271
- sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ], sumi);
269
+ sumi = dpct ::dp4a (vi0, u[2 * i + 0 ], sumi);
270
+ sumi = dpct ::dp4a (vi1, u[2 * i + 1 ], sumi);
272
271
}
273
272
274
273
const sycl::float2 ds8f =
@@ -294,8 +293,8 @@ static __dpct_inline__ float vec_dot_q4_1_q8_1_impl(const int *v, const int *u,
294
293
const int vi1 = (v[i] >> 4 ) & 0x0F0F0F0F ;
295
294
296
295
// SIMD dot product of quantized values
297
- sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ], sumi);
298
- sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ], sumi);
296
+ sumi = dpct ::dp4a (vi0, u[2 * i + 0 ], sumi);
297
+ sumi = dpct ::dp4a (vi1, u[2 * i + 1 ], sumi);
299
298
}
300
299
301
300
#ifdef GGML_SYCL_F16
@@ -332,15 +331,15 @@ vec_dot_q5_0_q8_1_impl(const int *vl, const int *vh, const int *u,
332
331
vi0 |= (vh[i] << 11 ) & 0x00001000 ; // 1 -> 12
333
332
vi0 |= (vh[i] << 18 ) & 0x00100000 ; // 2 -> 20
334
333
vi0 |= (vh[i] << 25 ) & 0x10000000 ; // 3 -> 28
335
- sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ],
334
+ sumi = dpct ::dp4a (vi0, u[2 * i + 0 ],
336
335
sumi); // SIMD dot product of quantized values
337
336
338
337
int vi1 = (vl[i] >> 4 ) & 0x0F0F0F0F ; // upper 4 qs bits, still need qh as 5th bits
339
338
vi1 |= (vh[i] >> 12 ) & 0x00000010 ; // 16 -> 4
340
339
vi1 |= (vh[i] >> 5 ) & 0x00001000 ; // 17 -> 12
341
340
vi1 |= (vh[i] << 2 ) & 0x00100000 ; // 18 -> 20
342
341
vi1 |= (vh[i] << 9 ) & 0x10000000 ; // 19 -> 28
343
- sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ],
342
+ sumi = dpct ::dp4a (vi1, u[2 * i + 1 ],
344
343
sumi); // SIMD dot product of quantized values
345
344
}
346
345
@@ -368,15 +367,15 @@ vec_dot_q5_1_q8_1_impl(const int *vl, const int *vh, const int *u,
368
367
vi0 |= (vh[i] << 11 ) & 0x00001000 ; // 1 -> 12
369
368
vi0 |= (vh[i] << 18 ) & 0x00100000 ; // 2 -> 20
370
369
vi0 |= (vh[i] << 25 ) & 0x10000000 ; // 3 -> 28
371
- sumi = syclcompat ::dp4a (vi0, u[2 * i + 0 ],
370
+ sumi = dpct ::dp4a (vi0, u[2 * i + 0 ],
372
371
sumi); // SIMD dot product of quantized values
373
372
374
373
int vi1 = (vl[i] >> 4 ) & 0x0F0F0F0F ; // upper 4 qs bits, still need qh as 5th bits
375
374
vi1 |= (vh[i] >> 12 ) & 0x00000010 ; // 16 -> 4
376
375
vi1 |= (vh[i] >> 5 ) & 0x00001000 ; // 17 -> 12
377
376
vi1 |= (vh[i] << 2 ) & 0x00100000 ; // 18 -> 20
378
377
vi1 |= (vh[i] << 9 ) & 0x10000000 ; // 19 -> 28
379
- sumi = syclcompat ::dp4a (vi1, u[2 * i + 1 ],
378
+ sumi = dpct ::dp4a (vi1, u[2 * i + 1 ],
380
379
sumi); // SIMD dot product of quantized values
381
380
}
382
381
@@ -413,7 +412,7 @@ static __dpct_inline__ float vec_dot_q8_0_q8_1_impl(const int *v, const int *u,
413
412
#pragma unroll
414
413
for (int i = 0 ; i < vdr; ++i) {
415
414
// SIMD dot product of quantized values
416
- sumi = syclcompat ::dp4a (v[i], u[i], sumi);
415
+ sumi = dpct ::dp4a (v[i], u[i], sumi);
417
416
}
418
417
419
418
return d8_0*d8_1 * sumi;
@@ -429,7 +428,7 @@ static __dpct_inline__ float vec_dot_q8_1_q8_1_impl(const int *v, const int *u,
429
428
#pragma unroll
430
429
for (int i = 0 ; i < vdr; ++i) {
431
430
// SIMD dot product of quantized values
432
- sumi = syclcompat ::dp4a (v[i], u[i], sumi);
431
+ sumi = dpct ::dp4a (v[i], u[i], sumi);
433
432
}
434
433
435
434
#ifdef GGML_SYCL_F16
@@ -678,10 +677,10 @@ vec_dot_q4_K_q8_1(const void *__restrict__ vbq,
678
677
const int v1 = q4[0 ];
679
678
const int v2 = q4[4 ];
680
679
681
- const int dot1 = syclcompat ::dp4a (ui2, v2 & 0x0f0f0f0f , syclcompat ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
682
- const int dot2 = syclcompat ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , syclcompat ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
683
- const int dot3 = syclcompat ::dp4a (0x01010101 , ui2, syclcompat ::dp4a (0x01010101 , ui1, 0 ));
684
- const int dot4 = syclcompat ::dp4a (0x01010101 , ui4, syclcompat ::dp4a (0x01010101 , ui3, 0 ));
680
+ const int dot1 = dpct ::dp4a (ui2, v2 & 0x0f0f0f0f , dpct ::dp4a (ui1, v1 & 0x0f0f0f0f , 0 ));
681
+ const int dot2 = dpct ::dp4a (ui4, (v2 >> 4 ) & 0x0f0f0f0f , dpct ::dp4a (ui3, (v1 >> 4 ) & 0x0f0f0f0f , 0 ));
682
+ const int dot3 = dpct ::dp4a (0x01010101 , ui2, dpct ::dp4a (0x01010101 , ui1, 0 ));
683
+ const int dot4 = dpct ::dp4a (0x01010101 , ui4, dpct ::dp4a (0x01010101 , ui3, 0 ));
685
684
686
685
sumf_d += d8_1 * (dot1 * s[0 ]) + d8_2 * (dot2 * s[1 ]);
687
686
sumf_m += d8_1 * (dot3 * s[2 ]) + d8_2 * (dot4 * s[3 ]);
@@ -773,8 +772,8 @@ vec_dot_q5_K_q8_1(const void *__restrict__ vbq,
773
772
const int v3 = (((vh >> 0 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl1 >> 4 ) & 0x0f0f0f0f );
774
773
const int v4 = (((vh >> 2 ) & 0x10101010 ) ^ 0x10101010 ) | ((vl2 >> 4 ) & 0x0f0f0f0f );
775
774
776
- const float sumf_d = d8_1 * (syclcompat ::dp4a (ui1, v1, 0 ) * s[0 ] + syclcompat ::dp4a (ui2, v2, 0 ) * s[1 ])
777
- + d8_2 * (syclcompat ::dp4a (ui3, v3, 0 ) * s[2 ] + syclcompat ::dp4a (ui4, v4, 0 ) * s[3 ]);
775
+ const float sumf_d = d8_1 * (dpct ::dp4a (ui1, v1, 0 ) * s[0 ] + dpct ::dp4a (ui2, v2, 0 ) * s[1 ])
776
+ + d8_2 * (dpct ::dp4a (ui3, v3, 0 ) * s[2 ] + dpct ::dp4a (ui4, v4, 0 ) * s[3 ]);
778
777
779
778
return d * sumf_d;
780
779
@@ -866,8 +865,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
866
865
grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
867
866
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
868
867
grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
869
- sumi1 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
870
- sumi1 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
868
+ sumi1 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
869
+ sumi1 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
871
870
q8 += 8 ;
872
871
}
873
872
int sumi2 = 0 ;
@@ -878,8 +877,8 @@ vec_dot_iq2_xs_q8_1(const void *__restrict__ vbq,
878
877
grid[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
879
878
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
880
879
grid[1 ] ^ signs[1 ], signs[1 ], std::minus<>());
881
- sumi2 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
882
- sumi2 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
880
+ sumi2 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
881
+ sumi2 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
883
882
q8 += 8 ;
884
883
}
885
884
const float d = (float )bq2->d * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -918,8 +917,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
918
917
grid[0 ] ^ signs0, signs0, std::minus<>());
919
918
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
920
919
grid[1 ] ^ signs1, signs1, std::minus<>());
921
- sumi1 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
922
- sumi1 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
920
+ sumi1 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi1);
921
+ sumi1 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi1);
923
922
q8 += 8 ;
924
923
}
925
924
int sumi2 = 0 ;
@@ -935,8 +934,8 @@ vec_dot_iq2_s_q8_1(const void *__restrict__ vbq,
935
934
grid[0 ] ^ signs0, signs0, std::minus<>());
936
935
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
937
936
grid[1 ] ^ signs1, signs1, std::minus<>());
938
- sumi2 = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
939
- sumi2 = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
937
+ sumi2 = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi2);
938
+ sumi2 = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi2);
940
939
q8 += 8 ;
941
940
}
942
941
const float d = (float )bq2->d * bq8_1[ib32].ds [0 ] * 0 .25f ;
@@ -969,8 +968,8 @@ vec_dot_iq3_xxs_q8_1(const void *__restrict__ vbq,
969
968
grid1[0 ] ^ signs[0 ], signs[0 ], std::minus<>());
970
969
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
971
970
grid2[0 ] ^ signs[1 ], signs[1 ], std::minus<>());
972
- sumi = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi);
973
- sumi = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi);
971
+ sumi = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi);
972
+ sumi = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi);
974
973
q8 += 8 ;
975
974
aux32 >>= 7 ;
976
975
}
@@ -1010,8 +1009,8 @@ vec_dot_iq3_s_q8_1(const void *__restrict__ vbq,
1010
1009
grid1[0 ] ^ signs0, signs0, std::minus<>());
1011
1010
const int grid_h = dpct::vectorized_binary<sycl::uchar4>(
1012
1011
grid2[0 ] ^ signs1, signs1, std::minus<>());
1013
- sumi = syclcompat ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi);
1014
- sumi = syclcompat ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi);
1012
+ sumi = dpct ::dp4a (grid_l, *((const int *)q8 + 0 ), sumi);
1013
+ sumi = dpct ::dp4a (grid_h, *((const int *)q8 + 1 ), sumi);
1015
1014
q8 += 8 ;
1016
1015
}
1017
1016
const float d =
@@ -1038,8 +1037,8 @@ vec_dot_iq1_s_q8_1(const void *__restrict__ vbq,
1038
1037
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [ib32] >> 3 *l) & 7 ) << 8 )));
1039
1038
int grid0 = grid[0 ] & 0x0f0f0f0f ;
1040
1039
int grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1041
- sumi = syclcompat ::dp4a (q8[2 * l + 1 ], grid1,
1042
- syclcompat ::dp4a (q8[2 * l + 0 ], grid0, sumi));
1040
+ sumi = dpct ::dp4a (q8[2 * l + 1 ], grid1,
1041
+ dpct ::dp4a (q8[2 * l + 0 ], grid0, sumi));
1043
1042
}
1044
1043
1045
1044
const float delta = bq1->qh [ib32] & 0x8000 ? -1 -IQ1S_DELTA : -1 +IQ1S_DELTA;
@@ -1067,11 +1066,11 @@ vec_dot_iq1_m_q8_1(const void *__restrict__ vbq,
1067
1066
const int * grid = (const int *)(iq1s_grid_gpu + (bq1->qs [4 *ib32+l] | (((bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 7 ) << 8 )));
1068
1067
int grid0 = grid[0 ] & 0x0f0f0f0f ;
1069
1068
int grid1 = (grid[0 ] >> 4 ) & 0x0f0f0f0f ;
1070
- sumi[l / 2 ] = syclcompat ::dp4a (q8[2 * l + 1 ], grid1,
1071
- syclcompat ::dp4a (q8[2 * l + 0 ], grid0, sumi[l / 2 ]));
1069
+ sumi[l / 2 ] = dpct ::dp4a (q8[2 * l + 1 ], grid1,
1070
+ dpct ::dp4a (q8[2 * l + 0 ], grid0, sumi[l / 2 ]));
1072
1071
const float delta = (bq1->qh [2 *ib32+l/2 ] >> 4 *(l%2 )) & 0x08 ? -1 -IQ1M_DELTA : -1 +IQ1M_DELTA;
1073
- const int sumy = syclcompat ::dp4a (q8[2 * l + 1 ], 0x01010101 ,
1074
- syclcompat ::dp4a (q8[2 * l + 0 ], 0x01010101 , 0 ));
1072
+ const int sumy = dpct ::dp4a (q8[2 * l + 1 ], 0x01010101 ,
1073
+ dpct ::dp4a (q8[2 * l + 0 ], 0x01010101 , 0 ));
1075
1074
sumf[l/2 ] += delta*sumy;
1076
1075
}
1077
1076
@@ -1102,8 +1101,8 @@ vec_dot_iq4_nl_q8_1(const void *__restrict__ vbq,
1102
1101
for (int l = 0 ; l < VDR_Q4_0_Q8_1_MMVQ; ++l) {
1103
1102
const uint32_t aux = q4[2 *l] | (q4[2 *l+1 ] << 16 );
1104
1103
get_int_from_table_16 (aux, values, v1, v2);
1105
- sumi1 = syclcompat ::dp4a (v1, q8[l + 0 ], sumi1);
1106
- sumi2 = syclcompat ::dp4a (v2, q8[l + 4 ], sumi2);
1104
+ sumi1 = dpct ::dp4a (v1, q8[l + 0 ], sumi1);
1105
+ sumi2 = dpct ::dp4a (v2, q8[l + 4 ], sumi2);
1107
1106
}
1108
1107
1109
1108
const float d = (float )bq->d * bq8_1->ds [0 ];
@@ -1129,8 +1128,8 @@ vec_dot_iq4_xs_q8_1(const void *__restrict__ vbq,
1129
1128
int sumi1 = 0 , sumi2 = 0 ;
1130
1129
for (int j = 0 ; j < 4 ; ++j) {
1131
1130
get_int_from_table_16 (q4[j], values, v1, v2);
1132
- sumi1 = syclcompat ::dp4a (v1, q8[j + 0 ], sumi1);
1133
- sumi2 = syclcompat ::dp4a (v2, q8[j + 4 ], sumi2);
1131
+ sumi1 = dpct ::dp4a (v1, q8[j + 0 ], sumi1);
1132
+ sumi2 = dpct ::dp4a (v2, q8[j + 4 ], sumi2);
1134
1133
}
1135
1134
return d * (sumi1 + sumi2);
1136
1135
#else
0 commit comments