@@ -163,21 +163,31 @@ typedef float2 dfloat2;
 #endif // GGML_CUDA_F16

 static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    // const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    x8 += sizeof(int) * i32;

     int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
+    // x32 |= x16[0] <<  0;
+    // x32 |= x16[1] << 16;
+    x32 |= ((uint32_t)(x8[0])) <<  0;
+    x32 |= ((uint32_t)(x8[1])) <<  8;
+    x32 |= ((uint32_t)(x8[2])) << 16;
+    x32 |= ((uint32_t)(x8[3])) << 24;

     return x32;
 }

 static __device__ __forceinline__ int get_int_from_uint8(const uint8_t * x8, const int & i32) {
-    const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    // const uint16_t * x16 = (uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment
+    x8 += sizeof(int) * i32;

     int x32 = 0;
-    x32 |= x16[0] <<  0;
-    x32 |= x16[1] << 16;
+    // x32 |= x16[0] <<  0;
+    // x32 |= x16[1] << 16;
+    x32 |= ((uint32_t)(x8[0])) <<  0;
+    x32 |= ((uint32_t)(x8[1])) <<  8;
+    x32 |= ((uint32_t)(x8[2])) << 16;
+    x32 |= ((uint32_t)(x8[3])) << 24;

     return x32;
 }
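This hunk drops the 2-byte-alignment assumption: instead of two `uint16_t` loads, the 32-bit value is now assembled from four single-byte loads OR'd together, which is valid at any address. The later hunks build on this by swapping `get_int_from_uint8_aligned` for this unaligned variant in the q4_1/q5_1 paths. A minimal host-side sketch of the equivalence (invented names, not part of the patch):

// Host-side sketch (illustrative only): byte-wise assembly never
// dereferences the pointer wider than 1 byte, so it works at any
// offset, where a uint16_t load at an odd address would be misaligned.
#include <cstdint>
#include <cstdio>

static int int_from_bytes(const uint8_t * x8) {
    int x32 = 0;
    x32 |= ((uint32_t)(x8[0])) <<  0; // little-endian: lowest byte first
    x32 |= ((uint32_t)(x8[1])) <<  8;
    x32 |= ((uint32_t)(x8[2])) << 16;
    x32 |= ((uint32_t)(x8[3])) << 24;
    return x32;
}

int main() {
    const uint8_t buf[8] = {0x78, 0x56, 0x34, 0x12, 0xEF, 0xBE, 0xAD, 0xDE};
    printf("%08x\n", (unsigned) int_from_bytes(buf));     // 12345678 (aligned)
    printf("%08x\n", (unsigned) int_from_bytes(buf + 1)); // ef123456 (odd offset, still fine)
    return 0;
}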
@@ -2093,7 +2103,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbx;

         x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
-        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = bxi->d;
+        // x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbx] = Q4_0D(bxi->d);
     }

     const int blocks_per_tile_x_row = WARP_SIZE / QI4_0;
@@ -2109,7 +2119,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

         const block_q4_0 * bxi = bx0 + i*blocks_per_row + kbxd;

-        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = bxi->d;
+        x_dmf[i * (WARP_SIZE/QI4_0) + i / QI4_0 + kbxd] = Q4_0D(bxi->d);
     }
 }
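`Q4_0D` is not defined anywhere in this diff. Since `x_dmf` is a float tile while `block_q4_0` stores its scale `d` in half precision, the macro presumably performs that conversion; stated purely as an assumption:

// Hypothetical definition, NOT shown in this diff (the fork may differ);
// on a plain-float build it would presumably collapse to a no-op.
#include <cuda_fp16.h>
#define Q4_0D(x) __half2float(x) // half-precision scale -> float tile entry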
@@ -2143,15 +2153,15 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(

 #pragma unroll
     for (int i = 0; i < VDR_Q4_1_Q8_1_MMVQ; ++i) {
-        v[i]     = get_int_from_uint8_aligned(bq4_1->qs, iqs + i);
+        v[i]     = get_int_from_uint8(bq4_1->qs, iqs + i);
         u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
         u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI4_1);
     }

     const float d = Q4_1D(bq4_1->dm);
     const float m = Q4_1M(bq4_1->dm);

-    const float2 dm = {d, m};
+    const half2 dm = {d, m};

     return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, dm, bq8_1->ds);
 }
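In this q4_1 path, `d` and `m` remain `float`, so `const half2 dm = {d, m};` relies on the implicit float-to-half conversion in the initializer. A more explicit device-side equivalent, assuming the standard cuda_fp16.h intrinsics (sketch, not from the patch):

#include <cuda_fp16.h>

// Sketch only: same value as {d, m}, with the float -> half rounding
// spelled out. __floats2half2_rn puts d in the low half (.x) and m in
// the high half (.y).
static __device__ half2 make_dm(const float d, const float m) {
    return __floats2half2_rn(d, m);
}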
@@ -2189,7 +2199,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

         const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbx;

-        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8_aligned(bxi->qs, kqsx);
+        x_ql[i * (WARP_SIZE + 1) + k] = get_int_from_uint8(bxi->qs, kqsx);
     }

     const int blocks_per_tile_x_row = WARP_SIZE / QI4_1;
@@ -2205,7 +2215,8 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

         const block_q4_1 * bxi = bx0 + i*blocks_per_row + kbxd;

-        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd] = bxi->dm;
+        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd].x = Q4_1D(bxi->dm);
+        x_dm[i * (WARP_SIZE/QI4_1) + i / QI4_1 + kbxd].y = Q4_1M(bxi->dm);
     }
 }
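Instead of copying the packed `dm` field wholesale, the tile write now unpacks the scale and the minimum through `Q4_1D`/`Q4_1M` and stores them component-wise, which stays valid even if `block_q4_1::dm` and the `x_dm` element type diverge between builds; the q5_1 loader in the final hunk below gets the identical treatment. The accessors are not defined in this diff; on a packed-half2 build they would plausibly be plain component selects:

// Hypothetical accessors, NOT shown in this diff:
#define Q4_1D(dm) ((dm).x) // per-block scale d
#define Q4_1M(dm) ((dm).y) // per-block minimum m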
@@ -2353,16 +2364,16 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(

 #pragma unroll
     for (int i = 0; i < VDR_Q5_1_Q8_1_MMVQ; ++i) {
-        vl[i] = get_int_from_uint8_aligned(bq5_1->qs, iqs + i);
-        vh[i] = get_int_from_uint8_aligned(bq5_1->qh, 0) >> (4 * (iqs + i));
+        vl[i] = get_int_from_uint8(bq5_1->qs, iqs + i);
+        vh[i] = get_int_from_uint8(bq5_1->qh, 0) >> (4 * (iqs + i));
         u[2*i+0] = get_int_from_int8_aligned(bq8_1->qs, iqs + i);
         u[2*i+1] = get_int_from_int8_aligned(bq8_1->qs, iqs + i + QI5_1);
     }

-    const float d = Q5_1D(bq4_1->dm);
-    const float m = Q5_1M(bq4_1->dm);
+    const half d = Q5_1D(bq5_1->dm);
+    const half m = Q5_1M(bq5_1->dm);

-    const float2 dm = {d, m};
+    const half2 dm = {d, m};

     return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, dm, bq8_1->ds);
 }
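Besides switching to the unaligned loaders, this hunk fixes what looks like a copy-paste slip: the old lines read `bq4_1->dm` inside `vec_dot_q5_1_q8_1`. It also keeps `d` and `m` in `half`, so constructing `dm` no longer widens to float first. A sketch of the two construction paths, assuming the standard cuda_fp16.h intrinsics (illustrative only):

#include <cuda_fp16.h>

// Old shape (sketch): extract as float, then round both back down.
static __device__ half2 dm_via_float(const half d, const half m) {
    return __floats2half2_rn(__half2float(d), __half2float(m));
}

// New shape (sketch): stay in half throughout.
static __device__ half2 dm_direct(const half d, const half m) {
    return __halves2half2(d, m);
}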
@@ -2400,8 +2411,8 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

         const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbx;

-        const int ql = get_int_from_uint8_aligned(bxi->qs, kqsx);
-        const int qh = get_int_from_uint8_aligned(bxi->qh, 0) >> (4 * (k % QI5_1));
+        const int ql = get_int_from_uint8(bxi->qs, kqsx);
+        const int qh = get_int_from_uint8(bxi->qh, 0) >> (4 * (k % QI5_1));

         int qs0 = (ql >>  0) & 0x0F0F0F0F;
         qs0    |= (qh <<  4) & 0x00000010; // 0 -> 4
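The context lines at the end of this hunk show how the q5_1 tile loader rebuilds 5-bit values: `ql` carries the low nibbles of four packed values, `qh` carries their fifth bits, and the masked shift drops bit 0 of `qh` into bit 4 of byte 0 of `qs0` (hence the `// 0 -> 4` comment). A worked host-side instance with invented inputs:

#include <cstdio>

int main() {
    const int ql = 0x0000000F;         // byte 0 holds nibble value 15
    const int qh = 0x00000001;         // the matching 5th bit is set
    int qs0 = (ql >>  0) & 0x0F0F0F0F; // keep the low nibbles
    qs0    |= (qh <<  4) & 0x00000010; // 0 -> 4: qh bit 0 lands in bit 4
    printf("%d\n", qs0 & 0xFF);        // prints 31 = 15 + 16
    return 0;
}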
@@ -2433,7 +2444,8 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin

         const block_q5_1 * bxi = bx0 + i*blocks_per_row + kbxd;

-        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd] = bxi->dm;
+        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd].x = Q5_1D(bxi->dm);
+        x_dm[i * (WARP_SIZE/QI5_1) + i / QI5_1 + kbxd].y = Q5_1M(bxi->dm);
     }
 }