@@ -1001,10 +1001,15 @@ void ggml_gemv_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
        vfloat32m1_t sumf = __riscv_vfmv_v_f_f32m1(0.0, vl / 4);
        for (int l = 0; l < nb; l++) {
-           const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[0], 0, vl / 4));
-           const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[8], 0, vl / 4));
-           const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[16], 0, vl / 4));
-           const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[24], 0, vl / 4));
+           const int64_t a0 = *(const int64_t *)&a_ptr[l].qs[0];
+           const int64_t a1 = *(const int64_t *)&a_ptr[l].qs[8];
+           const int64_t a2 = *(const int64_t *)&a_ptr[l].qs[16];
+           const int64_t a3 = *(const int64_t *)&a_ptr[l].qs[24];
+           __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment
+           const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a0, vl / 4));
+           const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a1, vl / 4));
+           const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a2, vl / 4));
+           const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a3, vl / 4));


            const vint8m4_t rhs_raw_vec = __riscv_vle8_v_i8m4((const int8_t *)b_ptr[l].qs, vl * 4);
            const vint8m4_t rhs_vec_lo = __riscv_vsra_vx_i8m4(__riscv_vsll_vx_i8m4(rhs_raw_vec, 4, vl * 4), 4, vl * 4);
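Every hunk in this diff applies the same transformation: a stride-0 vlse64 broadcast (which loads the same 8 bytes into every 64-bit element, but formally requires 8-byte alignment of qs) is replaced by a plain scalar int64_t load followed by a vmv.v.x splat, with an empty asm "memory" clobber in between so GCC cannot re-fuse the scalar loads into a strided vlse64. A minimal sketch of the pattern follows; the helper name load_row_i8m2 and the memcpy spelling of the unaligned read are illustrative assumptions, not from the patch (which casts the pointer directly and places one barrier after each group of four loads):

#include <riscv_vector.h>
#include <stdint.h>
#include <string.h>

// Illustrative helper (name and memcpy spelling are assumptions, not from the
// patch): broadcast 8 bytes of byte-aligned int8 data across a vint8m2_t
// register group. The scalar load compiles to a plain ld, and the empty asm
// with a "memory" clobber keeps GCC from fusing scalar loads back into a
// strided vlse64.v, whose 64-bit element accesses assume 8-byte alignment.
static inline vint8m2_t load_row_i8m2(const int8_t * qs, size_t vl) {
    int64_t a;
    memcpy(&a, qs, sizeof(a));              // unaligned-safe 8-byte read
    __asm__ __volatile__("" ::: "memory");  // compiler barrier only, no fence instruction
    return __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(a, vl / 4));
}

Note that the vl / 4 element count matches the original stride-0 load: with 64-bit elements, vl / 4 elements span vl * 2 bytes, the same vl * 2 int8 elements the widening multiplies below consume.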
@@ -3275,12 +3280,17 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
            };
            const vfloat32m1_t b_scales_vec = __riscv_vle32_v_f32m1(b_scales, vl / 4);

+           const int64_t A0 = *(const int64_t *)&a_ptr[l].qs[0];
+           const int64_t A4 = *(const int64_t *)&a_ptr[l].qs[32];
+           const int64_t A8 = *(const int64_t *)&a_ptr[l].qs[64];
+           const int64_t Ac = *(const int64_t *)&a_ptr[l].qs[96];
+           __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment

            vint16m4_t sumi_l0;
            {
-               const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[0], 0, vl / 4));
-               const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[32], 0, vl / 4));
-               const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[64], 0, vl / 4));
-               const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[96], 0, vl / 4));
+               const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A0, vl / 4));
+               const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A4, vl / 4));
+               const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A8, vl / 4));
+               const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ac, vl / 4));

                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
@@ -3307,14 +3317,18 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[0], vl / 4);
                sumf0 = __riscv_vfmacc_vv_f32m1(sumf0, tmp1, b_scales_vec, vl / 4);
            }
-           // __asm__ __volatile__("" ::: "memory");

+           const int64_t A1 = *(const int64_t *)&a_ptr[l].qs[8];
+           const int64_t A5 = *(const int64_t *)&a_ptr[l].qs[40];
+           const int64_t A9 = *(const int64_t *)&a_ptr[l].qs[72];
+           const int64_t Ad = *(const int64_t *)&a_ptr[l].qs[104];
+           __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment

            vint16m4_t sumi_l1;
            {
-               const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[8], 0, vl / 4));
-               const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[40], 0, vl / 4));
-               const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[72], 0, vl / 4));
-               const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[104], 0, vl / 4));
+               const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A1, vl / 4));
+               const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A5, vl / 4));
+               const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A9, vl / 4));
+               const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ad, vl / 4));

                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
@@ -3341,14 +3355,18 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[1], vl / 4);
                sumf1 = __riscv_vfmacc_vv_f32m1(sumf1, tmp1, b_scales_vec, vl / 4);
            }
-           // __asm__ __volatile__("" ::: "memory");

+           const int64_t A2 = *(const int64_t *)&a_ptr[l].qs[16];
+           const int64_t A6 = *(const int64_t *)&a_ptr[l].qs[48];
+           const int64_t Aa = *(const int64_t *)&a_ptr[l].qs[80];
+           const int64_t Ae = *(const int64_t *)&a_ptr[l].qs[112];
+           __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment

            vint16m4_t sumi_l2;
            {
-               const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[16], 0, vl / 4));
-               const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[48], 0, vl / 4));
-               const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[80], 0, vl / 4));
-               const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[112], 0, vl / 4));
+               const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A2, vl / 4));
+               const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A6, vl / 4));
+               const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Aa, vl / 4));
+               const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ae, vl / 4));

                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);
@@ -3375,14 +3393,18 @@ void ggml_gemm_q4_0_8x8_q8_0(int n, float * restrict s, size_t bs, const void *
                const vfloat32m1_t tmp1 = __riscv_vfmul_vf_f32m1(facc, a_scales[2], vl / 4);
                sumf2 = __riscv_vfmacc_vv_f32m1(sumf2, tmp1, b_scales_vec, vl / 4);
            }
-           // __asm__ __volatile__("" ::: "memory");

+           const int64_t A3 = *(const int64_t *)&a_ptr[l].qs[24];
+           const int64_t A7 = *(const int64_t *)&a_ptr[l].qs[56];
+           const int64_t Ab = *(const int64_t *)&a_ptr[l].qs[88];
+           const int64_t Af = *(const int64_t *)&a_ptr[l].qs[120];
+           __asm__ __volatile__("" ::: "memory"); // prevent gcc from emitting fused vlse64, violating alignment

            vint16m4_t sumi_l3;
            {
-               const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[24], 0, vl / 4));
-               const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[56], 0, vl / 4));
-               const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[88], 0, vl / 4));
-               const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vlse64_v_i64m2((const int64_t *)&a_ptr[l].qs[120], 0, vl / 4));
+               const vint8m2_t lhs_0_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A3, vl / 4));
+               const vint8m2_t lhs_1_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(A7, vl / 4));
+               const vint8m2_t lhs_2_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Ab, vl / 4));
+               const vint8m2_t lhs_3_8 = __riscv_vreinterpret_v_i64m2_i8m2(__riscv_vmv_v_x_i64m2(Af, vl / 4));

                const vint16m4_t sumi_lo_0 = __riscv_vwmul_vv_i16m4(rhs_vec_lo_0, lhs_0_8, vl * 2);
                const vint16m4_t sumi_lo_1 = __riscv_vwmacc_vv_i16m4(sumi_lo_0, rhs_vec_lo_1, lhs_1_8, vl * 2);
                const vint16m4_t sumi_hi_0 = __riscv_vwmacc_vv_i16m4(sumi_lo_1, rhs_vec_hi_0, lhs_2_8, vl * 2);