@@ -4447,7 +4447,7 @@ void kernel_mul_mv_q2_K_f32_impl(
4447
4447
4448
4448
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
4449
4449
4450
- for (int row = 0 ; row < N_DST; ++row) {
4450
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
4451
4451
all_sum = simd_sum (sumf[row]);
4452
4452
if (tiisg == 0 ) {
4453
4453
dst_f32[first_row + row] = all_sum;
@@ -4613,7 +4613,7 @@ void kernel_mul_mv_q3_K_f32_impl(
4613
4613
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
4614
4614
4615
4615
if (tiisg == 0 ) {
4616
- for (int row = 0 ; row < 2 ; ++row) {
4616
+ for (int row = 0 ; row < 2 && first_row + row < args. ne0 ; ++row) {
4617
4617
dst_f32[first_row + row] = sumf1[row];
4618
4618
}
4619
4619
}
@@ -4729,7 +4729,7 @@ void kernel_mul_mv_q4_K_f32_impl(
4729
4729
4730
4730
device float * dst_f32 = (device float *) dst + (int64_t )im*args.ne0 *args.ne1 + (int64_t )r1*args.ne0 ;
4731
4731
4732
- for (int row = 0 ; row < N_DST; ++row) {
4732
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
4733
4733
all_sum = simd_sum (sumf[row]);
4734
4734
if (tiisg == 0 ) {
4735
4735
dst_f32[first_row + row] = all_sum;
@@ -4861,7 +4861,7 @@ void kernel_mul_mv_q5_K_f32_impl(
4861
4861
4862
4862
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
4863
4863
4864
- for (int row = 0 ; row < 2 ; ++row) {
4864
+ for (int row = 0 ; row < 2 && first_row + row < args. ne0 ; ++row) {
4865
4865
const float tot = simd_sum (sumf[row]);
4866
4866
if (tiisg == 0 ) {
4867
4867
dst_f32[first_row + row] = tot;
@@ -4906,6 +4906,10 @@ void kernel_mul_mv_q6_K_f32_impl(
4906
4906
4907
4907
const int row = 2 *r0 + sgitg;
4908
4908
4909
+ if (row >= args.ne0 ) {
4910
+ return ;
4911
+ }
4912
+
4909
4913
const uint i12 = im%args.ne12 ;
4910
4914
const uint i13 = im/args.ne12 ;
4911
4915
@@ -5061,7 +5065,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl(
5061
5065
5062
5066
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5063
5067
5064
- for (int row = 0 ; row < N_DST; ++row) {
5068
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
5065
5069
all_sum = simd_sum (sumf[row]);
5066
5070
if (tiisg == 0 ) {
5067
5071
dst_f32[first_row + row] = all_sum * 0 .25f ;
@@ -5179,7 +5183,7 @@ void kernel_mul_mv_iq2_xs_f32_impl(
5179
5183
5180
5184
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5181
5185
5182
- for (int row = 0 ; row < N_DST; ++row) {
5186
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
5183
5187
all_sum = simd_sum (sumf[row]);
5184
5188
if (tiisg == 0 ) {
5185
5189
dst_f32[first_row + row] = all_sum * 0 .25f ;
@@ -5289,7 +5293,7 @@ void kernel_mul_mv_iq3_xxs_f32_impl(
5289
5293
5290
5294
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5291
5295
5292
- for (int row = 0 ; row < N_DST; ++row) {
5296
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
5293
5297
all_sum = simd_sum (sumf[row]);
5294
5298
if (tiisg == 0 ) {
5295
5299
dst_f32[first_row + row] = all_sum * 0 .5f ;
@@ -5401,7 +5405,7 @@ void kernel_mul_mv_iq3_s_f32_impl(
5401
5405
5402
5406
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5403
5407
5404
- for (int row = 0 ; row < N_DST; ++row) {
5408
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
5405
5409
all_sum = simd_sum (sumf[row]);
5406
5410
if (tiisg == 0 ) {
5407
5411
dst_f32[first_row + row] = all_sum;
@@ -5514,7 +5518,7 @@ void kernel_mul_mv_iq2_s_f32_impl(
5514
5518
5515
5519
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5516
5520
5517
- for (int row = 0 ; row < N_DST; ++row) {
5521
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
5518
5522
all_sum = simd_sum (sumf[row]);
5519
5523
if (tiisg == 0 ) {
5520
5524
dst_f32[first_row + row] = all_sum * 0 .25f ;
@@ -5614,7 +5618,7 @@ void kernel_mul_mv_iq1_s_f32_impl(
5614
5618
5615
5619
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5616
5620
5617
- for (int row = 0 ; row < N_DST; ++row) {
5621
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
5618
5622
all_sum = simd_sum (sumf[row]);
5619
5623
if (tiisg == 0 ) {
5620
5624
dst_f32[first_row + row] = all_sum;
@@ -5709,7 +5713,7 @@ void kernel_mul_mv_iq1_m_f32_impl(
5709
5713
5710
5714
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5711
5715
5712
- for (int row = 0 ; row < N_DST; ++row) {
5716
+ for (int row = 0 ; row < N_DST && first_row + row < args. ne0 ; ++row) {
5713
5717
all_sum = simd_sum (sumf[row]);
5714
5718
if (tiisg == 0 ) {
5715
5719
dst_f32[first_row + row] = all_sum;
@@ -5799,7 +5803,7 @@ void kernel_mul_mv_iq4_nl_f32_impl(
5799
5803
5800
5804
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5801
5805
5802
- for (int row = 0 ; row < 2 && first_row + row < args.ne01 ; ++row) {
5806
+ for (int row = 0 ; row < 2 && first_row + row < args.ne0 ; ++row) {
5803
5807
all_sum = simd_sum (sumf[row]);
5804
5808
if (tiisg == 0 ) {
5805
5809
dst_f32[first_row + row] = all_sum;
@@ -5888,7 +5892,7 @@ void kernel_mul_mv_iq4_xs_f32_impl(
5888
5892
5889
5893
device float * dst_f32 = (device float *) dst + (uint64_t )im*args.ne0 *args.ne1 + (uint64_t )r1*args.ne0 ;
5890
5894
5891
- for (int row = 0 ; row < 2 ; ++row) {
5895
+ for (int row = 0 ; row < 2 && first_row + row < args. ne0 ; ++row) {
5892
5896
all_sum = simd_sum (sumf[row]);
5893
5897
if (tiisg == 0 ) {
5894
5898
dst_f32[first_row + row] = all_sum;
0 commit comments