@@ -297,3 +297,186 @@ entry:
   %res = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %a.ext)
   ret i32 %res
 }
+
+define i32 @vqdot_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdot_vv_accum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vsext.vf2 v16, v9
+; CHECK-NEXT:    vwmacc.vv v12, v10, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vredsum.vs v8, v12, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}
+
+define i32 @vqdotu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdotu_vv_accum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vwmulu.vv v10, v8, v9
+; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vwaddu.wv v12, v12, v10
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vredsum.vs v8, v12, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.zext = zext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.zext, %b.zext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}
+
+define i32 @vqdotsu_vv_accum(<16 x i8> %a, <16 x i8> %b, <16 x i32> %x) {
+; CHECK-LABEL: vqdotsu_vv_accum:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v10, v8
+; CHECK-NEXT:    vzext.vf2 v16, v9
+; CHECK-NEXT:    vwmaccsu.vv v12, v10, v16
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v8, zero
+; CHECK-NEXT:    vredsum.vs v8, v12, v8
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.zext
+  %add = add <16 x i32> %mul, %x
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}
+
+define i32 @vqdot_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdot_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vsext.vf2 v14, v9
+; NODOT-NEXT:    vwmul.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, a0
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdot_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdot.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
+define i32 @vqdotu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdotu_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; NODOT-NEXT:    vwmulu.vv v10, v8, v9
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v8, a0
+; NODOT-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
+; NODOT-NEXT:    vwredsumu.vs v8, v10, v8
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotu_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.zext = zext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.zext, %b.zext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
+define i32 @vqdotsu_vv_scalar_add(<16 x i8> %a, <16 x i8> %b, i32 %x) {
+; NODOT-LABEL: vqdotsu_vv_scalar_add:
+; NODOT:       # %bb.0: # %entry
+; NODOT-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; NODOT-NEXT:    vsext.vf2 v12, v8
+; NODOT-NEXT:    vzext.vf2 v14, v9
+; NODOT-NEXT:    vwmulsu.vv v8, v12, v14
+; NODOT-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; NODOT-NEXT:    vmv.s.x v12, a0
+; NODOT-NEXT:    vredsum.vs v8, v8, v12
+; NODOT-NEXT:    vmv.x.s a0, v8
+; NODOT-NEXT:    ret
+;
+; DOT-LABEL: vqdotsu_vv_scalar_add:
+; DOT:       # %bb.0: # %entry
+; DOT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; DOT-NEXT:    vmv.v.i v10, 0
+; DOT-NEXT:    vqdotsu.vv v10, v8, v9
+; DOT-NEXT:    vmv.s.x v8, a0
+; DOT-NEXT:    vredsum.vs v8, v10, v8
+; DOT-NEXT:    vmv.x.s a0, v8
+; DOT-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.zext = zext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.zext
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %mul)
+  %add = add i32 %sum, %x
+  ret i32 %add
+}
+
+define i32 @vqdot_vv_split(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
+; CHECK-LABEL: vqdot_vv_split:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vsext.vf2 v12, v8
+; CHECK-NEXT:    vsext.vf2 v14, v9
+; CHECK-NEXT:    vsext.vf2 v16, v10
+; CHECK-NEXT:    vsext.vf2 v18, v11
+; CHECK-NEXT:    vwmul.vv v8, v12, v14
+; CHECK-NEXT:    vwmacc.vv v8, v16, v18
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmv.s.x v12, zero
+; CHECK-NEXT:    vredsum.vs v8, v8, v12
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    ret
+entry:
+  %a.sext = sext <16 x i8> %a to <16 x i32>
+  %b.sext = sext <16 x i8> %b to <16 x i32>
+  %mul = mul nuw nsw <16 x i32> %a.sext, %b.sext
+  %c.sext = sext <16 x i8> %c to <16 x i32>
+  %d.sext = sext <16 x i8> %d to <16 x i32>
+  %mul2 = mul nuw nsw <16 x i32> %c.sext, %d.sext
+  %add = add <16 x i32> %mul, %mul2
+  %sum = tail call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %add)
+  ret i32 %sum
+}