Skip to content

Commit 0c66deb

Browse files
committed
[RISCV] Scalarize gather/scatter on RV64 with Zve32* extension.
i64 indices aren't supported on Zve32*. Scalarize gathers to prevent generating illegal instructions. Since InstCombine will aggressively canonicalize GEP indices to pointer size, we're pretty much always going to have an i64 index. Trying to predict when SelectionDAG will find a smaller index from the TTI hook used by the ScalarizeMaskedMemIntrinPass seems fragile. To optimize this we probably need an IR pass to rewrite it earlier. Test RUN lines have also been added to make sure the strided load/store optimization still works. Reviewed By: reames Differential Revision: https://reviews.llvm.org/D127179
1 parent 3731bbc commit 0c66deb

File tree

5 files changed

+23277
-1584
lines changed

5 files changed

+23277
-1584
lines changed

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,16 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
161161
return isLegalMaskedGatherScatter(DataType, Alignment);
162162
}
163163

164+
bool forceScalarizeMaskedGather(VectorType *VTy, Align Alignment) {
165+
// Scalarize masked gather for RV64 if EEW=64 indices aren't supported.
166+
return ST->is64Bit() && !ST->hasVInstructionsI64();
167+
}
168+
169+
bool forceScalarizeMaskedScatter(VectorType *VTy, Align Alignment) {
170+
// Scalarize masked scatter for RV64 if EEW=64 indices aren't supported.
171+
return ST->is64Bit() && !ST->hasVInstructionsI64();
172+
}
173+
164174
/// \returns How the target needs this vector-predicated operation to be
165175
/// transformed.
166176
TargetTransformInfo::VPLegalization

llvm/test/CodeGen/RISCV/rvv/fixed-vector-strided-load-store-asm.ll

Lines changed: 190 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s
2+
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,V
3+
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+f,+zve32f -riscv-v-vector-bits-min=256 | FileCheck %s --check-prefixes=CHECK,ZVE32F
34

45
%struct.foo = type { i32, i32, i32, i32 }
56

@@ -54,30 +55,55 @@ for.cond.cleanup: ; preds = %vector.body
5455

5556
define void @gather_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) {
5657
;
57-
; CHECK-LABEL: gather_masked:
58-
; CHECK: # %bb.0: # %entry
59-
; CHECK-NEXT: li a2, 0
60-
; CHECK-NEXT: lui a3, 983765
61-
; CHECK-NEXT: addiw a3, a3, 873
62-
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
63-
; CHECK-NEXT: vmv.s.x v0, a3
64-
; CHECK-NEXT: li a3, 32
65-
; CHECK-NEXT: li a4, 5
66-
; CHECK-NEXT: li a5, 1024
67-
; CHECK-NEXT: .LBB1_1: # %vector.body
68-
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
69-
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu
70-
; CHECK-NEXT: vmv1r.v v9, v8
71-
; CHECK-NEXT: vlse8.v v9, (a1), a4, v0.t
72-
; CHECK-NEXT: add a6, a0, a2
73-
; CHECK-NEXT: vle8.v v10, (a6)
74-
; CHECK-NEXT: vadd.vv v9, v10, v9
75-
; CHECK-NEXT: vse8.v v9, (a6)
76-
; CHECK-NEXT: addi a2, a2, 32
77-
; CHECK-NEXT: addi a1, a1, 160
78-
; CHECK-NEXT: bne a2, a5, .LBB1_1
79-
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
80-
; CHECK-NEXT: ret
58+
; V-LABEL: gather_masked:
59+
; V: # %bb.0: # %entry
60+
; V-NEXT: li a2, 0
61+
; V-NEXT: lui a3, 983765
62+
; V-NEXT: addiw a3, a3, 873
63+
; V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
64+
; V-NEXT: vmv.s.x v0, a3
65+
; V-NEXT: li a3, 32
66+
; V-NEXT: li a4, 5
67+
; V-NEXT: li a5, 1024
68+
; V-NEXT: .LBB1_1: # %vector.body
69+
; V-NEXT: # =>This Inner Loop Header: Depth=1
70+
; V-NEXT: vsetvli zero, a3, e8, m1, ta, mu
71+
; V-NEXT: vmv1r.v v9, v8
72+
; V-NEXT: vlse8.v v9, (a1), a4, v0.t
73+
; V-NEXT: add a6, a0, a2
74+
; V-NEXT: vle8.v v10, (a6)
75+
; V-NEXT: vadd.vv v9, v10, v9
76+
; V-NEXT: vse8.v v9, (a6)
77+
; V-NEXT: addi a2, a2, 32
78+
; V-NEXT: addi a1, a1, 160
79+
; V-NEXT: bne a2, a5, .LBB1_1
80+
; V-NEXT: # %bb.2: # %for.cond.cleanup
81+
; V-NEXT: ret
82+
;
83+
; ZVE32F-LABEL: gather_masked:
84+
; ZVE32F: # %bb.0: # %entry
85+
; ZVE32F-NEXT: li a2, 0
86+
; ZVE32F-NEXT: lui a3, 983765
87+
; ZVE32F-NEXT: addiw a3, a3, 873
88+
; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
89+
; ZVE32F-NEXT: vmv.s.x v0, a3
90+
; ZVE32F-NEXT: li a3, 32
91+
; ZVE32F-NEXT: li a4, 5
92+
; ZVE32F-NEXT: li a5, 1024
93+
; ZVE32F-NEXT: .LBB1_1: # %vector.body
94+
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
95+
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, mu
96+
; ZVE32F-NEXT: vmv1r.v v9, v8
97+
; ZVE32F-NEXT: vlse8.v v9, (a1), a4, v0.t
98+
; ZVE32F-NEXT: add a6, a0, a2
99+
; ZVE32F-NEXT: vle8.v v10, (a6)
100+
; ZVE32F-NEXT: vadd.vv v9, v10, v9
101+
; ZVE32F-NEXT: vse8.v v9, (a6)
102+
; ZVE32F-NEXT: addi a2, a2, 32
103+
; ZVE32F-NEXT: addi a1, a1, 160
104+
; ZVE32F-NEXT: bne a2, a5, .LBB1_1
105+
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
106+
; ZVE32F-NEXT: ret
81107
entry:
82108
br label %vector.body
83109

@@ -242,30 +268,55 @@ for.cond.cleanup: ; preds = %vector.body
242268

243269
define void @scatter_masked(i8* noalias nocapture %A, i8* noalias nocapture readonly %B, <32 x i8> %maskedoff) {
244270
;
245-
; CHECK-LABEL: scatter_masked:
246-
; CHECK: # %bb.0: # %entry
247-
; CHECK-NEXT: li a2, 0
248-
; CHECK-NEXT: li a3, 32
249-
; CHECK-NEXT: lui a4, 983765
250-
; CHECK-NEXT: addiw a4, a4, 873
251-
; CHECK-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
252-
; CHECK-NEXT: vmv.s.x v0, a4
253-
; CHECK-NEXT: li a4, 5
254-
; CHECK-NEXT: li a5, 1024
255-
; CHECK-NEXT: .LBB5_1: # %vector.body
256-
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
257-
; CHECK-NEXT: add a6, a1, a2
258-
; CHECK-NEXT: vsetvli zero, a3, e8, m1, ta, mu
259-
; CHECK-NEXT: vle8.v v9, (a6)
260-
; CHECK-NEXT: vmv1r.v v10, v8
261-
; CHECK-NEXT: vlse8.v v10, (a0), a4, v0.t
262-
; CHECK-NEXT: vadd.vv v9, v10, v9
263-
; CHECK-NEXT: vsse8.v v9, (a0), a4, v0.t
264-
; CHECK-NEXT: addi a2, a2, 32
265-
; CHECK-NEXT: addi a0, a0, 160
266-
; CHECK-NEXT: bne a2, a5, .LBB5_1
267-
; CHECK-NEXT: # %bb.2: # %for.cond.cleanup
268-
; CHECK-NEXT: ret
271+
; V-LABEL: scatter_masked:
272+
; V: # %bb.0: # %entry
273+
; V-NEXT: li a2, 0
274+
; V-NEXT: li a3, 32
275+
; V-NEXT: lui a4, 983765
276+
; V-NEXT: addiw a4, a4, 873
277+
; V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
278+
; V-NEXT: vmv.s.x v0, a4
279+
; V-NEXT: li a4, 5
280+
; V-NEXT: li a5, 1024
281+
; V-NEXT: .LBB5_1: # %vector.body
282+
; V-NEXT: # =>This Inner Loop Header: Depth=1
283+
; V-NEXT: add a6, a1, a2
284+
; V-NEXT: vsetvli zero, a3, e8, m1, ta, mu
285+
; V-NEXT: vle8.v v9, (a6)
286+
; V-NEXT: vmv1r.v v10, v8
287+
; V-NEXT: vlse8.v v10, (a0), a4, v0.t
288+
; V-NEXT: vadd.vv v9, v10, v9
289+
; V-NEXT: vsse8.v v9, (a0), a4, v0.t
290+
; V-NEXT: addi a2, a2, 32
291+
; V-NEXT: addi a0, a0, 160
292+
; V-NEXT: bne a2, a5, .LBB5_1
293+
; V-NEXT: # %bb.2: # %for.cond.cleanup
294+
; V-NEXT: ret
295+
;
296+
; ZVE32F-LABEL: scatter_masked:
297+
; ZVE32F: # %bb.0: # %entry
298+
; ZVE32F-NEXT: li a2, 0
299+
; ZVE32F-NEXT: li a3, 32
300+
; ZVE32F-NEXT: lui a4, 983765
301+
; ZVE32F-NEXT: addiw a4, a4, 873
302+
; ZVE32F-NEXT: vsetivli zero, 1, e32, m1, ta, mu
303+
; ZVE32F-NEXT: vmv.s.x v0, a4
304+
; ZVE32F-NEXT: li a4, 5
305+
; ZVE32F-NEXT: li a5, 1024
306+
; ZVE32F-NEXT: .LBB5_1: # %vector.body
307+
; ZVE32F-NEXT: # =>This Inner Loop Header: Depth=1
308+
; ZVE32F-NEXT: add a6, a1, a2
309+
; ZVE32F-NEXT: vsetvli zero, a3, e8, m1, ta, mu
310+
; ZVE32F-NEXT: vle8.v v9, (a6)
311+
; ZVE32F-NEXT: vmv1r.v v10, v8
312+
; ZVE32F-NEXT: vlse8.v v10, (a0), a4, v0.t
313+
; ZVE32F-NEXT: vadd.vv v9, v10, v9
314+
; ZVE32F-NEXT: vsse8.v v9, (a0), a4, v0.t
315+
; ZVE32F-NEXT: addi a2, a2, 32
316+
; ZVE32F-NEXT: addi a0, a0, 160
317+
; ZVE32F-NEXT: bne a2, a5, .LBB5_1
318+
; ZVE32F-NEXT: # %bb.2: # %for.cond.cleanup
319+
; ZVE32F-NEXT: ret
269320
entry:
270321
br label %vector.body
271322

@@ -554,24 +605,51 @@ declare void @llvm.masked.scatter.v8i32.v8p0i32(<8 x i32>, <8 x i32*>, i32 immar
554605
; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
555606
define void @gather_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) {
556607
;
557-
; CHECK-LABEL: gather_of_pointers:
558-
; CHECK: # %bb.0:
559-
; CHECK-NEXT: li a2, 1024
560-
; CHECK-NEXT: li a3, 40
561-
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
562-
; CHECK-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
563-
; CHECK-NEXT: vlse64.v v8, (a1), a3
564-
; CHECK-NEXT: addi a4, a1, 80
565-
; CHECK-NEXT: vlse64.v v9, (a4), a3
566-
; CHECK-NEXT: vse64.v v8, (a0)
567-
; CHECK-NEXT: addi a4, a0, 16
568-
; CHECK-NEXT: vse64.v v9, (a4)
569-
; CHECK-NEXT: addi a2, a2, -4
570-
; CHECK-NEXT: addi a0, a0, 32
571-
; CHECK-NEXT: addi a1, a1, 160
572-
; CHECK-NEXT: bnez a2, .LBB10_1
573-
; CHECK-NEXT: # %bb.2:
574-
; CHECK-NEXT: ret
608+
; V-LABEL: gather_of_pointers:
609+
; V: # %bb.0:
610+
; V-NEXT: li a2, 1024
611+
; V-NEXT: li a3, 40
612+
; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu
613+
; V-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
614+
; V-NEXT: vlse64.v v8, (a1), a3
615+
; V-NEXT: addi a4, a1, 80
616+
; V-NEXT: vlse64.v v9, (a4), a3
617+
; V-NEXT: vse64.v v8, (a0)
618+
; V-NEXT: addi a4, a0, 16
619+
; V-NEXT: vse64.v v9, (a4)
620+
; V-NEXT: addi a2, a2, -4
621+
; V-NEXT: addi a0, a0, 32
622+
; V-NEXT: addi a1, a1, 160
623+
; V-NEXT: bnez a2, .LBB10_1
624+
; V-NEXT: # %bb.2:
625+
; V-NEXT: ret
626+
;
627+
; ZVE32F-LABEL: gather_of_pointers:
628+
; ZVE32F: # %bb.0:
629+
; ZVE32F-NEXT: li a2, 0
630+
; ZVE32F-NEXT: li a3, 1
631+
; ZVE32F-NEXT: li a4, 1024
632+
; ZVE32F-NEXT: li a5, 40
633+
; ZVE32F-NEXT: .LBB10_1: # =>This Inner Loop Header: Depth=1
634+
; ZVE32F-NEXT: mul a6, a3, a5
635+
; ZVE32F-NEXT: add a6, a1, a6
636+
; ZVE32F-NEXT: mul a7, a2, a5
637+
; ZVE32F-NEXT: add a7, a1, a7
638+
; ZVE32F-NEXT: ld t0, 0(a6)
639+
; ZVE32F-NEXT: ld t1, 0(a7)
640+
; ZVE32F-NEXT: ld a6, 80(a6)
641+
; ZVE32F-NEXT: ld a7, 80(a7)
642+
; ZVE32F-NEXT: sd t0, 8(a0)
643+
; ZVE32F-NEXT: sd t1, 0(a0)
644+
; ZVE32F-NEXT: sd a6, 24(a0)
645+
; ZVE32F-NEXT: sd a7, 16(a0)
646+
; ZVE32F-NEXT: addi a2, a2, 4
647+
; ZVE32F-NEXT: addi a3, a3, 4
648+
; ZVE32F-NEXT: addi a4, a4, -4
649+
; ZVE32F-NEXT: addi a0, a0, 32
650+
; ZVE32F-NEXT: bnez a4, .LBB10_1
651+
; ZVE32F-NEXT: # %bb.2:
652+
; ZVE32F-NEXT: ret
575653
br label %3
576654

577655
3: ; preds = %3, %2
@@ -604,24 +682,51 @@ declare <2 x i32*> @llvm.masked.gather.v2p0i32.v2p0p0i32(<2 x i32**>, i32 immarg
604682
; Make sure we don't crash in getTgtMemIntrinsic for a vector of pointers.
605683
define void @scatter_of_pointers(i32** noalias nocapture %0, i32** noalias nocapture readonly %1) {
606684
;
607-
; CHECK-LABEL: scatter_of_pointers:
608-
; CHECK: # %bb.0:
609-
; CHECK-NEXT: li a2, 1024
610-
; CHECK-NEXT: li a3, 40
611-
; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu
612-
; CHECK-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
613-
; CHECK-NEXT: vle64.v v8, (a1)
614-
; CHECK-NEXT: addi a4, a1, 16
615-
; CHECK-NEXT: vle64.v v9, (a4)
616-
; CHECK-NEXT: addi a4, a0, 80
617-
; CHECK-NEXT: vsse64.v v8, (a0), a3
618-
; CHECK-NEXT: vsse64.v v9, (a4), a3
619-
; CHECK-NEXT: addi a2, a2, -4
620-
; CHECK-NEXT: addi a1, a1, 32
621-
; CHECK-NEXT: addi a0, a0, 160
622-
; CHECK-NEXT: bnez a2, .LBB11_1
623-
; CHECK-NEXT: # %bb.2:
624-
; CHECK-NEXT: ret
685+
; V-LABEL: scatter_of_pointers:
686+
; V: # %bb.0:
687+
; V-NEXT: li a2, 1024
688+
; V-NEXT: li a3, 40
689+
; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu
690+
; V-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
691+
; V-NEXT: vle64.v v8, (a1)
692+
; V-NEXT: addi a4, a1, 16
693+
; V-NEXT: vle64.v v9, (a4)
694+
; V-NEXT: addi a4, a0, 80
695+
; V-NEXT: vsse64.v v8, (a0), a3
696+
; V-NEXT: vsse64.v v9, (a4), a3
697+
; V-NEXT: addi a2, a2, -4
698+
; V-NEXT: addi a1, a1, 32
699+
; V-NEXT: addi a0, a0, 160
700+
; V-NEXT: bnez a2, .LBB11_1
701+
; V-NEXT: # %bb.2:
702+
; V-NEXT: ret
703+
;
704+
; ZVE32F-LABEL: scatter_of_pointers:
705+
; ZVE32F: # %bb.0:
706+
; ZVE32F-NEXT: li a2, 0
707+
; ZVE32F-NEXT: li a3, 1
708+
; ZVE32F-NEXT: li a4, 1024
709+
; ZVE32F-NEXT: li a5, 40
710+
; ZVE32F-NEXT: .LBB11_1: # =>This Inner Loop Header: Depth=1
711+
; ZVE32F-NEXT: ld a6, 8(a1)
712+
; ZVE32F-NEXT: ld a7, 0(a1)
713+
; ZVE32F-NEXT: ld t0, 24(a1)
714+
; ZVE32F-NEXT: ld t1, 16(a1)
715+
; ZVE32F-NEXT: mul t2, a3, a5
716+
; ZVE32F-NEXT: add t2, a0, t2
717+
; ZVE32F-NEXT: mul t3, a2, a5
718+
; ZVE32F-NEXT: add t3, a0, t3
719+
; ZVE32F-NEXT: sd a7, 0(t3)
720+
; ZVE32F-NEXT: sd a6, 0(t2)
721+
; ZVE32F-NEXT: sd t1, 80(t3)
722+
; ZVE32F-NEXT: sd t0, 80(t2)
723+
; ZVE32F-NEXT: addi a2, a2, 4
724+
; ZVE32F-NEXT: addi a3, a3, 4
725+
; ZVE32F-NEXT: addi a4, a4, -4
726+
; ZVE32F-NEXT: addi a1, a1, 32
727+
; ZVE32F-NEXT: bnez a4, .LBB11_1
728+
; ZVE32F-NEXT: # %bb.2:
729+
; ZVE32F-NEXT: ret
625730
br label %3
626731

627732
3: ; preds = %3, %2

0 commit comments

Comments (0)