
Commit 09159bd

[MachinePipeliner] Fix incorrect handling of unpipelineable insts
There was a case where `normalizeNonPipelinedInstructions` didn't schedule unpipelineable instructions correctly, which could generate illegal code. This patch fixes the issue by rejecting the schedule if we fail to insert the unpipelineable instructions in Stage 0.

Here is part of the previous debug output for the added test, where `SU(14)` and `SU(15)` are scheduled in Stage 1:

```
Do not pipeline SU(16)
Do not pipeline SU(1)
Do not pipeline SU(0)
Do not pipeline SU(15)
Do not pipeline SU(14)
SU(0) is not pipelined; moving from cycle 19 to 0 Instr: ...
SU(1) is not pipelined; moving from cycle 10 to 0 Instr: ...
SU(15) is not pipelined; moving from cycle 28 to 19 Instr: ...
SU(16) is not pipelined; moving from cycle 19 to 0 Instr: ...
Schedule Found? 1 (II=10)
...
cycle 9 (1) (14) %41:gpr32 = ADDWrr %27:gpr32, %12:gpr32common
cycle 9 (1) (15) %28:gpr32all = COPY %41:gpr32
```
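For context, here is a minimal standalone sketch of the cycle-to-stage arithmetic at play. This is an illustration, not LLVM code: the helper `stageOf` is hypothetical, while `FirstCycle` and `InitiationInterval` mirror the roles of SMSchedule's fields, and `FirstCycle` is assumed to be 0 for the example values.

```cpp
#include <cassert>

// Hypothetical helper: which pipeline stage a scheduled cycle falls into,
// assuming stages are derived as (Cycle - FirstCycle) / InitiationInterval.
int stageOf(int Cycle, int FirstCycle, int InitiationInterval) {
  assert(InitiationInterval > 0);
  return (Cycle - FirstCycle) / InitiationInterval;
}

int main() {
  // With II = 10 (as in the debug output above) and FirstCycle assumed 0,
  // any cycle in [0, 10) is Stage 0. Cycle 19, where SU(15) was moved,
  // lands in Stage 1, which is illegal for an unpipelineable instruction.
  assert(stageOf(9, 0, 10) == 0);
  assert(stageOf(19, 0, 10) == 1);
  return 0;
}
```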
1 parent ad38c4c

2 files changed: +232, -0 lines

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -3178,6 +3178,12 @@ bool SMSchedule::normalizeNonPipelinedInstructions(
                         << ") is not pipelined; moving from cycle " << OldCycle
                         << " to " << NewCycle << " Instr:" << *SU.getInstr());
     }
+
+    // There is a case where the `NewCycle` is too large to be scheduled in
+    // Stage 0. In this case, we reject the schedule.
+    if (FirstCycle + InitiationInterval <= NewCycle)
+      return false;
+
     NewLastCycle = std::max(NewLastCycle, NewCycle);
   }
   LastCycle = NewLastCycle;
```
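The added guard rejects any normalized cycle at or past `FirstCycle + InitiationInterval`, i.e. anything that would land in Stage 1 or later. Below is a hedged standalone sketch applying that predicate to the cycles from the debug output above; `allInStageZero` and the `SU` struct are hypothetical names, and `FirstCycle` is assumed to be 0.

```cpp
#include <cstdio>

// Standalone model of the added check: an unpipelineable instruction whose
// normalized cycle is >= FirstCycle + II would execute in Stage >= 1, so
// the whole schedule must be rejected instead of emitted as illegal code.
struct SU {
  int Num;
  int NewCycle;
};

bool allInStageZero(const SU *Units, int N, int FirstCycle, int II) {
  for (int I = 0; I < N; ++I)
    if (FirstCycle + II <= Units[I].NewCycle)
      return false; // reject the schedule, as the patch does
  return true;
}

int main() {
  // Cycles after normalization, taken from the debug output (II = 10,
  // FirstCycle assumed 0): SU(15) stays at cycle 19, i.e. Stage 1,
  // so the schedule must be rejected.
  SU Units[] = {{0, 0}, {1, 0}, {15, 19}, {16, 0}};
  std::printf("Schedule Found? %d\n",
              allInStageZero(Units, 4, /*FirstCycle=*/0, /*II=*/10)); // 0
  return 0;
}
```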
Lines changed: 226 additions & 0 deletions
@@ -0,0 +1,226 @@

```
# RUN: llc --verify-machineinstrs -mtriple=aarch64 -run-pass=pipeliner -o - %s -aarch64-enable-pipeliner -pipeliner-enable-copytophi=1

--- |
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"

  @glb = internal unnamed_addr global { [256 x i32], [256 x i32], [256 x i32] } zeroinitializer

  ; Function Attrs: nounwind vscale_range(1,16)
  define internal void @f(i32 %0, i32 %1) #0 {
  entry:
    %reass.sub = sub i32 %1, %0
    %invariant.op = add i32 %0, 1
    %invariant.op3 = add i32 %0, 2
    %omp_loop.cmp5.not = icmp eq i32 %reass.sub, -1
    br i1 %omp_loop.cmp5.not, label %exit, label %preheader

  preheader:                                        ; preds = %entry
    %2 = add i32 %1, 1
    %3 = icmp slt i32 %2, %invariant.op
    br i1 %3, label %body.preheader, label %vector.ph

  body.preheader:                                   ; preds = %preheader
    %4 = add i32 %1, 1
    %5 = sub i32 %4, %0
    br label %body

  vector.ph:                                        ; preds = %preheader
    %6 = add i32 %1, 1
    %7 = sub i32 %6, %0
    %8 = tail call i32 @llvm.vscale.i32()
    %9 = shl nuw nsw i32 %8, 2
    %10 = tail call i32 @llvm.vscale.i32()
    %11 = shl nuw nsw i32 %10, 2
    %12 = call i32 @llvm.usub.sat.i32(i32 %7, i32 %11)
    %active.lane.mask.entry = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 %7)
    %13 = tail call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
    %.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %9, i64 0
    %.splat = shufflevector <vscale x 4 x i32> %.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
    %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %invariant.op, i64 0
    %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
    %broadcast.splatinsert7 = insertelement <vscale x 4 x i32> poison, i32 %invariant.op3, i64 0
    %broadcast.splat8 = shufflevector <vscale x 4 x i32> %broadcast.splatinsert7, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
    br label %vector.body

  vector.body:                                      ; preds = %vector.body, %vector.ph
    %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
    %active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %vector.ph ], [ %active.lane.mask.next, %vector.body ]
    %vec.ind = phi <vscale x 4 x i32> [ %13, %vector.ph ], [ %vec.ind.next, %vector.body ]
    %14 = add <vscale x 4 x i32> %vec.ind, %broadcast.splat
    %15 = extractelement <vscale x 4 x i32> %14, i64 0
    %16 = sext i32 %15 to i64
    %17 = add nsw i64 %16, -1
    %18 = getelementptr i32, ptr @glb, i64 %17
    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %14, ptr %18, i32 4, <vscale x 4 x i1> %active.lane.mask)
    %19 = add <vscale x 4 x i32> %vec.ind, %broadcast.splat8
    %20 = mul <vscale x 4 x i32> %14, %19
    %21 = sdiv <vscale x 4 x i32> %20, splat (i32 2)
    %22 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 1024), i64 %17
    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %21, ptr %22, i32 4, <vscale x 4 x i1> %active.lane.mask)
    %23 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 2048), i64 %17
    %wide.masked.load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %23, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
    %24 = add <vscale x 4 x i32> %wide.masked.load, %21
    call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %24, ptr %23, i32 4, <vscale x 4 x i1> %active.lane.mask)
    %25 = tail call i32 @llvm.vscale.i32()
    %26 = shl nuw nsw i32 %25, 2
    %index.next = add i32 %index, %26
    %active.lane.mask.next = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 %index, i32 %12)
    %vec.ind.next = add <vscale x 4 x i32> %vec.ind, %.splat
    %27 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
    br i1 %27, label %vector.body, label %exit

  exit:                                             ; preds = %vector.body, %body, %entry
    ret void

  body:                                             ; preds = %body.preheader, %body
    %lsr.iv2 = phi i32 [ %invariant.op3, %body.preheader ], [ %lsr.iv.next3, %body ]
    %lsr.iv = phi i32 [ %5, %body.preheader ], [ %lsr.iv.next, %body ]
    %28 = add i32 %lsr.iv2, -1
    %29 = sext i32 %28 to i64
    %30 = add nsw i64 %29, -1
    %31 = getelementptr i32, ptr @glb, i64 %30
    store i32 %28, ptr %31, align 4
    %32 = mul i32 %28, %lsr.iv2
    %33 = sdiv i32 %32, 2
    %34 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 1024), i64 %30
    store i32 %33, ptr %34, align 4
    %35 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 2048), i64 %30
    %36 = load i32, ptr %35, align 4
    %37 = add i32 %36, %33
    store i32 %37, ptr %35, align 4
    %lsr.iv.next = add i32 %lsr.iv, -1
    %lsr.iv.next3 = add i32 %lsr.iv2, 1
    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
    br i1 %exitcond.not, label %exit, label %body
  }

  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
  declare <vscale x 4 x i32> @llvm.stepvector.nxv4i32() #1

  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
  declare i32 @llvm.vscale.i32() #1

  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
  declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #1

  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
  declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2

  ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
  declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #3

  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
  declare i32 @llvm.usub.sat.i32(i32, i32) #4

  attributes #0 = { nounwind vscale_range(1,16) "frame-pointer"="non-leaf" "target-cpu"="neoverse-v1" "target-features"="+sve" }
  attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
  attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
  attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
  attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }

...
---
name: f
tracksRegLiveness: true
body: |
  bb.0.entry:
    successors: %bb.5(0x30000000), %bb.1(0x50000000)
    liveins: $w0, $w1

    %20:gpr32common = COPY $w1
    %19:gpr32common = COPY $w0
    %21:gpr32common = SUBWrr %20, %19
    dead $wzr = ADDSWri %21, 1, 0, implicit-def $nzcv
    Bcc 0, %bb.5, implicit $nzcv
    B %bb.1

  bb.1.preheader:
    successors: %bb.2(0x40000000), %bb.3(0x40000000)

    %22:gpr32common = ADDWri %19, 1, 0
    %23:gpr32sp = ADDWri %19, 2, 0
    %25:gpr32common = ADDWri %20, 1, 0
    dead $wzr = SUBSWrr killed %25, %22, implicit-def $nzcv
    Bcc 10, %bb.3, implicit $nzcv
    B %bb.2

  bb.2.body.preheader:
    successors: %bb.6(0x80000000)

    %1:gpr32sp = COPY %23
    %55:gpr32sp = ADDWri %21, 1, 0
    %2:gpr32all = COPY %55
    %57:gpr64common = MOVaddr target-flags(aarch64-page) @glb, target-flags(aarch64-pageoff, aarch64-nc) @glb
    B %bb.6

  bb.3.vector.ph:
    successors: %bb.4(0x80000000)

    %29:gpr32common = ADDWri %21, 1, 0
    %30:gpr64 = CNTW_XPiI 31, 1, implicit $vg
    %31:gpr32common = COPY %30.sub_32
    %32:gpr32 = SUBSWrr %29, %31, implicit-def $nzcv
    %33:gpr32 = COPY $wzr
    %34:gpr32 = CSELWr %33, killed %32, 3, implicit $nzcv
    %4:ppr = WHILELO_PWW_S %33, %29, implicit-def dead $nzcv
    %5:zpr = INDEX_II_S 0, 1, implicit $vg
    %6:zpr = DUP_ZR_S %31
    %7:zpr = DUP_ZR_S %22
    %8:zpr = DUP_ZR_S %23
    %27:gpr32all = COPY %33
    %37:gpr64common = MOVaddr target-flags(aarch64-page) @glb, target-flags(aarch64-pageoff, aarch64-nc) @glb
    %39:gpr64common = MOVi64imm -1
    %41:ppr_3b = PTRUE_S 31, implicit $vg
    %44:gpr64common = MOVi64imm 255
    %45:gpr64common = MOVi64imm 511

  bb.4.vector.body:
    successors: %bb.4(0x7c000000), %bb.5(0x04000000)

    %9:gpr32 = PHI %27, %bb.3, %12, %bb.4
    %10:ppr_3b = PHI %4, %bb.3, %13, %bb.4
    %11:zpr = PHI %5, %bb.3, %14, %bb.4
    %35:zpr = ADD_ZZZ_S %11, %7
    %36:gpr32 = COPY %35.ssub
    %38:gpr64sp = ADDXrx %37, killed %36, 50
    ST1W %35, %10, %38, %39 :: (store unknown-size into %ir.18, align 4)
    %40:zpr = ADD_ZZZ_S %11, %8
    %42:zpr = MUL_ZPZZ_S_UNDEF %41, %35, killed %40
    %43:zpr = ASRD_ZPmI_S %41, %42, 1
    ST1W %43, %10, %38, %44 :: (store unknown-size into %ir.22, align 4)
    %46:zpr = LD1W %10, %38, %45 :: (load unknown-size from %ir.23, align 4)
    %47:zpr = ADD_ZZZ_S killed %46, %43
    ST1W killed %47, %10, %38, %45 :: (store unknown-size into %ir.23, align 4)
    %50:gpr32 = ADDWrr %9, %31
    %12:gpr32all = COPY %50
    %13:ppr = WHILELO_PWW_S %9, %34, implicit-def $nzcv
    %14:zpr = ADD_ZZZ_S %11, %6
    Bcc 4, %bb.4, implicit $nzcv
    B %bb.5

  bb.5.exit:
    RET_ReallyLR

  bb.6.body:
    successors: %bb.5(0x04000000), %bb.6(0x7c000000)

    %15:gpr32common = PHI %1, %bb.2, %18, %bb.6
    %16:gpr32sp = PHI %2, %bb.2, %17, %bb.6
    %56:gpr32common = SUBWri %15, 1, 0
    %58:gpr64sp = ADDXrx %57, %56, 50
    STURWi %56, %58, -4 :: (store (s32) into %ir.31)
    %59:gpr32 = MADDWrrr %56, %15, $wzr
    %60:gpr32 = ADDWrs %59, %59, 95
    %61:gpr32 = SBFMWri killed %60, 1, 31
    STRWui %61, %58, 255 :: (store (s32) into %ir.34)
    %62:gpr32 = LDRWui %58, 511 :: (load (s32) from %ir.35)
    %63:gpr32 = ADDWrr killed %62, %61
    STRWui killed %63, %58, 511 :: (store (s32) into %ir.35)
    %64:gpr32 = SUBSWri %16, 1, 0, implicit-def $nzcv
    %17:gpr32all = COPY %64
    %65:gpr32sp = ADDWri %15, 1, 0
    %18:gpr32all = COPY %65
    Bcc 0, %bb.5, implicit $nzcv
    B %bb.6

...
```
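Note: as written, the RUN line has no FileCheck invocation, so the test appears to rely on `--verify-machineinstrs` to catch an illegal schedule. With an assertions-enabled build, the debug output quoted in the commit message should be reproducible by adding `-debug-only=pipeliner` to the llc invocation (an assumption based on the pass's `DEBUG_TYPE`; this flag is not part of the test itself).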
