Skip to content

Commit 809600d

Browse files
author
Francesco Petrogalli
committed
[llvm][sve] Reg + Imm addressing mode for ld1ro.
Reviewers: kmclaughlin, efriedma, sdesmalen

Subscribers: tschuett, hiraditya, psnobl, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D83357
1 parent 8158f0c commit 809600d

File tree

4 files changed

+188
-4
lines changed

4 files changed

+188
-4
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12347,6 +12347,9 @@ static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
1234712347
"Unsupported opcode.");
1234812348
SDLoc DL(N);
1234912349
EVT VT = N->getValueType(0);
12350+
if (VT == MVT::nxv8bf16 &&
12351+
!static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
12352+
return SDValue();
1235012353

1235112354
EVT LoadVT = VT;
1235212355
if (VT.isFloatingPoint())

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -495,6 +495,9 @@ def SImmS4XForm : SDNodeXForm<imm, [{
495495
def SImmS16XForm : SDNodeXForm<imm, [{
496496
return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
497497
}]>;
498+
// Encode a signed immediate as its multiple-of-32 scaled form, as used by
// simm4s32 operands (e.g. the LD1RO reg+imm addressing mode).
def SImmS32XForm : SDNodeXForm<imm, [{
  return CurDAG->getTargetConstant(N->getSExtValue() / 32, SDLoc(N), MVT::i64);
}]>;
498501

499502
// simm6sN predicate - True if the immediate is a multiple of N in the range
500503
// [-32 * N, 31 * N].
@@ -546,7 +549,7 @@ def simm4s16 : Operand<i64>, ImmLeaf<i64,
546549
let DecoderMethod = "DecodeSImm<4>";
547550
}
548551
def simm4s32 : Operand<i64>, ImmLeaf<i64,
549-
[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> {
552+
[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }], SImmS32XForm> {
550553
let PrintMethod = "printImmScale<32>";
551554
let ParserMatchClass = SImm4s32Operand;
552555
let DecoderMethod = "DecodeSImm<4>";

llvm/lib/Target/AArch64/SVEInstrFormats.td

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7728,9 +7728,13 @@ multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
77287728
(!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;
77297729

77307730
// Base addressing mode: no offset, so the immediate operand is zero.
def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), GPR64sp:$base)),
          (!cast<Instruction>(NAME) PPR3bAny:$Pg, GPR64sp:$base, (i64 0))>;
let AddedComplexity = 2 in {
  // Reg + Imm addressing mode: fold an add of a suitable immediate
  // (multiple of 32 in [-256, 224]) into the instruction's imm field.
  def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$Pg), (add GPR64:$base, (i64 simm4s32:$imm)))),
            (!cast<Instruction>(NAME) $Pg, $base, simm4s32:$imm)>;
}
77347738
}
77357739

77367740
class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+f64mm -asm-verbose=0 < %s 2>%t | FileCheck %s
2+
; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
3+
4+
; WARN-NOT: warning
5+
6+
;
7+
; LD1ROB
8+
;
9+
10+
; Offset 32 x i8 = 32 bytes: encodable as the reg+imm form.
define <vscale x 16 x i8> @ld1rob_i8(<vscale x 16 x i1> %pg, i8* %a) nounwind {
; CHECK-LABEL: ld1rob_i8:
; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #32]
; CHECK-NEXT: ret
  %base = getelementptr i8, i8* %a, i64 32
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
  ret <vscale x 16 x i8> %load
}
18+
19+
;
20+
; LD1ROH
21+
;
22+
23+
; Offset 32 x i16 = 64 bytes: encodable as the reg+imm form.
define <vscale x 8 x i16> @ld1roh_i16(<vscale x 8 x i1> %pg, i16* %a) nounwind {
; CHECK-LABEL: ld1roh_i16:
; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64]
; CHECK-NEXT: ret
  %base = getelementptr i16, i16* %a, i64 32
  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1> %pg, i16* %base)
  ret <vscale x 8 x i16> %load
}
31+
32+
; Offset 32 x half = 64 bytes: encodable as the reg+imm form.
define <vscale x 8 x half> @ld1roh_f16(<vscale x 8 x i1> %pg, half* %a) nounwind {
; CHECK-LABEL: ld1roh_f16:
; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64]
; CHECK-NEXT: ret
  %base = getelementptr half, half* %a, i64 32
  %load = call <vscale x 8 x half> @llvm.aarch64.sve.ld1ro.nxv8f16(<vscale x 8 x i1> %pg, half* %base)
  ret <vscale x 8 x half> %load
}
40+
41+
; bfloat variant; requires +bf16 via attributes #0 (see end of file).
define <vscale x 8 x bfloat> @ld1roh_bf16(<vscale x 8 x i1> %pg, bfloat* %a) nounwind #0 {
; CHECK-LABEL: ld1roh_bf16:
; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x0, #64]
; CHECK-NEXT: ret
  %base = getelementptr bfloat, bfloat* %a, i64 32
  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1ro.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base)
  ret <vscale x 8 x bfloat> %load
}
49+
50+
;
51+
; LD1ROW
52+
;
53+
54+
; Offset 32 x i32 = 128 bytes: encodable as the reg+imm form.
define <vscale x 4 x i32> @ld1row_i32(<vscale x 4 x i1> %pg, i32* %a) nounwind {
; CHECK-LABEL: ld1row_i32:
; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128]
; CHECK-NEXT: ret
  %base = getelementptr i32, i32* %a, i64 32
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1> %pg, i32* %base)
  ret <vscale x 4 x i32> %load
}
62+
63+
; Offset 32 x float = 128 bytes: encodable as the reg+imm form.
define <vscale x 4 x float> @ld1row_f32(<vscale x 4 x i1> %pg, float* %a) nounwind {
; CHECK-LABEL: ld1row_f32:
; CHECK-NEXT: ld1row { z0.s }, p0/z, [x0, #128]
; CHECK-NEXT: ret
  %base = getelementptr float, float* %a, i64 32
  %load = call <vscale x 4 x float> @llvm.aarch64.sve.ld1ro.nxv4f32(<vscale x 4 x i1> %pg, float* %base)
  ret <vscale x 4 x float> %load
}
71+
72+
;
73+
; LD1ROD
74+
;
75+
76+
; Offset -8 x i64 = -64 bytes: negative multiple of 32, still encodable.
define <vscale x 2 x i64> @ld1rod_i64(<vscale x 2 x i1> %pg, i64* %a) nounwind {
; CHECK-LABEL: ld1rod_i64:
; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-64]
; CHECK-NEXT: ret
  %base = getelementptr i64, i64* %a, i64 -8
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1> %pg, i64* %base)
  ret <vscale x 2 x i64> %load
}
84+
85+
; Offset -16 x double = -128 bytes: negative multiple of 32, still encodable.
define <vscale x 2 x double> @ld1rod_f64(<vscale x 2 x i1> %pg, double* %a) nounwind {
; CHECK-LABEL: ld1rod_f64:
; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #-128]
; CHECK-NEXT: ret
  %base = getelementptr double, double* %a, i64 -16
  %load = call <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1> %pg, double* %base)
  ret <vscale x 2 x double> %load
}
93+
94+
95+
;;;;;;;;;;;;;;
96+
; range checks: immediate must be a multiple of 32 in the range -256, ..., 224
97+
98+
; lower bound
99+
; -256 is the smallest encodable immediate.
define <vscale x 16 x i8> @ld1rob_i8_lower_bound(<vscale x 16 x i1> %pg, i8* %a) nounwind {
; CHECK-LABEL: ld1rob_i8_lower_bound:
; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, #-256]
; CHECK-NEXT: ret
  %base = getelementptr i8, i8* %a, i64 -256
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
  ret <vscale x 16 x i8> %load
}
107+
108+
; below lower bound
109+
; -129 x i16 = -258 bytes: below -256, so the offset must be materialized.
define <vscale x 8 x i16> @ld1roh_i16_below_lower_bound(<vscale x 8 x i1> %pg, i16* %a) nounwind {
; CHECK-LABEL: ld1roh_i16_below_lower_bound:
; CHECK-NEXT: sub x[[BASE:[0-9]+]], x0, #258
; CHECK-NEXT: ld1roh { z0.h }, p0/z, [x[[BASE]]]
; CHECK-NEXT: ret
  %base = getelementptr i16, i16* %a, i64 -129
  %load = call <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1> %pg, i16* %base)
  ret <vscale x 8 x i16> %load
}
118+
119+
; -257 bytes: below -256, so the reg+reg form is used instead.
define <vscale x 16 x i8> @ld1rob_i8_below_lower_bound_01(<vscale x 16 x i1> %pg, i8* %a) nounwind {
; CHECK-LABEL: ld1rob_i8_below_lower_bound_01:
; CHECK-NEXT: mov x[[OFFSET:[0-9]+]], #-257
; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]]
; CHECK-NEXT: ret
  %base = getelementptr i8, i8* %a, i64 -257
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
  ret <vscale x 16 x i8> %load
}
128+
129+
; not a multiple of 32
130+
; 3 x i32 = 12 bytes: not a multiple of 32, so the offset must be materialized.
define <vscale x 4 x i32> @ld1row_i32_not_multiple(<vscale x 4 x i1> %pg, i32* %a) nounwind {
; CHECK-LABEL: ld1row_i32_not_multiple:
; CHECK-NEXT: add x[[BASE:[0-9]+]], x0, #12
; CHECK-NEXT: ld1row { z0.s }, p0/z, [x[[BASE]]]
; CHECK-NEXT: ret
  %base = getelementptr i32, i32* %a, i64 3
  %load = call <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1> %pg, i32* %base)
  ret <vscale x 4 x i32> %load
}
139+
140+
; upper bound
141+
; 28 x i64 = 224 bytes: the largest encodable immediate.
define <vscale x 2 x i64> @ld1rod_i64_upper_bound(<vscale x 2 x i1> %pg, i64* %a) nounwind {
; CHECK-LABEL: ld1rod_i64_upper_bound:
; CHECK-NEXT: ld1rod { z0.d }, p0/z, [x0, #224]
; CHECK-NEXT: ret
  %base = getelementptr i64, i64* %a, i64 28
  %load = call <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1> %pg, i64* %base)
  ret <vscale x 2 x i64> %load
}
149+
150+
; 225 bytes: beyond 224, so the reg+reg form is used instead.
define <vscale x 16 x i8> @ld1rob_i8_beyond_upper_bound(<vscale x 16 x i1> %pg, i8* %a) nounwind {
; CHECK-LABEL: ld1rob_i8_beyond_upper_bound:
; CHECK-NEXT: mov w[[OFFSET:[0-9]+]], #225
; CHECK-NEXT: ld1rob { z0.b }, p0/z, [x0, x[[OFFSET]]]
; CHECK-NEXT: ret
  %base = getelementptr i8, i8* %a, i64 225
  %load = call <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1> %pg, i8* %base)
  ret <vscale x 16 x i8> %load
}
159+
160+
declare <vscale x 16 x i8> @llvm.aarch64.sve.ld1ro.nxv16i8(<vscale x 16 x i1>, i8*)
161+
162+
declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1ro.nxv8i16(<vscale x 8 x i1>, i16*)
163+
declare <vscale x 8 x half> @llvm.aarch64.sve.ld1ro.nxv8f16(<vscale x 8 x i1>, half*)
164+
declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1ro.nxv8bf16(<vscale x 8 x i1>, bfloat*)
165+
166+
declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1ro.nxv4i32(<vscale x 4 x i1>, i32*)
167+
declare <vscale x 4 x float> @llvm.aarch64.sve.ld1ro.nxv4f32(<vscale x 4 x i1>, float*)
168+
169+
declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1ro.nxv2i64(<vscale x 2 x i1>, i64*)
170+
declare <vscale x 2 x double> @llvm.aarch64.sve.ld1ro.nxv2f64(<vscale x 2 x i1>, double*)
171+
172+
173+
; +bf16 is required for the bfloat version.
174+
attributes #0 = { "target-features"="+sve,+f64mm,+bf16" }

0 commit comments

Comments (0)