
Commit c3e3aa9

[AArch64][SVE2] Generate XAR (llvm#77160)
Bitwise exclusive OR and rotate right by immediate.

Select xar (x, y, imm) for the following pattern:
    or (shl (xor x, y), nBits-imm), (shr (xor x, y), imm)
This is essentially:
    rotr (xor(x, y), imm)
1 parent 4619e21 commit c3e3aa9
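
For illustration only (this example is not part of the commit; it mirrors the shape of the tests added below, and the function name is just for the example), a constant rotate of an XOR of two scalable vectors, written with the funnel-shift intrinsic, is the kind of IR that now selects to a single xar when SVE2 is available:

    ; Expected with -mattr=+sve2:  xar z0.d, z0.d, z1.d, #4
    ; Expected with -mattr=+sve:   an eor/lsl/lsr/orr sequence
    define <vscale x 2 x i64> @rotr_of_xor(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
        %a = xor <vscale x 2 x i64> %x, %y
        %b = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 4))
        ret <vscale x 2 x i64> %b
    }

    declare <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)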

File tree

3 files changed: +294 -1 lines changed

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 53 additions & 1 deletion
@@ -4275,6 +4275,58 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {

   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+
+  // Essentially: rotr (xor(x, y), imm) -> xar (x, y, imm)
+  // Rotate by a constant is a funnel shift in IR which is expanded to
+  // an OR with shifted operands.
+  // We do the following transform:
+  // OR N0, N1 -> xar (x, y, imm)
+  // Where:
+  // N1 = SRL_PRED true, V, splat(imm)  --> rotr amount
+  // N0 = SHL_PRED true, V, splat(bits-imm)
+  // V = (xor x, y)
+  if (VT.isScalableVector() && Subtarget->hasSVE2orSME()) {
+    if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+        N1.getOpcode() != AArch64ISD::SRL_PRED)
+      std::swap(N0, N1);
+    if (N0.getOpcode() != AArch64ISD::SHL_PRED ||
+        N1.getOpcode() != AArch64ISD::SRL_PRED)
+      return false;
+
+    auto *TLI = static_cast<const AArch64TargetLowering *>(getTargetLowering());
+    if (!TLI->isAllActivePredicate(*CurDAG, N0.getOperand(0)) ||
+        !TLI->isAllActivePredicate(*CurDAG, N1.getOperand(0)))
+      return false;
+
+    SDValue XOR = N0.getOperand(1);
+    if (XOR.getOpcode() != ISD::XOR || XOR != N1.getOperand(1))
+      return false;
+
+    APInt ShlAmt, ShrAmt;
+    if (!ISD::isConstantSplatVector(N0.getOperand(2).getNode(), ShlAmt) ||
+        !ISD::isConstantSplatVector(N1.getOperand(2).getNode(), ShrAmt))
+      return false;
+
+    if (ShlAmt + ShrAmt != VT.getScalarSizeInBits())
+      return false;
+
+    SDLoc DL(N);
+    SDValue Imm =
+        CurDAG->getTargetConstant(ShrAmt.getZExtValue(), DL, MVT::i32);
+
+    SDValue Ops[] = {XOR.getOperand(0), XOR.getOperand(1), Imm};
+    if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::Int>(
+            VT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S,
+                 AArch64::XAR_ZZZI_D})) {
+      CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+      return true;
+    }
+    return false;
+  }
+
+  if (!Subtarget->hasSHA3())
+    return false;

   if (N0->getOpcode() != AArch64ISD::VSHL ||
       N1->getOpcode() != AArch64ISD::VLSHR)
@@ -4367,7 +4419,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
   case ISD::OR:
     if (tryBitfieldInsertOp(Node))
       return;
-    if (Subtarget->hasSHA3() && trySelectXAR(Node))
+    if (trySelectXAR(Node))
       return;
     break;
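
As a concrete illustration of what the new scalable-vector path matches (a sketch, not part of the commit; the function name is hypothetical): by the time trySelectXAR runs, a constant rotate of an XOR has been expanded into a SHL_PRED and an SRL_PRED of the same xor value with splat shift amounts, and the path fires when those amounts sum to the element size. For 64-bit elements rotated right by 4 the amounts are 60 and 4, so the OR node is rewritten to xar with immediate 4. The equivalent IR with explicit shifts:

    ; With -mattr=+sve2 the or of the two shifts below is selected to
    ; xar z0.d, z0.d, z1.d, #4, since 60 + 4 equals the element size 64.
    define <vscale x 2 x i64> @rotr_xor_explicit_shifts(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
        %xor = xor <vscale x 2 x i64> %x, %y
        %shl = shl <vscale x 2 x i64> %xor, splat (i64 60)
        %shr = lshr <vscale x 2 x i64> %xor, splat (i64 4)
        %or = or <vscale x 2 x i64> %shl, %shr
        ret <vscale x 2 x i64> %or
    }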

llvm/lib/Target/AArch64/AArch64Subtarget.h

Lines changed: 1 addition & 0 deletions
@@ -394,6 +394,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
   void mirFileLoaded(MachineFunction &MF) const override;

   bool hasSVEorSME() const { return hasSVE() || hasSME(); }
+  bool hasSVE2orSME() const { return hasSVE2() || hasSME(); }

   // Return the known range for the bit length of SVE data registers. A value
   // of 0 means nothing is known about that particular limit beyond what's

llvm/test/CodeGen/AArch64/sve2-xar.ll

Lines changed: 240 additions & 0 deletions
@@ -0,0 +1,240 @@ (new file)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=aarch64 -mattr=+sve < %s -o - | FileCheck --check-prefixes=CHECK,SVE %s
; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s -o - | FileCheck --check-prefixes=CHECK,SVE2 %s

define <vscale x 2 x i64> @xar_nxv2i64_l(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; SVE-LABEL: xar_nxv2i64_l:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsr z1.d, z0.d, #4
; SVE-NEXT:    lsl z0.d, z0.d, #60
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv2i64_l:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #4
; SVE2-NEXT:    ret
    %a = xor <vscale x 2 x i64> %x, %y
    %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 60))
    ret <vscale x 2 x i64> %b
}

define <vscale x 2 x i64> @xar_nxv2i64_r(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; SVE-LABEL: xar_nxv2i64_r:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsl z1.d, z0.d, #60
; SVE-NEXT:    lsr z0.d, z0.d, #4
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv2i64_r:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #4
; SVE2-NEXT:    ret
    %a = xor <vscale x 2 x i64> %x, %y
    %b = call <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 4))
    ret <vscale x 2 x i64> %b
}


define <vscale x 4 x i32> @xar_nxv4i32_l(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; SVE-LABEL: xar_nxv4i32_l:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsr z1.s, z0.s, #4
; SVE-NEXT:    lsl z0.s, z0.s, #28
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv4i32_l:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #4
; SVE2-NEXT:    ret
    %a = xor <vscale x 4 x i32> %x, %y
    %b = call <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 28))
    ret <vscale x 4 x i32> %b
}

define <vscale x 4 x i32> @xar_nxv4i32_r(<vscale x 4 x i32> %x, <vscale x 4 x i32> %y) {
; SVE-LABEL: xar_nxv4i32_r:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsl z1.s, z0.s, #28
; SVE-NEXT:    lsr z0.s, z0.s, #4
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv4i32_r:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.s, z0.s, z1.s, #4
; SVE2-NEXT:    ret
    %a = xor <vscale x 4 x i32> %x, %y
    %b = call <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %a, <vscale x 4 x i32> splat (i32 4))
    ret <vscale x 4 x i32> %b
}

define <vscale x 8 x i16> @xar_nxv8i16_l(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
; SVE-LABEL: xar_nxv8i16_l:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsr z1.h, z0.h, #4
; SVE-NEXT:    lsl z0.h, z0.h, #12
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv8i16_l:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #4
; SVE2-NEXT:    ret
    %a = xor <vscale x 8 x i16> %x, %y
    %b = call <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 12))
    ret <vscale x 8 x i16> %b
}

define <vscale x 8 x i16> @xar_nxv8i16_r(<vscale x 8 x i16> %x, <vscale x 8 x i16> %y) {
; SVE-LABEL: xar_nxv8i16_r:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsl z1.h, z0.h, #12
; SVE-NEXT:    lsr z0.h, z0.h, #4
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv8i16_r:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.h, z0.h, z1.h, #4
; SVE2-NEXT:    ret
    %a = xor <vscale x 8 x i16> %x, %y
    %b = call <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %a, <vscale x 8 x i16> splat (i16 4))
    ret <vscale x 8 x i16> %b
}

define <vscale x 16 x i8> @xar_nxv16i8_l(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
; SVE-LABEL: xar_nxv16i8_l:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsr z1.b, z0.b, #4
; SVE-NEXT:    lsl z0.b, z0.b, #4
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv16i8_l:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #4
; SVE2-NEXT:    ret
    %a = xor <vscale x 16 x i8> %x, %y
    %b = call <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 4))
    ret <vscale x 16 x i8> %b
}

define <vscale x 16 x i8> @xar_nxv16i8_r(<vscale x 16 x i8> %x, <vscale x 16 x i8> %y) {
; SVE-LABEL: xar_nxv16i8_r:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsl z1.b, z0.b, #4
; SVE-NEXT:    lsr z0.b, z0.b, #4
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv16i8_r:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.b, z0.b, z1.b, #4
; SVE2-NEXT:    ret
    %a = xor <vscale x 16 x i8> %x, %y
    %b = call <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %a, <vscale x 16 x i8> splat (i8 4))
    ret <vscale x 16 x i8> %b
}

; Shift is not a constant.
define <vscale x 2 x i64> @xar_nxv2i64_l_neg1(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y, <vscale x 2 x i64> %z) {
; CHECK-LABEL: xar_nxv2i64_l_neg1:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z3.d, z2.d
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    subr z2.d, z2.d, #0 // =0x0
; CHECK-NEXT:    eor z0.d, z0.d, z1.d
; CHECK-NEXT:    and z2.d, z2.d, #0x3f
; CHECK-NEXT:    and z3.d, z3.d, #0x3f
; CHECK-NEXT:    movprfx z1, z0
; CHECK-NEXT:    lsl z1.d, p0/m, z1.d, z3.d
; CHECK-NEXT:    lsr z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT:    orr z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
    %a = xor <vscale x 2 x i64> %x, %y
    %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> %z)
    ret <vscale x 2 x i64> %b
}

; OR instead of an XOR.
; TODO: We could use usra instruction here for SVE2.
define <vscale x 2 x i64> @xar_nxv2i64_l_neg2(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; CHECK-LABEL: xar_nxv2i64_l_neg2:
; CHECK:       // %bb.0:
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    lsr z1.d, z0.d, #4
; CHECK-NEXT:    lsl z0.d, z0.d, #60
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
    %a = or <vscale x 2 x i64> %x, %y
    %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 60))
    ret <vscale x 2 x i64> %b
}

; Rotate amount is 0.
define <vscale x 2 x i64> @xar_nxv2i64_l_neg3(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; CHECK-LABEL: xar_nxv2i64_l_neg3:
; CHECK:       // %bb.0:
; CHECK-NEXT:    eor z0.d, z0.d, z1.d
; CHECK-NEXT:    ret
    %a = xor <vscale x 2 x i64> %x, %y
    %b = call <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %a, <vscale x 2 x i64> splat (i64 64))
    ret <vscale x 2 x i64> %b
}

; Uses individual shifts instead of funnel shifts, just one test.
define <vscale x 2 x i64> @xar_nxv2i64_shifts(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; SVE-LABEL: xar_nxv2i64_shifts:
; SVE:       // %bb.0:
; SVE-NEXT:    eor z0.d, z0.d, z1.d
; SVE-NEXT:    lsr z1.d, z0.d, #4
; SVE-NEXT:    lsl z0.d, z0.d, #60
; SVE-NEXT:    orr z0.d, z0.d, z1.d
; SVE-NEXT:    ret
;
; SVE2-LABEL: xar_nxv2i64_shifts:
; SVE2:       // %bb.0:
; SVE2-NEXT:    xar z0.d, z0.d, z1.d, #4
; SVE2-NEXT:    ret
    %xor = xor <vscale x 2 x i64> %x, %y
    %shl = shl <vscale x 2 x i64> %xor, splat (i64 60)
    %shr = lshr <vscale x 2 x i64> %xor, splat (i64 4)
    %or = or <vscale x 2 x i64> %shl, %shr
    ret <vscale x 2 x i64> %or
}

; Not a rotate operation as 60 + 3 != 64
define <vscale x 2 x i64> @xar_nxv2i64_shifts_neg(<vscale x 2 x i64> %x, <vscale x 2 x i64> %y) {
; CHECK-LABEL: xar_nxv2i64_shifts_neg:
; CHECK:       // %bb.0:
; CHECK-NEXT:    eor z0.d, z0.d, z1.d
; CHECK-NEXT:    lsl z1.d, z0.d, #60
; CHECK-NEXT:    lsr z0.d, z0.d, #3
; CHECK-NEXT:    orr z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
    %xor = xor <vscale x 2 x i64> %x, %y
    %shl = shl <vscale x 2 x i64> %xor, splat (i64 60)
    %shr = lshr <vscale x 2 x i64> %xor, splat (i64 3)
    %or = or <vscale x 2 x i64> %shl, %shr
    ret <vscale x 2 x i64> %or
}

declare <vscale x 2 x i64> @llvm.fshl.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 4 x i32> @llvm.fshl.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 8 x i16> @llvm.fshl.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 16 x i8> @llvm.fshl.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
declare <vscale x 2 x i64> @llvm.fshr.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 4 x i32> @llvm.fshr.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
declare <vscale x 8 x i16> @llvm.fshr.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
declare <vscale x 16 x i8> @llvm.fshr.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
