Skip to content

Commit 404c919

Browse files
committed
[AArch64] Extend usage of XAR instruction for fixed-length operations
Resolves #139229. In #137162, support for `v2i64` was implemented for the vector rotate transformation. Although types like `v4i32`, `v8i16` and `v16i8` do not have a Neon SHA3 XAR instruction, we can use SVE operations for them if sve2-sha3 is available.
1 parent ee91f9b commit 404c919

File tree

2 files changed

+80
-6
lines changed

2 files changed

+80
-6
lines changed

llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4632,22 +4632,55 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
46324632
SDValue Imm = CurDAG->getTargetConstant(
46334633
ShAmt, DL, N0.getOperand(1).getValueType(), false);
46344634

4635-
if (ShAmt + HsAmt != 64)
4635+
unsigned VTSizeInBits =
4636+
(Subtarget->hasSVE2() ? VT.getScalarSizeInBits() : 64);
4637+
if (ShAmt + HsAmt != VTSizeInBits)
46364638
return false;
46374639

4640+
// We have Neon SHA3 XAR operation for v2i64 but for types
4641+
// v4i32, v8i16, v16i8 we can use SVE operations when SVE2-SHA3
4642+
// is available.
4643+
EVT SVT = MVT::v2i64;
4644+
switch (VT.getSimpleVT().SimpleTy) {
4645+
case MVT::v4i32:
4646+
SVT = MVT::nxv4i32;
4647+
break;
4648+
case MVT::v8i16:
4649+
SVT = MVT::nxv8i16;
4650+
break;
4651+
case MVT::v16i8:
4652+
SVT = MVT::nxv16i8;
4653+
break;
4654+
default:
4655+
if (!(VT == MVT::v2i64 || VT == MVT::v1i64))
4656+
return false;
4657+
break;
4658+
}
4659+
46384660
if (!IsXOROperand) {
46394661
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i64);
4640-
SDNode *MOV =
4641-
CurDAG->getMachineNode(AArch64::MOVIv2d_ns, DL, MVT::v2i64, Zero);
4662+
SDNode *MOV = CurDAG->getMachineNode(AArch64::MOVIv2d_ns, DL, SVT, Zero);
46424663
SDValue MOVIV = SDValue(MOV, 0);
4664+
46434665
R1 = N1->getOperand(0);
46444666
R2 = MOVIV;
46454667
}
46464668

4669+
if (SVT != VT && VT != MVT::v1i64) {
4670+
SDValue Undef =
4671+
SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, SVT), 0);
4672+
SDValue ZSub = CurDAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
4673+
4674+
R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, Undef,
4675+
R1, ZSub),
4676+
0);
4677+
R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, Undef,
4678+
R2, ZSub),
4679+
0);
4680+
}
4681+
46474682
// If the input is a v1i64, widen to a v2i64 to use XAR.
4648-
assert((VT == MVT::v1i64 || VT == MVT::v2i64) && "Unexpected XAR type!");
46494683
if (VT == MVT::v1i64) {
4650-
EVT SVT = MVT::v2i64;
46514684
SDValue Undef =
46524685
SDValue(CurDAG->getMachineNode(AArch64::IMPLICIT_DEF, DL, SVT), 0);
46534686
SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
@@ -4661,12 +4694,28 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) {
46614694
}
46624695

46634696
SDValue Ops[] = {R1, R2, Imm};
4664-
SDNode *XAR = CurDAG->getMachineNode(AArch64::XAR, DL, MVT::v2i64, Ops);
4697+
SDNode *XAR = nullptr;
4698+
4699+
if (SVT != VT && VT != MVT::v1i64) {
4700+
if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::Int>(
4701+
SVT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S,
4702+
AArch64::XAR_ZZZI_D}))
4703+
XAR = CurDAG->getMachineNode(Opc, DL, VT, Ops);
4704+
} else {
4705+
XAR = CurDAG->getMachineNode(AArch64::XAR, DL, MVT::v2i64, Ops);
4706+
}
4707+
4708+
if (!XAR)
4709+
return false;
46654710

46664711
if (VT == MVT::v1i64) {
46674712
SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
46684713
XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT,
46694714
SDValue(XAR, 0), DSub);
4715+
} else if (SVT != VT) {
4716+
SDValue ZSub = CurDAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
4717+
XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT,
4718+
SDValue(XAR, 0), ZSub);
46704719
}
46714720
ReplaceNode(N, XAR);
46724721
return true;

llvm/test/CodeGen/AArch64/xar.ll

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
22
; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s
33
; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s
4+
; RUN: llc -mtriple=aarch64 -mattr=+sve2,sve2-sha3,+sha3 < %s | FileCheck --check-prefix=SVE2SHA3 %s
45

56
define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) {
67
; SHA3-LABEL: xar:
@@ -90,6 +91,14 @@ define <4 x i32> @xar_instead_of_or2(<4 x i32> %r) {
9091
; NOSHA3-NEXT: usra v1.4s, v0.4s, #7
9192
; NOSHA3-NEXT: mov v0.16b, v1.16b
9293
; NOSHA3-NEXT: ret
94+
;
95+
; SVE2SHA3-LABEL: xar_instead_of_or2:
96+
; SVE2SHA3: // %bb.0: // %entry
97+
; SVE2SHA3-NEXT: movi v1.2d, #0000000000000000
98+
; SVE2SHA3-NEXT: // kill: def $q0 killed $q0 def $z0
99+
; SVE2SHA3-NEXT: xar z0.s, z0.s, z1.s, #7
100+
; SVE2SHA3-NEXT: // kill: def $q0 killed $q0 killed $z0
101+
; SVE2SHA3-NEXT: ret
93102
entry:
94103
%or = call <4 x i32> @llvm.fshl.v2i32(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 25))
95104
ret <4 x i32> %or
@@ -109,6 +118,14 @@ define <8 x i16> @xar_instead_of_or3(<8 x i16> %r) {
109118
; NOSHA3-NEXT: usra v1.8h, v0.8h, #7
110119
; NOSHA3-NEXT: mov v0.16b, v1.16b
111120
; NOSHA3-NEXT: ret
121+
;
122+
; SVE2SHA3-LABEL: xar_instead_of_or3:
123+
; SVE2SHA3: // %bb.0: // %entry
124+
; SVE2SHA3-NEXT: movi v1.2d, #0000000000000000
125+
; SVE2SHA3-NEXT: // kill: def $q0 killed $q0 def $z0
126+
; SVE2SHA3-NEXT: xar z0.h, z0.h, z1.h, #7
127+
; SVE2SHA3-NEXT: // kill: def $q0 killed $q0 killed $z0
128+
; SVE2SHA3-NEXT: ret
112129
entry:
113130
%or = call <8 x i16> @llvm.fshl.v2i16(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 25))
114131
ret <8 x i16> %or
@@ -128,6 +145,14 @@ define <16 x i8> @xar_instead_of_or4(<16 x i8> %r) {
128145
; NOSHA3-NEXT: usra v1.16b, v0.16b, #7
129146
; NOSHA3-NEXT: mov v0.16b, v1.16b
130147
; NOSHA3-NEXT: ret
148+
;
149+
; SVE2SHA3-LABEL: xar_instead_of_or4:
150+
; SVE2SHA3: // %bb.0: // %entry
151+
; SVE2SHA3-NEXT: movi v1.2d, #0000000000000000
152+
; SVE2SHA3-NEXT: // kill: def $q0 killed $q0 def $z0
153+
; SVE2SHA3-NEXT: xar z0.b, z0.b, z1.b, #7
154+
; SVE2SHA3-NEXT: // kill: def $q0 killed $q0 killed $z0
155+
; SVE2SHA3-NEXT: ret
131156
entry:
132157
%or = call <16 x i8> @llvm.fshl.v2i8(<16 x i8> %r, <16 x i8> %r, <16 x i8> splat (i8 25))
133158
ret <16 x i8> %or

0 commit comments

Comments
 (0)