Skip to content

Commit 62fc58a

Browse files
committed
[AArch64] Improve codegen for "trunc <4 x i64> to <4 x i8>" for all cases
To achieve this, we need this observation: `uzp1` is just an `xtn` that operates on two registers.

For example, given the following register with type v2i64:

    LSB_______MSB
    x0 x1 x2 x3

Applying xtn on it we get:

    x0 x2

This is equivalent to bitcasting it to v4i32, and then applying uzp1 on it:

    x0 x1 x2 x3
         |
        uzp1
         v
    x0 x2 <value from other register>

We can transform xtn to uzp1 by this observation, and vice versa.

This observation only works on little-endian targets. Big-endian targets have a problem: the uzp1 cannot be replaced by xtn, since there is a discrepancy in the behavior of uzp1 between little endian and big endian. To illustrate, take the following for example:

    LSB____________________MSB
    x0 x1 x2 x3

On little endian, uzp1 grabs x0 and x2, which is right; on big endian, it grabs x3 and x1, which doesn't match what I saw in the documentation. But, since I'm new to AArch64, take my word with a pinch of salt. This behavior was observed in gdb; maybe there is an issue with the order in which it prints the values?

Whatever the reason is, the execution result given by qemu just doesn't match. So I disable this on big-endian targets temporarily until we find the crux.

Fixes #57502

Reviewed By: dmgreen, mingmingl

Co-authored-by: Mingming Liu <[email protected]>

Differential Revision: https://reviews.llvm.org/D133850
1 parent b13f7f9 commit 62fc58a

File tree

4 files changed

+86
-28
lines changed

4 files changed

+86
-28
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18029,7 +18029,76 @@ static SDValue performUzpCombine(SDNode *N, SelectionDAG &DAG) {
1802918029
}
1803018030
}
1803118031

18032-
return SDValue();
18032+
// uzp1(xtn x, xtn y) -> xtn(uzp1 (x, y))
18033+
// Only implemented on little-endian subtargets.
18034+
bool IsLittleEndian = DAG.getDataLayout().isLittleEndian();
18035+
18036+
// This optimization only works on little endian.
18037+
if (!IsLittleEndian)
18038+
return SDValue();
18039+
18040+
if (ResVT != MVT::v2i32 && ResVT != MVT::v4i16 && ResVT != MVT::v8i8)
18041+
return SDValue();
18042+
18043+
auto getSourceOp = [](SDValue Operand) -> SDValue {
18044+
const unsigned Opcode = Operand.getOpcode();
18045+
if (Opcode == ISD::TRUNCATE)
18046+
return Operand->getOperand(0);
18047+
if (Opcode == ISD::BITCAST &&
18048+
Operand->getOperand(0).getOpcode() == ISD::TRUNCATE)
18049+
return Operand->getOperand(0)->getOperand(0);
18050+
return SDValue();
18051+
};
18052+
18053+
SDValue SourceOp0 = getSourceOp(Op0);
18054+
SDValue SourceOp1 = getSourceOp(Op1);
18055+
18056+
if (!SourceOp0 || !SourceOp1)
18057+
return SDValue();
18058+
18059+
if (SourceOp0.getValueType() != SourceOp1.getValueType() ||
18060+
!SourceOp0.getValueType().isSimple())
18061+
return SDValue();
18062+
18063+
EVT ResultTy;
18064+
18065+
switch (SourceOp0.getSimpleValueType().SimpleTy) {
18066+
case MVT::v2i64:
18067+
ResultTy = MVT::v4i32;
18068+
break;
18069+
case MVT::v4i32:
18070+
ResultTy = MVT::v8i16;
18071+
break;
18072+
case MVT::v8i16:
18073+
ResultTy = MVT::v16i8;
18074+
break;
18075+
default:
18076+
return SDValue();
18077+
}
18078+
18079+
SDValue UzpOp0 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp0);
18080+
SDValue UzpOp1 = DAG.getNode(ISD::BITCAST, DL, ResultTy, SourceOp1);
18081+
SDValue UzpResult =
18082+
DAG.getNode(AArch64ISD::UZP1, DL, UzpOp0.getValueType(), UzpOp0, UzpOp1);
18083+
18084+
EVT BitcastResultTy;
18085+
18086+
switch (ResVT.getSimpleVT().SimpleTy) {
18087+
case MVT::v2i32:
18088+
BitcastResultTy = MVT::v2i64;
18089+
break;
18090+
case MVT::v4i16:
18091+
BitcastResultTy = MVT::v4i32;
18092+
break;
18093+
case MVT::v8i8:
18094+
BitcastResultTy = MVT::v8i16;
18095+
break;
18096+
default:
18097+
llvm_unreachable("Should be one of {v2i32, v4i16, v8i8}");
18098+
}
18099+
18100+
return DAG.getNode(ISD::TRUNCATE, DL, ResVT,
18101+
DAG.getNode(ISD::BITCAST, DL, BitcastResultTy, UzpResult));
1803318102
}
1803418103

1803518104
static SDValue performGLD1Combine(SDNode *N, SelectionDAG &DAG) {

llvm/test/CodeGen/AArch64/aarch64-uzp1-combine.ll

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,11 @@
22
; RUN: llc < %s -mtriple aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LE %s
33
; RUN: llc < %s -mtriple aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
44

5-
; Test cases to show when UZP1 (TRUNC, TRUNC) could be combined to TRUNC (UZP1) but not yet implemented.
6-
75
define <4 x i16> @test_combine_v4i16_v2i64(<2 x i64> %a, <2 x i64> %b) {
86
; CHECK-LE-LABEL: test_combine_v4i16_v2i64:
97
; CHECK-LE: // %bb.0:
10-
; CHECK-LE-NEXT: xtn v0.2s, v0.2d
11-
; CHECK-LE-NEXT: xtn v1.2s, v1.2d
12-
; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
8+
; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
9+
; CHECK-LE-NEXT: xtn v0.4h, v0.4s
1310
; CHECK-LE-NEXT: ret
1411
;
1512
; CHECK-BE-LABEL: test_combine_v4i16_v2i64:
@@ -36,9 +33,8 @@ define <4 x i16> @test_combine_v4i16_v2i64(<2 x i64> %a, <2 x i64> %b) {
3633
define <4 x i16> @test_combine_v4i16_v4i32(<4 x i32> %a, <4 x i32> %b) {
3734
; CHECK-LE-LABEL: test_combine_v4i16_v4i32:
3835
; CHECK-LE: // %bb.0:
36+
; CHECK-LE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
3937
; CHECK-LE-NEXT: xtn v0.4h, v0.4s
40-
; CHECK-LE-NEXT: xtn v1.4h, v1.4s
41-
; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
4238
; CHECK-LE-NEXT: ret
4339
;
4440
; CHECK-BE-LABEL: test_combine_v4i16_v4i32:
@@ -62,9 +58,8 @@ define <4 x i16> @test_combine_v4i16_v4i32(<4 x i32> %a, <4 x i32> %b) {
6258
define <4 x i16> @test_combine_v4i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
6359
; CHECK-LE-LABEL: test_combine_v4i16_v8i16:
6460
; CHECK-LE: // %bb.0:
65-
; CHECK-LE-NEXT: xtn v0.8b, v0.8h
66-
; CHECK-LE-NEXT: xtn v1.8b, v1.8h
67-
; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
61+
; CHECK-LE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
62+
; CHECK-LE-NEXT: xtn v0.4h, v0.4s
6863
; CHECK-LE-NEXT: ret
6964
;
7065
; CHECK-BE-LABEL: test_combine_v4i16_v8i16:
@@ -94,9 +89,8 @@ define <4 x i16> @test_combine_v4i16_v8i16(<8 x i16> %a, <8 x i16> %b) {
9489
define <8 x i8> @test_combine_v8i8_v2i64(<2 x i64> %a, <2 x i64> %b) {
9590
; CHECK-LE-LABEL: test_combine_v8i8_v2i64:
9691
; CHECK-LE: // %bb.0:
97-
; CHECK-LE-NEXT: xtn v0.2s, v0.2d
98-
; CHECK-LE-NEXT: xtn v1.2s, v1.2d
99-
; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
92+
; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
93+
; CHECK-LE-NEXT: xtn v0.8b, v0.8h
10094
; CHECK-LE-NEXT: ret
10195
;
10296
; CHECK-BE-LABEL: test_combine_v8i8_v2i64:
@@ -123,9 +117,8 @@ define <8 x i8> @test_combine_v8i8_v2i64(<2 x i64> %a, <2 x i64> %b) {
123117
define <8 x i8> @test_combine_v8i8_v4i32(<4 x i32> %a, <4 x i32> %b) {
124118
; CHECK-LE-LABEL: test_combine_v8i8_v4i32:
125119
; CHECK-LE: // %bb.0:
126-
; CHECK-LE-NEXT: xtn v0.4h, v0.4s
127-
; CHECK-LE-NEXT: xtn v1.4h, v1.4s
128-
; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
120+
; CHECK-LE-NEXT: uzp1 v0.8h, v0.8h, v1.8h
121+
; CHECK-LE-NEXT: xtn v0.8b, v0.8h
129122
; CHECK-LE-NEXT: ret
130123
;
131124
; CHECK-BE-LABEL: test_combine_v8i8_v4i32:
@@ -154,9 +147,8 @@ define <8 x i8> @test_combine_v8i8_v4i32(<4 x i32> %a, <4 x i32> %b) {
154147
define <8 x i8> @test_combine_v8i8_v8i16(<8 x i16> %a, <8 x i16> %b) {
155148
; CHECK-LE-LABEL: test_combine_v8i8_v8i16:
156149
; CHECK-LE: // %bb.0:
150+
; CHECK-LE-NEXT: uzp1 v0.16b, v0.16b, v1.16b
157151
; CHECK-LE-NEXT: xtn v0.8b, v0.8h
158-
; CHECK-LE-NEXT: xtn v1.8b, v1.8h
159-
; CHECK-LE-NEXT: uzp1 v0.8b, v0.8b, v1.8b
160152
; CHECK-LE-NEXT: ret
161153
;
162154
; CHECK-BE-LABEL: test_combine_v8i8_v8i16:
@@ -266,9 +258,8 @@ define <2 x i32> @test_combine_v2i32_v8i16(<8 x i16> %a, <8 x i16> %b) {
266258
define i8 @trunc_v4i64_v4i8(<4 x i64> %input) {
267259
; CHECK-LE-LABEL: trunc_v4i64_v4i8:
268260
; CHECK-LE: // %bb.0:
269-
; CHECK-LE-NEXT: xtn v1.2s, v1.2d
270-
; CHECK-LE-NEXT: xtn v0.2s, v0.2d
271-
; CHECK-LE-NEXT: uzp1 v0.4h, v0.4h, v1.4h
261+
; CHECK-LE-NEXT: uzp1 v0.4s, v0.4s, v1.4s
262+
; CHECK-LE-NEXT: xtn v0.4h, v0.4s
272263
; CHECK-LE-NEXT: addv h0, v0.4h
273264
; CHECK-LE-NEXT: fmov w0, s0
274265
; CHECK-LE-NEXT: ret

llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2980,9 +2980,8 @@ define <8 x i8> @test_signed_v8f32_v8i8(<8 x float> %f) {
29802980
; CHECK-NEXT: mvni v2.4s, #127
29812981
; CHECK-NEXT: smax v1.4s, v1.4s, v2.4s
29822982
; CHECK-NEXT: smax v0.4s, v0.4s, v2.4s
2983-
; CHECK-NEXT: xtn v1.4h, v1.4s
2984-
; CHECK-NEXT: xtn v0.4h, v0.4s
2985-
; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
2983+
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
2984+
; CHECK-NEXT: xtn v0.8b, v0.8h
29862985
; CHECK-NEXT: ret
29872986
%x = call <8 x i8> @llvm.fptosi.sat.v8f32.v8i8(<8 x float> %f)
29882987
ret <8 x i8> %x

llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2491,9 +2491,8 @@ define <8 x i8> @test_unsigned_v8f32_v8i8(<8 x float> %f) {
24912491
; CHECK-NEXT: fcvtzu v0.4s, v0.4s
24922492
; CHECK-NEXT: umin v1.4s, v1.4s, v2.4s
24932493
; CHECK-NEXT: umin v0.4s, v0.4s, v2.4s
2494-
; CHECK-NEXT: xtn v1.4h, v1.4s
2495-
; CHECK-NEXT: xtn v0.4h, v0.4s
2496-
; CHECK-NEXT: uzp1 v0.8b, v0.8b, v1.8b
2494+
; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h
2495+
; CHECK-NEXT: xtn v0.8b, v0.8h
24972496
; CHECK-NEXT: ret
24982497
%x = call <8 x i8> @llvm.fptoui.sat.v8f32.v8i8(<8 x float> %f)
24992498
ret <8 x i8> %x

0 commit comments

Comments
 (0)