Skip to content

Commit bbe945b

Browse files
committed
[AArch64][GISel] Expand G_DUP and G_DUPLANE to v8s8 and v4s16
This fills in the gaps with v8s8 and v4s16 vectors for G_DUP and G_DUPLANE, by generalizing the existing code to handle more types.
1 parent 19d7ab1 commit bbe945b

File tree

4 files changed

+93
-54
lines changed

4 files changed

+93
-54
lines changed

llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -763,8 +763,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
763763
// to be the same size as the dest.
764764
if (DstTy != SrcTy)
765765
return false;
766-
return llvm::is_contained({v2s32, v4s32, v2s64, v2p0, v16s8, v8s16},
767-
DstTy);
766+
return llvm::is_contained(
767+
{v2s64, v2p0, v2s32, v4s32, v4s16, v16s8, v8s8, v8s16}, DstTy);
768768
})
769769
// G_SHUFFLE_VECTOR can have scalar sources (from 1 x s vectors), we
770770
// just want those lowered into G_BUILD_VECTOR

llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -720,9 +720,13 @@ bool matchDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
720720
case 4:
721721
if (ScalarSize == 32)
722722
Opc = AArch64::G_DUPLANE32;
723+
else if (ScalarSize == 16)
724+
Opc = AArch64::G_DUPLANE16;
723725
break;
724726
case 8:
725-
if (ScalarSize == 16)
727+
if (ScalarSize == 8)
728+
Opc = AArch64::G_DUPLANE8;
729+
else if (ScalarSize == 16)
726730
Opc = AArch64::G_DUPLANE16;
727731
break;
728732
case 16:
@@ -752,13 +756,10 @@ void applyDupLane(MachineInstr &MI, MachineRegisterInfo &MRI,
752756
Register DupSrc = MI.getOperand(1).getReg();
753757
// For types like <2 x s32>, we can use G_DUPLANE32, with a <4 x s32> source.
754758
// To do this, we can use a G_CONCAT_VECTORS to do the widening.
755-
if (SrcTy == LLT::fixed_vector(2, LLT::scalar(32))) {
756-
assert(MRI.getType(MI.getOperand(0).getReg()).getNumElements() == 2 &&
757-
"Unexpected dest elements");
759+
if (SrcTy.getSizeInBits() == 64) {
758760
auto Undef = B.buildUndef(SrcTy);
759-
DupSrc = B.buildConcatVectors(
760-
SrcTy.changeElementCount(ElementCount::getFixed(4)),
761-
{Src1Reg, Undef.getReg(0)})
761+
DupSrc = B.buildConcatVectors(SrcTy.multiplyElements(2),
762+
{Src1Reg, Undef.getReg(0)})
762763
.getReg(0);
763764
}
764765
B.buildInstr(MatchInfo.first, {MI.getOperand(0).getReg()}, {DupSrc, Lane});

llvm/test/CodeGen/AArch64/arm64-dup.ll

Lines changed: 37 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3-
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4-
5-
; CHECK-GI: warning: Instruction selection used fallback path for v_shuffledup8
6-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v_shuffledup16
7-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for vduplane8
8-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for vduplane16
9-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_perfectshuffle_dupext_v4i16
10-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_perfectshuffle_dupext_v4f16
3+
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
114

125
define <8 x i8> @v_dup8(i8 %A) nounwind {
136
; CHECK-LABEL: v_dup8:
@@ -417,25 +410,47 @@ entry:
417410
; Also test the DUP path in the PerfectShuffle generator.
418411

419412
define <4 x i16> @test_perfectshuffle_dupext_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
420-
; CHECK-LABEL: test_perfectshuffle_dupext_v4i16:
421-
; CHECK: // %bb.0:
422-
; CHECK-NEXT: trn1.4h v0, v0, v0
423-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
424-
; CHECK-NEXT: mov.s v0[1], v1[0]
425-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
426-
; CHECK-NEXT: ret
413+
; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4i16:
414+
; CHECK-SD: // %bb.0:
415+
; CHECK-SD-NEXT: trn1.4h v0, v0, v0
416+
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
417+
; CHECK-SD-NEXT: mov.s v0[1], v1[0]
418+
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
419+
; CHECK-SD-NEXT: ret
420+
;
421+
; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4i16:
422+
; CHECK-GI: // %bb.0:
423+
; CHECK-GI-NEXT: adrp x8, .LCPI33_0
424+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
425+
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
426+
; CHECK-GI-NEXT: mov.d v0[1], v1[0]
427+
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI33_0]
428+
; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
429+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
430+
; CHECK-GI-NEXT: ret
427431
%r = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
428432
ret <4 x i16> %r
429433
}
430434

431435
define <4 x half> @test_perfectshuffle_dupext_v4f16(<4 x half> %a, <4 x half> %b) nounwind {
432-
; CHECK-LABEL: test_perfectshuffle_dupext_v4f16:
433-
; CHECK: // %bb.0:
434-
; CHECK-NEXT: trn1.4h v0, v0, v0
435-
; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1
436-
; CHECK-NEXT: mov.s v0[1], v1[0]
437-
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
438-
; CHECK-NEXT: ret
436+
; CHECK-SD-LABEL: test_perfectshuffle_dupext_v4f16:
437+
; CHECK-SD: // %bb.0:
438+
; CHECK-SD-NEXT: trn1.4h v0, v0, v0
439+
; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1
440+
; CHECK-SD-NEXT: mov.s v0[1], v1[0]
441+
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
442+
; CHECK-SD-NEXT: ret
443+
;
444+
; CHECK-GI-LABEL: test_perfectshuffle_dupext_v4f16:
445+
; CHECK-GI: // %bb.0:
446+
; CHECK-GI-NEXT: adrp x8, .LCPI34_0
447+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
448+
; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1
449+
; CHECK-GI-NEXT: mov.d v0[1], v1[0]
450+
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI34_0]
451+
; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
452+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
453+
; CHECK-GI-NEXT: ret
439454
%r = shufflevector <4 x half> %a, <4 x half> %b, <4 x i32> <i32 0, i32 0, i32 4, i32 5>
440455
ret <4 x half> %r
441456
}

llvm/test/CodeGen/AArch64/arm64-rev.ll

Lines changed: 46 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3-
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4-
5-
; CHECK-GI: warning: Instruction selection used fallback path for test_vrev64D8
6-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vrev64D16
7-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vrev32D8
8-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vrev32D16
9-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vrev16D8
10-
; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_vrev64D8_undef
3+
; RUN: llc < %s -mtriple=aarch64-eabi -aarch64-neon-syntax=apple -global-isel | FileCheck %s --check-prefixes=CHECK,CHECK-GI
114

125
define i32 @test_rev_w(i32 %a) nounwind {
136
; CHECK-LABEL: test_rev_w:
@@ -303,22 +296,42 @@ define <4 x float> @test_vrev64Qf(ptr %A) nounwind {
303296
}
304297

305298
define <8 x i8> @test_vrev32D8(ptr %A) nounwind {
306-
; CHECK-LABEL: test_vrev32D8:
307-
; CHECK: // %bb.0:
308-
; CHECK-NEXT: ldr d0, [x0]
309-
; CHECK-NEXT: rev32.8b v0, v0
310-
; CHECK-NEXT: ret
299+
; CHECK-SD-LABEL: test_vrev32D8:
300+
; CHECK-SD: // %bb.0:
301+
; CHECK-SD-NEXT: ldr d0, [x0]
302+
; CHECK-SD-NEXT: rev32.8b v0, v0
303+
; CHECK-SD-NEXT: ret
304+
;
305+
; CHECK-GI-LABEL: test_vrev32D8:
306+
; CHECK-GI: // %bb.0:
307+
; CHECK-GI-NEXT: ldr d0, [x0]
308+
; CHECK-GI-NEXT: adrp x8, .LCPI19_0
309+
; CHECK-GI-NEXT: mov.d v0[1], v0[0]
310+
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI19_0]
311+
; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
312+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
313+
; CHECK-GI-NEXT: ret
311314
%tmp1 = load <8 x i8>, ptr %A
312315
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
313316
ret <8 x i8> %tmp2
314317
}
315318

316319
define <4 x i16> @test_vrev32D16(ptr %A) nounwind {
317-
; CHECK-LABEL: test_vrev32D16:
318-
; CHECK: // %bb.0:
319-
; CHECK-NEXT: ldr d0, [x0]
320-
; CHECK-NEXT: rev32.4h v0, v0
321-
; CHECK-NEXT: ret
320+
; CHECK-SD-LABEL: test_vrev32D16:
321+
; CHECK-SD: // %bb.0:
322+
; CHECK-SD-NEXT: ldr d0, [x0]
323+
; CHECK-SD-NEXT: rev32.4h v0, v0
324+
; CHECK-SD-NEXT: ret
325+
;
326+
; CHECK-GI-LABEL: test_vrev32D16:
327+
; CHECK-GI: // %bb.0:
328+
; CHECK-GI-NEXT: ldr d0, [x0]
329+
; CHECK-GI-NEXT: adrp x8, .LCPI20_0
330+
; CHECK-GI-NEXT: mov.d v0[1], v0[0]
331+
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI20_0]
332+
; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
333+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
334+
; CHECK-GI-NEXT: ret
322335
%tmp1 = load <4 x i16>, ptr %A
323336
%tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
324337
ret <4 x i16> %tmp2
@@ -363,11 +376,21 @@ define <8 x i16> @test_vrev32Q16(ptr %A) nounwind {
363376
}
364377

365378
define <8 x i8> @test_vrev16D8(ptr %A) nounwind {
366-
; CHECK-LABEL: test_vrev16D8:
367-
; CHECK: // %bb.0:
368-
; CHECK-NEXT: ldr d0, [x0]
369-
; CHECK-NEXT: rev16.8b v0, v0
370-
; CHECK-NEXT: ret
379+
; CHECK-SD-LABEL: test_vrev16D8:
380+
; CHECK-SD: // %bb.0:
381+
; CHECK-SD-NEXT: ldr d0, [x0]
382+
; CHECK-SD-NEXT: rev16.8b v0, v0
383+
; CHECK-SD-NEXT: ret
384+
;
385+
; CHECK-GI-LABEL: test_vrev16D8:
386+
; CHECK-GI: // %bb.0:
387+
; CHECK-GI-NEXT: ldr d0, [x0]
388+
; CHECK-GI-NEXT: adrp x8, .LCPI23_0
389+
; CHECK-GI-NEXT: mov.d v0[1], v0[0]
390+
; CHECK-GI-NEXT: ldr d1, [x8, :lo12:.LCPI23_0]
391+
; CHECK-GI-NEXT: tbl.16b v0, { v0 }, v1
392+
; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
393+
; CHECK-GI-NEXT: ret
371394
%tmp1 = load <8 x i8>, ptr %A
372395
%tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> <i32 1, i32 0, i32 3, i32 2, i32 5, i32 4, i32 7, i32 6>
373396
ret <8 x i8> %tmp2

0 commit comments

Comments
 (0)