Skip to content

Commit abd0d5d

Browse files
committed
Reland: [AArch64][GlobalISel] Adopt dup(load) -> LD1R patterns from SelectionDAG
This relands fb8f591 and makes the isAArch64FrameOffsetLegal function recognize LD1R instructions. Original PR: #66914. PR of the fix: #69003.
1 parent 3162cf0 commit abd0d5d

File tree

5 files changed

+103
-26
lines changed

5 files changed

+103
-26
lines changed

llvm/lib/Target/AArch64/AArch64InstrGISel.td

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -511,3 +511,20 @@ let AddedComplexity = 19 in {
511511
defm : VecROStoreLane64_0Pat<ro16, store, v4i16, i16, hsub, STRHroW, STRHroX>;
512512
defm : VecROStoreLane64_0Pat<ro32, store, v2i32, i32, ssub, STRSroW, STRSroX>;
513513
}
514+
515+
def : Pat<(v8i8 (AArch64dup (i8 (load (am_indexed8 GPR64sp:$Rn))))),
516+
(LD1Rv8b GPR64sp:$Rn)>;
517+
def : Pat<(v16i8 (AArch64dup (i8 (load GPR64sp:$Rn)))),
518+
(LD1Rv16b GPR64sp:$Rn)>;
519+
def : Pat<(v4i16 (AArch64dup (i16 (load GPR64sp:$Rn)))),
520+
(LD1Rv4h GPR64sp:$Rn)>;
521+
def : Pat<(v8i16 (AArch64dup (i16 (load GPR64sp:$Rn)))),
522+
(LD1Rv8h GPR64sp:$Rn)>;
523+
def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
524+
(LD1Rv2s GPR64sp:$Rn)>;
525+
def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
526+
(LD1Rv4s GPR64sp:$Rn)>;
527+
def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
528+
(LD1Rv2d GPR64sp:$Rn)>;
529+
def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
530+
(LD1Rv1d GPR64sp:$Rn)>;

llvm/lib/Target/AArch64/AArch64InstrInfo.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5584,6 +5584,14 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
55845584
switch (MI.getOpcode()) {
55855585
default:
55865586
break;
5587+
case AArch64::LD1Rv1d:
5588+
case AArch64::LD1Rv2s:
5589+
case AArch64::LD1Rv2d:
5590+
case AArch64::LD1Rv4h:
5591+
case AArch64::LD1Rv4s:
5592+
case AArch64::LD1Rv8b:
5593+
case AArch64::LD1Rv8h:
5594+
case AArch64::LD1Rv16b:
55875595
case AArch64::LD1Twov2d:
55885596
case AArch64::LD1Threev2d:
55895597
case AArch64::LD1Fourv2d:

llvm/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll

Lines changed: 49 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
22
; RUN: llc -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s --check-prefixes=CHECK,SDAG
3-
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=arm64-apple-ios7.0 -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL
3+
; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=arm64-apple-ios7.0 -o - %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL
44

55
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for test_v8i8_pre_load
66
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for test_v8i8_post_load
@@ -620,9 +620,6 @@
620620
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_i8
621621
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_i16
622622
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_i32
623-
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_v3i32_small_align
624-
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_v3i32_default_align
625-
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_valid_const_index_v3i32
626623
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_masked_i32
627624
; CHECK-GISEL-NOT: warning: Instruction selection used fallback path for load_single_extract_variable_index_masked2_i32
628625

@@ -13786,11 +13783,18 @@ define ptr @test_v1f64_post_reg_st4lane(ptr %A, ptr %ptr, <1 x double> %B, <1 x
1378613783
declare void @llvm.aarch64.neon.st4lane.v1f64.p0(<1 x double>, <1 x double>, <1 x double>, <1 x double>, i64, ptr)
1378713784

1378813785
define <16 x i8> @test_v16i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
13789-
; CHECK-LABEL: test_v16i8_post_imm_ld1r:
13790-
; CHECK: ; %bb.0:
13791-
; CHECK-NEXT: ld1r.16b { v0 }, [x0], #1
13792-
; CHECK-NEXT: str x0, [x1]
13793-
; CHECK-NEXT: ret
13786+
; SDAG-LABEL: test_v16i8_post_imm_ld1r:
13787+
; SDAG: ; %bb.0:
13788+
; SDAG-NEXT: ld1r.16b { v0 }, [x0], #1
13789+
; SDAG-NEXT: str x0, [x1]
13790+
; SDAG-NEXT: ret
13791+
;
13792+
; CHECK-GISEL-LABEL: test_v16i8_post_imm_ld1r:
13793+
; CHECK-GISEL: ; %bb.0:
13794+
; CHECK-GISEL-NEXT: ld1r.16b { v0 }, [x0]
13795+
; CHECK-GISEL-NEXT: add x8, x0, #1
13796+
; CHECK-GISEL-NEXT: str x8, [x1]
13797+
; CHECK-GISEL-NEXT: ret
1379413798
%tmp1 = load i8, ptr %bar
1379513799
%tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
1379613800
%tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
@@ -13814,11 +13818,18 @@ define <16 x i8> @test_v16i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
1381413818
}
1381513819

1381613820
define <16 x i8> @test_v16i8_post_reg_ld1r(ptr %bar, ptr %ptr, i64 %inc) {
13817-
; CHECK-LABEL: test_v16i8_post_reg_ld1r:
13818-
; CHECK: ; %bb.0:
13819-
; CHECK-NEXT: ld1r.16b { v0 }, [x0], x2
13820-
; CHECK-NEXT: str x0, [x1]
13821-
; CHECK-NEXT: ret
13821+
; SDAG-LABEL: test_v16i8_post_reg_ld1r:
13822+
; SDAG: ; %bb.0:
13823+
; SDAG-NEXT: ld1r.16b { v0 }, [x0], x2
13824+
; SDAG-NEXT: str x0, [x1]
13825+
; SDAG-NEXT: ret
13826+
;
13827+
; CHECK-GISEL-LABEL: test_v16i8_post_reg_ld1r:
13828+
; CHECK-GISEL: ; %bb.0:
13829+
; CHECK-GISEL-NEXT: ld1r.16b { v0 }, [x0]
13830+
; CHECK-GISEL-NEXT: add x8, x0, x2
13831+
; CHECK-GISEL-NEXT: str x8, [x1]
13832+
; CHECK-GISEL-NEXT: ret
1382213833
%tmp1 = load i8, ptr %bar
1382313834
%tmp2 = insertelement <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
1382413835
%tmp3 = insertelement <16 x i8> %tmp2, i8 %tmp1, i32 1
@@ -13842,11 +13853,18 @@ define <16 x i8> @test_v16i8_post_reg_ld1r(ptr %bar, ptr %ptr, i64 %inc) {
1384213853
}
1384313854

1384413855
define <8 x i8> @test_v8i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
13845-
; CHECK-LABEL: test_v8i8_post_imm_ld1r:
13846-
; CHECK: ; %bb.0:
13847-
; CHECK-NEXT: ld1r.8b { v0 }, [x0], #1
13848-
; CHECK-NEXT: str x0, [x1]
13849-
; CHECK-NEXT: ret
13856+
; SDAG-LABEL: test_v8i8_post_imm_ld1r:
13857+
; SDAG: ; %bb.0:
13858+
; SDAG-NEXT: ld1r.8b { v0 }, [x0], #1
13859+
; SDAG-NEXT: str x0, [x1]
13860+
; SDAG-NEXT: ret
13861+
;
13862+
; CHECK-GISEL-LABEL: test_v8i8_post_imm_ld1r:
13863+
; CHECK-GISEL: ; %bb.0:
13864+
; CHECK-GISEL-NEXT: ld1r.8b { v0 }, [x0]
13865+
; CHECK-GISEL-NEXT: add x8, x0, #1
13866+
; CHECK-GISEL-NEXT: str x8, [x1]
13867+
; CHECK-GISEL-NEXT: ret
1385013868
%tmp1 = load i8, ptr %bar
1385113869
%tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
1385213870
%tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1
@@ -13862,11 +13880,18 @@ define <8 x i8> @test_v8i8_post_imm_ld1r(ptr %bar, ptr %ptr) {
1386213880
}
1386313881

1386413882
define <8 x i8> @test_v8i8_post_reg_ld1r(ptr %bar, ptr %ptr, i64 %inc) {
13865-
; CHECK-LABEL: test_v8i8_post_reg_ld1r:
13866-
; CHECK: ; %bb.0:
13867-
; CHECK-NEXT: ld1r.8b { v0 }, [x0], x2
13868-
; CHECK-NEXT: str x0, [x1]
13869-
; CHECK-NEXT: ret
13883+
; SDAG-LABEL: test_v8i8_post_reg_ld1r:
13884+
; SDAG: ; %bb.0:
13885+
; SDAG-NEXT: ld1r.8b { v0 }, [x0], x2
13886+
; SDAG-NEXT: str x0, [x1]
13887+
; SDAG-NEXT: ret
13888+
;
13889+
; CHECK-GISEL-LABEL: test_v8i8_post_reg_ld1r:
13890+
; CHECK-GISEL: ; %bb.0:
13891+
; CHECK-GISEL-NEXT: ld1r.8b { v0 }, [x0]
13892+
; CHECK-GISEL-NEXT: add x8, x0, x2
13893+
; CHECK-GISEL-NEXT: str x8, [x1]
13894+
; CHECK-GISEL-NEXT: ret
1387013895
%tmp1 = load i8, ptr %bar
1387113896
%tmp2 = insertelement <8 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, i8 %tmp1, i32 0
1387213897
%tmp3 = insertelement <8 x i8> %tmp2, i8 %tmp1, i32 1

llvm/test/CodeGen/AArch64/arm64-ld1.ll

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
22
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3-
; RUN: llc < %s -global-isel=1 -global-isel-abort=2 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
3+
; RUN: llc < %s -global-isel=1 -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,CHECK-GI
44

55
%struct.__neon_int8x8x2_t = type { <8 x i8>, <8 x i8> }
66
%struct.__neon_int8x8x3_t = type { <8 x i8>, <8 x i8>, <8 x i8> }
@@ -1712,3 +1712,30 @@ define %struct.__neon_float64x2x4_t @ld1_x4_v2f64(ptr %addr) {
17121712
%val = call %struct.__neon_float64x2x4_t @llvm.aarch64.neon.ld1x4.v2f64.p0(ptr %addr)
17131713
ret %struct.__neon_float64x2x4_t %val
17141714
}
1715+
1716+
define <8 x i8> @dup_ld1_from_stack(ptr %__ret) {
1717+
; CHECK-SD-LABEL: dup_ld1_from_stack:
1718+
; CHECK-SD: // %bb.0: // %entry
1719+
; CHECK-SD-NEXT: sub sp, sp, #16
1720+
; CHECK-SD-NEXT: .cfi_def_cfa_offset 16
1721+
; CHECK-SD-NEXT: add x8, sp, #15
1722+
; CHECK-SD-NEXT: ld1r.8b { v0 }, [x8]
1723+
; CHECK-SD-NEXT: add sp, sp, #16
1724+
; CHECK-SD-NEXT: ret
1725+
;
1726+
; CHECK-GI-LABEL: dup_ld1_from_stack:
1727+
; CHECK-GI: // %bb.0: // %entry
1728+
; CHECK-GI-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
1729+
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
1730+
; CHECK-GI-NEXT: .cfi_offset w29, -16
1731+
; CHECK-GI-NEXT: add x8, sp, #15
1732+
; CHECK-GI-NEXT: ld1r.8b { v0 }, [x8]
1733+
; CHECK-GI-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
1734+
; CHECK-GI-NEXT: ret
1735+
entry:
1736+
%item = alloca i8, align 1
1737+
%0 = load i8, ptr %item, align 1
1738+
%1 = insertelement <8 x i8> poison, i8 %0, i32 0
1739+
%lane = shufflevector <8 x i8> %1, <8 x i8> %1, <8 x i32> zeroinitializer
1740+
ret <8 x i8> %lane
1741+
}

llvm/test/CodeGen/AArch64/arm64-st1.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
2-
; RUN: llc < %s -global-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
2+
; RUN: llc < %s -global-isel -global-isel-abort=1 -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
33
; The instruction latencies of Exynos-M3 trigger the transform we see under the Exynos check.
44
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m3 | FileCheck --check-prefix=EXYNOS %s
55

0 commit comments

Comments
 (0)