llvm
diff --git a/‎llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Lines changed: 271 additions & 120 deletions b/‎llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Lines changed: 271 additions & 120 deletions
diff --git a/‎llvm/lib/CodeGen/TargetLoweringBase.cpp
Lines changed: 1 addition & 1 deletion b/‎llvm/lib/CodeGen/TargetLoweringBase.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 10 additions & 0 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Lines changed: 10 additions & 0 deletions
diff --git a/‎llvm/test/CodeGen/AArch64/argument-blocks.ll
Lines changed: 1 addition & 1 deletion b/‎llvm/test/CodeGen/AArch64/argument-blocks.ll
Lines changed: 1 addition & 1 deletion
diff --git a/‎llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
Lines changed: 8 additions & 4 deletions b/‎llvm/test/CodeGen/AArch64/arm64-abi-varargs.ll
Lines changed: 8 additions & 4 deletions
diff --git a/‎llvm/test/CodeGen/AArch64/arm64-abi.ll
Lines changed: 4 additions & 1 deletion b/‎llvm/test/CodeGen/AArch64/arm64-abi.ll
Lines changed: 4 additions & 1 deletion
diff --git a/‎llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
Lines changed: 1 addition & 5 deletions b/‎llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
Lines changed: 1 addition & 5 deletions
diff --git a/‎llvm/test/CodeGen/AArch64/arm64-stur.ll
Lines changed: 2 additions & 5 deletions b/‎llvm/test/CodeGen/AArch64/arm64-stur.ll
Lines changed: 2 additions & 5 deletions
diff --git a/‎llvm/test/CodeGen/AArch64/merge-store.ll
Lines changed: 2 additions & 3 deletions b/‎llvm/test/CodeGen/AArch64/merge-store.ll
Lines changed: 2 additions & 3 deletions
diff --git a/‎llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll
Lines changed: 2 additions & 1 deletion b/‎llvm/test/CodeGen/AArch64/vector_merge_dep_check.ll
Lines changed: 2 additions & 1 deletion
diff --git a/‎llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
Lines changed: 3 additions & 0 deletions b/‎llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
Lines changed: 3 additions & 0 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll
Lines changed: 8 additions & 16 deletions b/‎llvm/test/CodeGen/AMDGPU/debugger-insert-nops.ll
Lines changed: 8 additions & 16 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/merge-stores.ll
Lines changed: 24 additions & 6 deletions b/‎llvm/test/CodeGen/AMDGPU/merge-stores.ll
Lines changed: 24 additions & 6 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/private-element-size.ll
Lines changed: 4 additions & 4 deletions b/‎llvm/test/CodeGen/AMDGPU/private-element-size.ll
Lines changed: 4 additions & 4 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
Lines changed: 9 additions & 8 deletions b/‎llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
Lines changed: 9 additions & 8 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
Lines changed: 0 additions & 7 deletions b/‎llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
Lines changed: 0 additions & 7 deletions
diff --git a/‎llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
Lines changed: 0 additions & 6 deletions b/‎llvm/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
Lines changed: 0 additions & 6 deletions
diff --git a/‎llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
Lines changed: 1 addition & 2 deletions b/‎llvm/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
Lines changed: 1 addition & 2 deletions
diff --git a/‎llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
Lines changed: 2 additions & 2 deletions b/‎llvm/test/CodeGen/ARM/alloc-no-stack-realign.ll
Lines changed: 2 additions & 2 deletions
diff --git a/‎llvm/test/CodeGen/ARM/ifcvt10.ll
Lines changed: 2 additions & 0 deletions b/‎llvm/test/CodeGen/ARM/ifcvt10.ll
Lines changed: 2 additions & 0 deletions
diff --git a/‎llvm/test/CodeGen/ARM/memset-inline.ll
Lines changed: 2 additions & 8 deletions b/‎llvm/test/CodeGen/ARM/memset-inline.ll
Lines changed: 2 additions & 8 deletions
diff --git a/‎llvm/test/CodeGen/ARM/static-addr-hoisting.ll
Lines changed: 3 additions & 3 deletions b/‎llvm/test/CodeGen/ARM/static-addr-hoisting.ll
Lines changed: 3 additions & 3 deletions
@@ -824,7 +824,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
   MinFunctionAlignment = 0;
   PrefFunctionAlignment = 0;
   PrefLoopAlignment = 0;
-  GatherAllAliasesMaxDepth = 18;
+  GatherAllAliasesMaxDepth = 6;
   MinStackArgumentAlignment = 1;
   MinimumJumpTableEntries = 4;
   // TODO: the default will be switched to 0 in the next commit, along
 
@@ -446,6 +446,16 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setSelectIsExpensive(false);
   PredictableSelectIsExpensive = false;
 
+  // We want to find all load dependencies for long chains of stores to enable
+  // merging into very wide vectors. The problem is with vectors with > 4
+  // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
+  // vectors are a legal type, even though we usually have to split the
+  // loads. When we can more precisely specify load legality per address
+  // space, we should be able to make FindBetterChain/MergeConsecutiveStores
+  // smarter so that they can figure out what to do in 2 iterations without all
+  // N > 4 stores on the same chain.
+  GatherAllAliasesMaxDepth = 16;
+
   // FIXME: Need to really handle these.
   MaxStoresPerMemcpy  = 4096;
   MaxStoresPerMemmove = 4096;
 
@@ -62,7 +62,7 @@ define i64 @test_hfa_ignores_gprs([7 x float], [2 x float] %in, i64, i64 %res) {
 ; but should go in an 8-byte aligned slot.
 define void @test_varargs_stackalign() {
 ; CHECK-LABEL: test_varargs_stackalign:
-; CHECK-DARWINPCS: str {{x[0-9]+}}, [sp, #16]
+; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
 
   call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
   ret void
 
@@ -6,13 +6,17 @@
 define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
 ; CHECK-LABEL: fn9:
 ; 9th fixed argument
-; CHECK: add x[[ADDR:[0-9]+]], sp, #72
+; CHECK: ldr {{w[0-9]+}}, [sp, #64]
+; CHECK: add [[ARGS:x[0-9]+]], sp, #72
+; CHECK: add {{x[0-9]+}}, [[ARGS]], #8
 ; First vararg
-; CHECK-DAG: ldr {{w[0-9]+}}, [sp, #72]
+; CHECK: ldr {{w[0-9]+}}, [sp, #72]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
 ; Second vararg
-; CHECK-DAG: ldr {{w[0-9]+}}, [x[[ADDR]]]
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
+; CHECK: add {{x[0-9]+}}, {{x[0-9]+}}, #8
 ; Third vararg
-; CHECK-DAG: ldr {{w[0-9]+}}, [x[[ADDR]]], #8
+; CHECK: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
   %1 = alloca i32, align 4
   %2 = alloca i32, align 4
   %3 = alloca i32, align 4
 
@@ -205,7 +205,10 @@ declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
 define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
 ; CHECK-LABEL: test8
-; CHECK: str w8, [sp]
+; CHECK: strb {{w[0-9]+}}, [sp, #3]
+; CHECK: strb wzr, [sp, #2]
+; CHECK: strb {{w[0-9]+}}, [sp, #1]
+; CHECK: strb wzr, [sp]
 ; CHECK: bl
 ; FAST-LABEL: test8
 ; FAST: strb {{w[0-9]+}}, [sp]
 
@@ -9,15 +9,11 @@ entry:
   ret void
 }
 
-; FIXME: This shouldn't need to load in a zero value to store
-;        (e.g. stp xzr,xzr [sp, #16])
-
 define void @t2() nounwind ssp {
 entry:
 ; CHECK-LABEL: t2:
-; CHECK: movi v0.2d, #0000000000000000
-; CHECK: stur q0, [sp, #16]
 ; CHECK: strh wzr, [sp, #32]
+; CHECK: stp xzr, xzr, [sp, #16]
 ; CHECK: str xzr, [sp, #8]
   %buf = alloca [26 x i8], align 1
   %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
 
@@ -47,14 +47,11 @@ define void @foo5(i8* %p, i32 %val) nounwind {
   ret void
 }
 
-;; FIXME: Again, with the writing of a quadword zero...
-
 define void @foo(%struct.X* nocapture %p) nounwind optsize ssp {
 ; CHECK-LABEL: foo:
 ; CHECK-NOT: str
-; CHECK: stur    q0, [x0, #4]
-; CHECK-FIXME: stur    xzr, [x0, #12]
-; CHECK-FIXME-NEXT: stur    xzr, [x0, #4]
+; CHECK: stur    xzr, [x0, #12]
+; CHECK-NEXT: stur    xzr, [x0, #4]
 ; CHECK-NEXT: ret
   %B = getelementptr inbounds %struct.X, %struct.X* %p, i64 0, i32 1
   %val = bitcast i64* %B to i8*
 
@@ -4,9 +4,8 @@
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
 
-; CHECK: ldr q[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0
-;; TODO: this next line seems like a redundant no-op move?
-; CHECK: ins     v0.s[1], v0.s[1]
+; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
+; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
 ; CHECK: str d[[R0]]
 
 define void @blam() {
 
@@ -1,4 +1,5 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc --combiner-alias-analysis=false < %s | FileCheck %s
+; RUN: llc --combiner-alias-analysis=true  < %s | FileCheck %s
 
 ; This test checks that we do not merge stores together which have
 ; dependencies through their non-chain operands (e.g. one store is the
 
@@ -88,9 +88,12 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
 ; SI-DAG: v_cvt_f32_ubyte2_e32
 ; SI-DAG: v_cvt_f32_ubyte3_e32
 
+; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 24
+; SI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16
 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
 ; SI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 8
 ; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
+; SI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff00,
 ; SI-DAG: v_add_i32
 
 ; SI: buffer_store_dwordx4
 
@@ -1,21 +1,13 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECKNOP
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
 
-; This test expects that we have one instance for each line in some order with "s_nop 0" instances after each.
-
-; Check that each line appears at least once
-; CHECK-DAG: test01.cl:2:3
-; CHECK-DAG: test01.cl:3:3
-; CHECK-DAG: test01.cl:4:3
+; CHECK: test01.cl:2:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
+; CHECK: test01.cl:3:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
-; Check that each of each of the lines consists of the line output, followed by "s_nop 0"
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
-; CHECKNOP: test01.cl:{{[234]}}:3
-; CHECKNOP-NEXT: s_nop 0
+; CHECK: test01.cl:4:{{[0-9]+}}
+; CHECK-NEXT: s_nop 0
 
 ; CHECK: test01.cl:5:{{[0-9]+}}
 ; CHECK-NEXT: s_nop 0
@@ -29,7 +21,7 @@ entry:
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
   %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
-  store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !20
+  store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
   %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
   store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
 
@@ -1,5 +1,8 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
+
+; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
 
 ; This test is mostly to test DAG store merging, so disable the vectorizer.
 ; Run with devices with different unaligned load restrictions.
@@ -146,10 +149,17 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
   ret void
 }
 
+; FIXME: Should be able to merge this
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
-; GCN: buffer_store_dwordx2
-; GCN: buffer_store_dword v
-; GCN: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+
+; GCN-AA: buffer_store_dwordx2
+; GCN-AA: buffer_store_dword v
+; GCN-AA: buffer_store_dword v
+
 ; GCN: s_endpgm
 define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
@@ -468,9 +478,17 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1
   ret void
 }
 
+; This works once AA is enabled on the subtarget
 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; GCN: buffer_store_dwordx4 [[LOAD]]
+
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+; GCN-NOAA: buffer_store_dword v
+
+; GCN-AA: buffer_store_dwordx4 [[LOAD]]
+
 ; GCN: s_endpgm
 define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
@@ -32,10 +32,10 @@
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
 
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
 define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
 
@@ -156,8 +156,9 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
 
 ; FUNC-LABEL: @reorder_local_offsets
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
-; CI-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100
-; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
+; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
+; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
+; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
 ; CI: buffer_store_dword
 ; CI: s_endpgm
 define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
@@ -179,12 +180,12 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
 }
 
 ; FUNC-LABEL: @reorder_global_offsets
-; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_store_dword
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
 ; CI: s_endpgm
 define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
 
@@ -3,13 +3,6 @@
 ; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
-; FIXME: this fails because the load generated from extractelement is
-;; now properly recognized as forwardable to the value stored in
-;; insertelement, and thus the loads/stores drop away entirely. This
-;; makes the intended test, of running out of registers, not occur.
-
-;; XFAIL: *
-
 ; This ends up using all 256 registers and requires register
 ; scavenging which will fail to find an unused register.
 
 
@@ -1,12 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-;; FIXME: this fails because the load generated from extractelement is
-;; now properly recognized as forwardable to the value stored in
-;; insertelement, and thus the loads/stores drop away entirely. This
-;; makes the intended test, of running out of registers, not occur.
-;; XFAIL: *
-
 ; This ends up using all 255 registers and requires register
 ; scavenging which will fail to find an unused register.
 
 
@@ -12,8 +12,7 @@ define void @test_byval_8_bytes_alignment(i32 %i, ...) {
 entry:
 ; CHECK: sub       sp, sp, #12
 ; CHECK: sub       sp, sp, #4
-; CHECK: add       r0, sp, #4
-; CHECK: stm       sp, {r0, r1, r2, r3}
+; CHECK: stmib     sp, {r1, r2, r3}
   %g = alloca i8*
   %g1 = bitcast i8** %g to i8*
   call void @llvm.va_start(i8* %g1)
 
@@ -51,12 +51,12 @@ entry:
 ; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
 
 
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
 ; REALIGN: orr r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
 ; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #32
 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: orr r[[R1:[0-9]+]], r[[R1]], #16
+; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #16
+; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
 ; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
 
 ; REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48
 
@@ -9,6 +9,8 @@ entry:
 ; CHECK-LABEL: t:
 ; CHECK: vpop {d8}
 ; CHECK-NOT: vpopne
+; CHECK: pop {r7, pc}
+; CHECK: vpop {d8}
 ; CHECK: pop {r7, pc}
   br i1 undef, label %if.else, label %if.then
 
 
@@ -3,15 +3,9 @@
 define void @t1(i8* nocapture %c) nounwind optsize {
 entry:
 ; CHECK-LABEL: t1:
-
-;; FIXME: like with arm64-memset-inline.ll, learning how to merge
-;; stores made this code worse, since it now uses a vector move,
-;; instead of just using an strd instruction taking two registers.
-
-; CHECK: vmov.i32 d16, #0x0
-; CHECK: vst1.32 {d16}, [r0:64]!
 ; CHECK: movs r1, #0
-; CHECK: str r1, [r0]
+; CHECK: strd r1, r1, [r0]
+; CHECK: str r1, [r0, #8]
   call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 12, i32 8, i1 false)
   ret void
 }
 
@@ -6,9 +6,9 @@ define void @multiple_store() {
 ; CHECK: movs [[VAL:r[0-9]+]], #42
 ; CHECK: movt r[[BASE1]], #15
 
-; CHECK-DAG: str [[VAL]], [r[[BASE1]]]
-; CHECK-DAG: str [[VAL]], [r[[BASE1]], #24]
-; CHECK-DAG: str.w [[VAL]], [r[[BASE1]], #42]
+; CHECK: str [[VAL]], [r[[BASE1]]]
+; CHECK: str [[VAL]], [r[[BASE1]], #24]
+; CHECK: str.w [[VAL]], [r[[BASE1]], #42]
 
 ; CHECK: movw r[[BASE2:[0-9]+]], #20394
 ; CHECK: movt r[[BASE2]], #18