Skip to content

Commit 2920061

Browse files
committed
ARM64: improve non-zero memset isel by ~2x
Summary:
I added a few ARM64 memset codegen tests in r341406 and r341493, and annotated where the generated code was bad. This patch fixes the majority of the issues by requesting that a 2xi64 vector be used for memset of 32 bytes and above.

The patch leaves the former request for f128 unchanged, despite f128 materialization being suboptimal: doing otherwise runs into other asserts in isel and makes this patch too broad.

This patch hides the issue that was present in bzero_40_stack and bzero_72_stack because the code now generates in a better order which doesn't have the store offset issue. I'm not aware of that issue appearing elsewhere at the moment.

<rdar://problem/44157755>

Reviewers: t.p.northover, MatzeB, javed.absar

Subscribers: eraman, kristof.beyls, chrib, dexonsmith, llvm-commits

Differential Revision: https://reviews.llvm.org/D51706

llvm-svn: 341558
1 parent 99d7320 commit 2920061

File tree

2 files changed

+57
-84
lines changed

2 files changed

+57
-84
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 20 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -8342,27 +8342,30 @@ EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
83428342
bool ZeroMemset,
83438343
bool MemcpyStrSrc,
83448344
MachineFunction &MF) const {
8345-
// Don't use AdvSIMD to implement 16-byte memset. It would have taken one
8346-
// instruction to materialize the v2i64 zero and one store (with restrictive
8347-
// addressing mode). Just do two i64 store of zero-registers.
8348-
bool Fast;
83498345
const Function &F = MF.getFunction();
8350-
if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
8351-
!F.hasFnAttribute(Attribute::NoImplicitFloat) &&
8352-
(memOpAlign(SrcAlign, DstAlign, 16) ||
8353-
(allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
8354-
return MVT::f128;
8346+
bool CanImplicitFloat = !F.hasFnAttribute(Attribute::NoImplicitFloat);
8347+
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
8348+
bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
8349+
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
8350+
// taken one instruction to materialize the v2i64 zero and one store (with
8351+
// restrictive addressing mode). Just do i64 stores.
8352+
bool IsSmallMemset = IsMemset && Size < 32;
8353+
auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
8354+
if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
8355+
return true;
8356+
bool Fast;
8357+
return allowsMisalignedMemoryAccesses(VT, 0, 1, &Fast) && Fast;
8358+
};
83558359

8356-
if (Size >= 8 &&
8357-
(memOpAlign(SrcAlign, DstAlign, 8) ||
8358-
(allowsMisalignedMemoryAccesses(MVT::i64, 0, 1, &Fast) && Fast)))
8360+
if (CanUseNEON && IsMemset && !IsSmallMemset &&
8361+
AlignmentIsAcceptable(MVT::v2i64, 16))
8362+
return MVT::v2i64;
8363+
if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
8364+
return MVT::f128;
8365+
if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
83598366
return MVT::i64;
8360-
8361-
if (Size >= 4 &&
8362-
(memOpAlign(SrcAlign, DstAlign, 4) ||
8363-
(allowsMisalignedMemoryAccesses(MVT::i32, 0, 1, &Fast) && Fast)))
8367+
if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
83648368
return MVT::i32;
8365-
83668369
return MVT::Other;
83678370
}
83688371

llvm/test/CodeGen/AArch64/arm64-memset-inline.ll

Lines changed: 37 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -137,14 +137,12 @@ define void @bzero_32_stack() {
137137
ret void
138138
}
139139

140-
; FIXME These don't pair up because the offset isn't a multiple of 16 bits. x0, however, could be used as a base for a paired store.
141140
define void @bzero_40_stack() {
142141
; CHECK-LABEL: bzero_40_stack:
143-
; CHECK: stp xzr, x30, [sp, #40]
144-
; CHECK: movi v0.2d, #0000000000000000
145-
; CHECK-NEXT: add x0, sp, #8
146-
; CHECK-NEXT: stur q0, [sp, #24]
147-
; CHECK-NEXT: stur q0, [sp, #8]
142+
; CHECK: movi v0.2d, #0000000000000000
143+
; CHECK-NEXT: mov x0, sp
144+
; CHECK-NEXT: str xzr, [sp, #32]
145+
; CHECK-NEXT: stp q0, q0, [sp]
148146
; CHECK-NEXT: bl something
149147
%buf = alloca [40 x i8], align 1
150148
%cast = bitcast [40 x i8]* %buf to i8*
@@ -167,16 +165,13 @@ define void @bzero_64_stack() {
167165
ret void
168166
}
169167

170-
; FIXME These don't pair up because the offset isn't a multiple of 16 bits. x0, however, could be used as a base for a paired store.
171168
define void @bzero_72_stack() {
172169
; CHECK-LABEL: bzero_72_stack:
173-
; CHECK: stp xzr, x30, [sp, #72]
174170
; CHECK: movi v0.2d, #0000000000000000
175-
; CHECK-NEXT: x0, sp, #8
176-
; CHECK-NEXT: stur q0, [sp, #56]
177-
; CHECK-NEXT: stur q0, [sp, #40]
178-
; CHECK-NEXT: stur q0, [sp, #24]
179-
; CHECK-NEXT: stur q0, [sp, #8]
171+
; CHECK-NEXT: mov x0, sp
172+
; CHECK-NEXT: str xzr, [sp, #64]
173+
; CHECK-NEXT: stp q0, q0, [sp, #32]
174+
; CHECK-NEXT: stp q0, q0, [sp]
180175
; CHECK-NEXT: bl something
181176
%buf = alloca [72 x i8], align 1
182177
%cast = bitcast [72 x i8]* %buf to i8*
@@ -310,14 +305,11 @@ define void @memset_26_stack() {
310305
ret void
311306
}
312307

313-
; FIXME This could use FP ops.
314308
define void @memset_32_stack() {
315309
; CHECK-LABEL: memset_32_stack:
316-
; CHECK: mov x8, #-6148914691236517206
310+
; CHECK: movi v0.16b, #170
317311
; CHECK-NEXT: mov x0, sp
318-
; CHECK-NEXT: stp x8, x30, [sp, #24]
319-
; CHECK-NEXT: stp x8, x8, [sp, #8]
320-
; CHECK-NEXT: str x8, [sp]
312+
; CHECK-NEXT: stp q0, q0, [sp]
321313
; CHECK-NEXT: bl something
322314
%buf = alloca [32 x i8], align 1
323315
%cast = bitcast [32 x i8]* %buf to i8*
@@ -326,14 +318,13 @@ define void @memset_32_stack() {
326318
ret void
327319
}
328320

329-
; FIXME This could use FP ops.
330321
define void @memset_40_stack() {
331322
; CHECK-LABEL: memset_40_stack:
332323
; CHECK: mov x8, #-6148914691236517206
333-
; CHECK-NEXT: add x0, sp, #8
334-
; CHECK-NEXT: stp x8, x30, [sp, #40]
335-
; CHECK-NEXT: stp x8, x8, [sp, #24]
336-
; CHECK-NEXT: stp x8, x8, [sp, #8]
324+
; CHECK-NEXT: movi v0.16b, #170
325+
; CHECK-NEXT: mov x0, sp
326+
; CHECK-NEXT: str x8, [sp, #32]
327+
; CHECK-NEXT: stp q0, q0, [sp]
337328
; CHECK-NEXT: bl something
338329
%buf = alloca [40 x i8], align 1
339330
%cast = bitcast [40 x i8]* %buf to i8*
@@ -342,16 +333,12 @@ define void @memset_40_stack() {
342333
ret void
343334
}
344335

345-
; FIXME This could use FP ops.
346336
define void @memset_64_stack() {
347337
; CHECK-LABEL: memset_64_stack:
348-
; CHECK: mov x8, #-6148914691236517206
338+
; CHECK: movi v0.16b, #170
349339
; CHECK-NEXT: mov x0, sp
350-
; CHECK-NEXT: stp x8, x30, [sp, #56]
351-
; CHECK-NEXT: stp x8, x8, [sp, #40]
352-
; CHECK-NEXT: stp x8, x8, [sp, #24]
353-
; CHECK-NEXT: stp x8, x8, [sp, #8]
354-
; CHECK-NEXT: str x8, [sp]
340+
; CHECK-NEXT: stp q0, q0, [sp, #32]
341+
; CHECK-NEXT: stp q0, q0, [sp]
355342
; CHECK-NEXT: bl something
356343
%buf = alloca [64 x i8], align 1
357344
%cast = bitcast [64 x i8]* %buf to i8*
@@ -360,16 +347,14 @@ define void @memset_64_stack() {
360347
ret void
361348
}
362349

363-
; FIXME This could use FP ops.
364350
define void @memset_72_stack() {
365351
; CHECK-LABEL: memset_72_stack:
366352
; CHECK: mov x8, #-6148914691236517206
367-
; CHECK-NEXT: add x0, sp, #8
368-
; CHECK-NEXT: stp x8, x30, [sp, #72]
369-
; CHECK-NEXT: stp x8, x8, [sp, #56]
370-
; CHECK-NEXT: stp x8, x8, [sp, #40]
371-
; CHECK-NEXT: stp x8, x8, [sp, #24]
372-
; CHECK-NEXT: stp x8, x8, [sp, #8]
353+
; CHECK-NEXT: movi v0.16b, #170
354+
; CHECK-NEXT: mov x0, sp
355+
; CHECK-NEXT: str x8, [sp, #64]
356+
; CHECK-NEXT: stp q0, q0, [sp, #32]
357+
; CHECK-NEXT: stp q0, q0, [sp]
373358
; CHECK-NEXT: bl something
374359
%buf = alloca [72 x i8], align 1
375360
%cast = bitcast [72 x i8]* %buf to i8*
@@ -378,20 +363,14 @@ define void @memset_72_stack() {
378363
ret void
379364
}
380365

381-
; FIXME This could use FP ops.
382366
define void @memset_128_stack() {
383367
; CHECK-LABEL: memset_128_stack:
384-
; CHECK: mov x8, #-6148914691236517206
368+
; CHECK: movi v0.16b, #170
385369
; CHECK-NEXT: mov x0, sp
386-
; CHECK-NEXT: stp x8, x30, [sp, #120]
387-
; CHECK-NEXT: stp x8, x8, [sp, #104]
388-
; CHECK-NEXT: stp x8, x8, [sp, #88]
389-
; CHECK-NEXT: stp x8, x8, [sp, #72]
390-
; CHECK-NEXT: stp x8, x8, [sp, #56]
391-
; CHECK-NEXT: stp x8, x8, [sp, #40]
392-
; CHECK-NEXT: stp x8, x8, [sp, #24]
393-
; CHECK-NEXT: stp x8, x8, [sp, #8]
394-
; CHECK-NEXT: str x8, [sp]
370+
; CHECK-NEXT: stp q0, q0, [sp, #96]
371+
; CHECK-NEXT: stp q0, q0, [sp, #64]
372+
; CHECK-NEXT: stp q0, q0, [sp, #32]
373+
; CHECK-NEXT: stp q0, q0, [sp]
395374
; CHECK-NEXT: bl something
396375
%buf = alloca [128 x i8], align 1
397376
%cast = bitcast [128 x i8]* %buf to i8*
@@ -400,27 +379,18 @@ define void @memset_128_stack() {
400379
ret void
401380
}
402381

403-
; FIXME This could use FP ops.
404382
define void @memset_256_stack() {
405383
; CHECK-LABEL: memset_256_stack:
406-
; CHECK: mov x8, #-6148914691236517206
407-
; CHECK-NEXT: mov x0, sp
408-
; CHECK-NEXT: stp x8, x8, [sp, #240]
409-
; CHECK-NEXT: stp x8, x8, [sp, #224]
410-
; CHECK-NEXT: stp x8, x8, [sp, #208]
411-
; CHECK-NEXT: stp x8, x8, [sp, #192]
412-
; CHECK-NEXT: stp x8, x8, [sp, #176]
413-
; CHECK-NEXT: stp x8, x8, [sp, #160]
414-
; CHECK-NEXT: stp x8, x8, [sp, #144]
415-
; CHECK-NEXT: stp x8, x8, [sp, #128]
416-
; CHECK-NEXT: stp x8, x8, [sp, #112]
417-
; CHECK-NEXT: stp x8, x8, [sp, #96]
418-
; CHECK-NEXT: stp x8, x8, [sp, #80]
419-
; CHECK-NEXT: stp x8, x8, [sp, #64]
420-
; CHECK-NEXT: stp x8, x8, [sp, #48]
421-
; CHECK-NEXT: stp x8, x8, [sp, #32]
422-
; CHECK-NEXT: stp x8, x8, [sp, #16]
423-
; CHECK-NEXT: stp x8, x8, [sp]
384+
; CHECK: movi v0.16b, #170
385+
; CHECK-NEXT: mov x0, sp
386+
; CHECK-NEXT: stp q0, q0, [sp, #224]
387+
; CHECK-NEXT: stp q0, q0, [sp, #192]
388+
; CHECK-NEXT: stp q0, q0, [sp, #160]
389+
; CHECK-NEXT: stp q0, q0, [sp, #128]
390+
; CHECK-NEXT: stp q0, q0, [sp, #96]
391+
; CHECK-NEXT: stp q0, q0, [sp, #64]
392+
; CHECK-NEXT: stp q0, q0, [sp, #32]
393+
; CHECK-NEXT: stp q0, q0, [sp]
424394
; CHECK-NEXT: bl something
425395
%buf = alloca [256 x i8], align 1
426396
%cast = bitcast [256 x i8]* %buf to i8*

0 commit comments

Comments
 (0)