Skip to content

Commit 8fcb126

Browse files
authored
[PreISelIntrinsicLowering] Produce a memset_pattern16 libcall for llvm.experimental.memset.pattern when available (#120420)
This is to enable a transition of LoopIdiomRecognize to selecting the llvm.experimental.memset.pattern intrinsic as requested in #118632 (as opposed to supporting selection of the libcall or the intrinsic). As such, although it _is_ a TODO to add costing considerations on whether to lower to the libcall (when available) or expand directly, lacking such logic is helpful at this stage in order to minimise any unexpected code gen changes in this transition.
1 parent dd3edc8 commit 8fcb126

File tree

2 files changed

+261
-1
lines changed

2 files changed

+261
-1
lines changed

llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp

Lines changed: 105 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "llvm/Support/Casting.h"
3333
#include "llvm/Target/TargetMachine.h"
3434
#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
35+
#include "llvm/Transforms/Utils/BuildLibCalls.h"
3536
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
3637
#include "llvm/Transforms/Utils/LowerVectorIntrinsics.h"
3738

@@ -233,6 +234,60 @@ static bool canEmitLibcall(const TargetMachine *TM, Function *F,
233234
return TLI->getLibcallName(LC) != nullptr;
234235
}
235236

237+
// Return a value appropriate for use with the memset_pattern16 libcall, if
238+
// possible and if we know how. (Adapted from equivalent helper in
239+
// LoopIdiomRecognize).
240+
static Constant *getMemSetPattern16Value(MemSetPatternInst *Inst,
241+
const TargetLibraryInfo &TLI) {
242+
// TODO: This could check for UndefValue because it can be merged into any
243+
// other valid pattern.
244+
245+
// Don't emit libcalls if a non-default address space is being used.
246+
if (Inst->getRawDest()->getType()->getPointerAddressSpace() != 0)
247+
return nullptr;
248+
249+
Value *V = Inst->getValue();
250+
Type *VTy = V->getType();
251+
const DataLayout &DL = Inst->getDataLayout();
252+
Module *M = Inst->getModule();
253+
254+
if (!isLibFuncEmittable(M, &TLI, LibFunc_memset_pattern16))
255+
return nullptr;
256+
257+
// If the value isn't a constant, we can't promote it to being in a constant
258+
// array. We could theoretically do a store to an alloca or something, but
259+
// that doesn't seem worthwhile.
260+
Constant *C = dyn_cast<Constant>(V);
261+
if (!C || isa<ConstantExpr>(C))
262+
return nullptr;
263+
264+
// Only handle simple values that are a power of two bytes in size.
265+
uint64_t Size = DL.getTypeSizeInBits(VTy);
266+
if (!DL.typeSizeEqualsStoreSize(VTy) || !isPowerOf2_64(Size))
267+
return nullptr;
268+
269+
// Don't care enough about darwin/ppc to implement this.
270+
if (DL.isBigEndian())
271+
return nullptr;
272+
273+
// Convert to size in bytes.
274+
Size /= 8;
275+
276+
// TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
277+
// if the top and bottom are the same (e.g. for vectors and large integers).
278+
if (Size > 16)
279+
return nullptr;
280+
281+
// If the constant is exactly 16 bytes, just use it.
282+
if (Size == 16)
283+
return C;
284+
285+
// Otherwise, we'll use an array of the constants.
286+
uint64_t ArraySize = 16 / Size;
287+
ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
288+
return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
289+
}
290+
236291
// TODO: Handle atomic memcpy and memcpy.inline
237292
// TODO: Pass ScalarEvolution
238293
bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
@@ -323,7 +378,56 @@ bool PreISelIntrinsicLowering::expandMemIntrinsicUses(Function &F) const {
323378
}
324379
case Intrinsic::experimental_memset_pattern: {
325380
auto *Memset = cast<MemSetPatternInst>(Inst);
326-
expandMemSetPatternAsLoop(Memset);
381+
const TargetLibraryInfo &TLI = LookupTLI(*Memset->getFunction());
382+
Constant *PatternValue = getMemSetPattern16Value(Memset, TLI);
383+
if (!PatternValue) {
384+
// If it isn't possible to emit a memset_pattern16 libcall, expand to
385+
// a loop instead.
386+
expandMemSetPatternAsLoop(Memset);
387+
Changed = true;
388+
Memset->eraseFromParent();
389+
break;
390+
}
391+
// FIXME: There is currently no profitability calculation for emitting
392+
// the libcall vs expanding the memset.pattern directly.
393+
IRBuilder<> Builder(Inst);
394+
Module *M = Memset->getModule();
395+
const DataLayout &DL = Memset->getDataLayout();
396+
397+
StringRef FuncName = "memset_pattern16";
398+
FunctionCallee MSP = getOrInsertLibFunc(
399+
M, TLI, LibFunc_memset_pattern16, Builder.getVoidTy(),
400+
Memset->getRawDest()->getType(), Builder.getPtrTy(),
401+
Memset->getLength()->getType());
402+
inferNonMandatoryLibFuncAttrs(M, FuncName, TLI);
403+
404+
// Otherwise we should form a memset_pattern16. PatternValue is known
405+
// to be a constant array of 16-bytes. Put the value into a mergeable
406+
// global.
407+
assert(Memset->getRawDest()->getType()->getPointerAddressSpace() == 0 &&
408+
"Should have skipped if non-zero AS");
409+
GlobalVariable *GV = new GlobalVariable(
410+
*M, PatternValue->getType(), /*isConstant=*/true,
411+
GlobalValue::PrivateLinkage, PatternValue, ".memset_pattern");
412+
GV->setUnnamedAddr(
413+
GlobalValue::UnnamedAddr::Global); // Ok to merge these.
414+
// TODO: Consider relaxing alignment requirement.
415+
GV->setAlignment(Align(16));
416+
Value *PatternPtr = GV;
417+
Value *NumBytes = Builder.CreateMul(
418+
Builder.getInt64(DL.getTypeSizeInBits(Memset->getValue()->getType()) /
419+
8),
420+
Memset->getLength());
421+
CallInst *MemsetPattern16Call =
422+
Builder.CreateCall(MSP, {Memset->getRawDest(), PatternPtr, NumBytes});
423+
MemsetPattern16Call->setAAMetadata(Memset->getAAMetadata());
424+
// Preserve any call site attributes on the destination pointer
425+
// argument (e.g. alignment).
426+
AttrBuilder ArgAttrs(Memset->getContext(),
427+
Memset->getAttributes().getParamAttrs(0));
428+
MemsetPattern16Call->setAttributes(
429+
MemsetPattern16Call->getAttributes().addParamAttributes(
430+
Memset->getContext(), 0, ArgAttrs));
327431
Changed = true;
328432
Memset->eraseFromParent();
329433
break;
Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
2+
; RUN: opt -mtriple=x86_64-apple-darwin10.0.0 -passes=pre-isel-intrinsic-lowering -S -o - %s | FileCheck %s
3+
4+
;.
5+
; CHECK: @.memset_pattern = private unnamed_addr constant [2 x i64] [i64 -6148895925951734307, i64 -6148895925951734307], align 16
6+
; CHECK: @.memset_pattern.1 = private unnamed_addr constant [2 x i64] [i64 4614256656552045848, i64 4614256656552045848], align 16
7+
; CHECK: @.memset_pattern.2 = private unnamed_addr constant [8 x i16] [i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555, i16 -21555], align 16
8+
; CHECK: @.memset_pattern.3 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
9+
; CHECK: @.memset_pattern.4 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
10+
; CHECK: @.memset_pattern.5 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
11+
; CHECK: @.memset_pattern.6 = private unnamed_addr constant i128 -113427455635030943652277463699152839203, align 16
12+
;.
13+
; The pattern is the runtime argument %value rather than a constant, so the expected output is an explicit store loop and no memset_pattern16 call.
define void @memset_pattern_i128_1_dynvalue(ptr %a, i128 %value) nounwind {
14+
; CHECK-LABEL: define void @memset_pattern_i128_1_dynvalue(
15+
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0:[0-9]+]] {
16+
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
17+
; CHECK: [[LOADSTORELOOP]]:
18+
; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
19+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr [[A]], i64 [[TMP1]]
20+
; CHECK-NEXT: store i128 [[VALUE]], ptr [[TMP2]], align 1
21+
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
22+
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
23+
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
24+
; CHECK: [[SPLIT]]:
25+
; CHECK-NEXT: ret void
26+
;
27+
tail call void @llvm.experimental.memset.pattern(ptr %a, i128 %value, i64 1, i1 false)
28+
ret void
29+
}
30+
31+
; A constant i128 pattern with a count of 1 is expected to become a memset_pattern16 call that writes 16 bytes from a private constant global.
define void @memset_pattern_i128_1(ptr %a, i128 %value) nounwind {
32+
; CHECK-LABEL: define void @memset_pattern_i128_1(
33+
; CHECK-SAME: ptr [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
34+
; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.3, i64 16)
35+
; CHECK-NEXT: ret void
36+
;
37+
tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
38+
ret void
39+
}
40+
41+
; The destination pointer is in addrspace(1); the expected output is a store loop rather than a memset_pattern16 call.
define void @memset_pattern_i128_1_nz_as(ptr addrspace(1) %a, i128 %value) nounwind {
42+
; CHECK-LABEL: define void @memset_pattern_i128_1_nz_as(
43+
; CHECK-SAME: ptr addrspace(1) [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
44+
; CHECK-NEXT: br i1 false, label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
45+
; CHECK: [[LOADSTORELOOP]]:
46+
; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], %[[LOADSTORELOOP]] ]
47+
; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i128, ptr addrspace(1) [[A]], i64 [[TMP1]]
48+
; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr addrspace(1) [[TMP2]], align 1
49+
; CHECK-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
50+
; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1
51+
; CHECK-NEXT: br i1 [[TMP4]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
52+
; CHECK: [[SPLIT]]:
53+
; CHECK-NEXT: ret void
54+
;
55+
tail call void @llvm.experimental.memset.pattern(ptr addrspace(1) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
56+
ret void
57+
}
58+
59+
; The align(16) attribute on the destination argument of the intrinsic call is expected to be carried over to the memset_pattern16 call.
define void @memset_pattern_i128_1_align_attr(ptr align(16) %a, i128 %value) nounwind {
60+
; CHECK-LABEL: define void @memset_pattern_i128_1_align_attr(
61+
; CHECK-SAME: ptr align 16 [[A:%.*]], i128 [[VALUE:%.*]]) #[[ATTR0]] {
62+
; CHECK-NEXT: call void @memset_pattern16(ptr align 16 [[A]], ptr @.memset_pattern.4, i64 16)
63+
; CHECK-NEXT: ret void
64+
;
65+
tail call void @llvm.experimental.memset.pattern(ptr align(16) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 1, i1 false)
66+
ret void
67+
}
68+
69+
; A constant count of 16 i128 elements: the expected byte count passed to memset_pattern16 is 256.
define void @memset_pattern_i128_16(ptr %a) nounwind {
70+
; CHECK-LABEL: define void @memset_pattern_i128_16(
71+
; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
72+
; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.5, i64 256)
73+
; CHECK-NEXT: ret void
74+
;
75+
tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 16, i1 false)
76+
ret void
77+
}
78+
79+
; A runtime count %x with an i128 pattern: the expected byte count is computed as 16 * %x before the memset_pattern16 call.
define void @memset_pattern_i128_x(ptr %a, i64 %x) nounwind {
80+
; CHECK-LABEL: define void @memset_pattern_i128_x(
81+
; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
82+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 16, [[X]]
83+
; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.6, i64 [[TMP1]])
84+
; CHECK-NEXT: ret void
85+
;
86+
tail call void @llvm.experimental.memset.pattern(ptr %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %x, i1 false)
87+
ret void
88+
}
89+
90+
; A runtime count %x with an addrspace(10) destination: the expected output is a store loop guarded by a zero-count test, not a memset_pattern16 call.
define void @memset_pattern_i128_x_nonzero_as(ptr addrspace(10) %a, i64 %x) nounwind {
91+
; CHECK-LABEL: define void @memset_pattern_i128_x_nonzero_as(
92+
; CHECK-SAME: ptr addrspace(10) [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
93+
; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 0, [[X]]
94+
; CHECK-NEXT: br i1 [[TMP1]], label %[[SPLIT:.*]], label %[[LOADSTORELOOP:.*]]
95+
; CHECK: [[LOADSTORELOOP]]:
96+
; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], %[[LOADSTORELOOP]] ]
97+
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i128, ptr addrspace(10) [[A]], i64 [[TMP2]]
98+
; CHECK-NEXT: store i128 -113427455635030943652277463699152839203, ptr addrspace(10) [[TMP3]], align 1
99+
; CHECK-NEXT: [[TMP4]] = add i64 [[TMP2]], 1
100+
; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], [[X]]
101+
; CHECK-NEXT: br i1 [[TMP5]], label %[[LOADSTORELOOP]], label %[[SPLIT]]
102+
; CHECK: [[SPLIT]]:
103+
; CHECK-NEXT: ret void
104+
;
105+
tail call void @llvm.experimental.memset.pattern(ptr addrspace(10) %a, i128 u0xaaaaaaaabbbbbbbbccccccccdddddddd, i64 %x, i1 false)
106+
ret void
107+
}
108+
109+
; An i16 pattern with a runtime count: the call is expected to reference the [8 x i16] global @.memset_pattern.2, with the byte count computed as 2 * %x.
define void @memset_pattern_i16_x(ptr %a, i64 %x) nounwind {
110+
; CHECK-LABEL: define void @memset_pattern_i16_x(
111+
; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
112+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 2, [[X]]
113+
; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.2, i64 [[TMP1]])
114+
; CHECK-NEXT: ret void
115+
;
116+
tail call void @llvm.experimental.memset.pattern(ptr %a, i16 u0xabcd, i64 %x, i1 false)
117+
ret void
118+
}
119+
120+
; An i64 pattern with a runtime count: the call is expected to reference the [2 x i64] global @.memset_pattern, with the byte count computed as 8 * %x.
define void @memset_pattern_i64_x(ptr %a, i64 %x) nounwind {
121+
; CHECK-LABEL: define void @memset_pattern_i64_x(
122+
; CHECK-SAME: ptr [[A:%.*]], i64 [[X:%.*]]) #[[ATTR0]] {
123+
; CHECK-NEXT: [[TMP1:%.*]] = mul i64 8, [[X]]
124+
; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern, i64 [[TMP1]])
125+
; CHECK-NEXT: ret void
126+
;
127+
tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0xaaaabbbbccccdddd, i64 %x, i1 false)
128+
ret void
129+
}
130+
131+
; Demonstrate that TBAA metadata attached to the intrinsic call is preserved on
; the resulting memset_pattern16 call (128 i64 elements, i.e. 1024 bytes).
132+
define void @memset_pattern_i64_128_tbaa(ptr %a) nounwind {
133+
; CHECK-LABEL: define void @memset_pattern_i64_128_tbaa(
134+
; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
135+
; CHECK-NEXT: call void @memset_pattern16(ptr [[A]], ptr @.memset_pattern.1, i64 1024), !tbaa [[TBAA0:![0-9]+]]
136+
; CHECK-NEXT: ret void
137+
;
138+
tail call void @llvm.experimental.memset.pattern(ptr %a, i64 u0x400921fb54442d18, i64 128, i1 false), !tbaa !5
139+
ret void
140+
}
141+
142+
!5 = !{!6, !6, i64 0}
143+
!6 = !{!"double", !7, i64 0}
144+
!7 = !{!"omnipotent char", !8, i64 0}
145+
!8 = !{!"Simple C++ TBAA"}
146+
147+
;.
148+
; CHECK: attributes #[[ATTR0]] = { nounwind }
149+
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: write) }
150+
; CHECK: attributes #[[ATTR2:[0-9]+]] = { nofree nounwind willreturn memory(argmem: readwrite) }
151+
;.
152+
; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META1]], i64 0}
153+
; CHECK: [[META1]] = !{!"double", [[META2:![0-9]+]], i64 0}
154+
; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]], i64 0}
155+
; CHECK: [[META3]] = !{!"Simple C++ TBAA"}
156+
;.

0 commit comments

Comments
 (0)