Commit 79606ee

[AMDGPU] Check atomics aliasing in the clobbering annotation
MemorySSA considers any atomic a def for any operation it dominates, just like a barrier or fence. That is correct from a memory-state perspective, but it is not required for the no-clobber metadata, since we are not using it for reordering. Skip such atomics during the scan, just like a barrier, if they do not alias the load.

Differential Revision: https://reviews.llvm.org/D118661
1 parent: 4f67a90
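In essence, the pass now asks alias analysis whether an atomic def's address can overlap the load's address, and only treats the atomic as a clobber when it can. Below is a minimal sketch of that check pulled out as a free function; the helper name isSkippableAtomicDef is invented for illustration, while the AAResults::isNoAlias query and the dyn_cast pattern mirror the lambda this commit adds to isClobberedInFunction:

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Returns true if this MemoryDef can be skipped while scanning for clobbers
// of Load: its defining instruction is an atomic RMW or cmpxchg whose address
// provably does not alias the address the load reads.
static bool isSkippableAtomicDef(AAResults &AA, const MemoryDef *Def,
                                 const LoadInst *Load) {
  const Instruction *DefInst = Def->getMemoryInst();

  const Value *AtomicPtr = nullptr;
  if (auto *RMW = dyn_cast<AtomicRMWInst>(DefInst))
    AtomicPtr = RMW->getPointerOperand();
  else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(DefInst))
    AtomicPtr = CmpX->getPointerOperand();

  // Anything that is not an atomicrmw/cmpxchg keeps the conservative
  // treatment and still counts as a potential clobber.
  if (!AtomicPtr)
    return false;

  // NoAlias means the atomic cannot touch the memory the load reads.
  return AA.isNoAlias(AtomicPtr, Load->getPointerOperand());
}

Only atomicrmw and cmpxchg defs get this treatment in the patch; every other MemoryDef is still handled by the pass's existing barrier and clobber checks.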

2 files changed (+213, -1 lines)

llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp

Lines changed: 17 additions & 1 deletion
@@ -15,6 +15,7 @@
 #include "AMDGPU.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/InstVisitor.h"
@@ -31,6 +32,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
                        public InstVisitor<AMDGPUAnnotateUniformValues> {
   LegacyDivergenceAnalysis *DA;
   MemorySSA *MSSA;
+  AliasAnalysis *AA;
   DenseMap<Value*, GetElementPtrInst*> noClobberClones;
   bool isEntryFunc;
 
@@ -46,6 +48,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LegacyDivergenceAnalysis>();
     AU.addRequired<MemorySSAWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
     AU.setPreservesAll();
   }
 
@@ -60,6 +63,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                       "Add AMDGPU uniform metadata", false, false)
 INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
                     "Add AMDGPU uniform metadata", false, false)
 
@@ -78,7 +82,7 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
   SmallSet<MemoryAccess *, 8> Visited;
   MemoryLocation Loc(MemoryLocation::get(Load));
 
-  const auto isReallyAClobber = [](MemoryDef *Def) -> bool {
+  const auto isReallyAClobber = [this, Load](MemoryDef *Def) -> bool {
     Instruction *DefInst = Def->getMemoryInst();
     LLVM_DEBUG(dbgs() << " Def: " << *DefInst << '\n');
 
@@ -95,6 +99,17 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst *Load) {
       }
     }
 
+    // Ignore atomics not aliasing with the original load, any atomic is a
+    // universal MemoryDef from MSSA's point of view too, just like a fence.
+    const auto checkNoAlias = [this, Load](auto I) -> bool {
+      return I && AA->isNoAlias(I->getPointerOperand(),
+                                Load->getPointerOperand());
+    };
+
+    if (checkNoAlias(dyn_cast<AtomicCmpXchgInst>(DefInst)) ||
+        checkNoAlias(dyn_cast<AtomicRMWInst>(DefInst)))
+      return false;
+
     return true;
   };
 
@@ -197,6 +212,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
 
   DA = &getAnalysis<LegacyDivergenceAnalysis>();
   MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
 
   visit(F);

llvm/test/CodeGen/AMDGPU/noclobber-barrier.ll

Lines changed: 196 additions & 0 deletions
@@ -434,5 +434,201 @@ entry:
   ret void
 }
 
+; GCN-LABEL: {{^}}no_alias_atomic_rmw_relaxed:
+; GCN: ds_add_u32
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_atomic_rmw_relaxed(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @no_alias_atomic_rmw_relaxed(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic, align 4
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 monotonic
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_alias_atomic_cmpxchg:
+; GCN: ds_cmpst_b32
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
+; CHECK-LABEL: @no_alias_atomic_cmpxchg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %unused = cmpxchg i32 addrspace(3)* @LDS, i32 7, i32 %swap seq_cst monotonic
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_alias_atomic_rmw:
+; GCN: ds_add_u32
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @no_alias_atomic_rmw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}may_alias_atomic_cmpxchg:
+; GCN: global_atomic_cmpswap
+; GCN: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @may_alias_atomic_cmpxchg(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 %swap) {
+; CHECK-LABEL: @may_alias_atomic_cmpxchg(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UNUSED:%.*]] = cmpxchg i32 addrspace(1)* [[OUT:%.*]], i32 7, i32 [[SWAP:%.*]] seq_cst monotonic, align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %unused = cmpxchg i32 addrspace(1)* %out, i32 7, i32 %swap seq_cst monotonic
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}may_alias_atomic_rmw:
+; GCN: global_atomic_add
+; GCN: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @may_alias_atomic_rmw(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+; CHECK-LABEL: @may_alias_atomic_rmw(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(1)* [[OUT:%.*]], i32 5 seq_cst, align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %unused = atomicrmw add i32 addrspace(1)* %out, i32 5 seq_cst
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_clobber:
+; CGN: global_store_dword
+; CGN: global_store_dword
+; GCN: ds_add_u32
+; GCN: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_atomic_rmw_then_clobber(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
+; CHECK-LABEL: @no_alias_atomic_rmw_then_clobber(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i32 1, i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i32 1, i32 addrspace(1)* %out, align 4
+  store i32 2, i32 addrspace(1)* %noalias, align 4
+  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}no_alias_atomic_rmw_then_no_alias_store:
+; CGN: global_store_dword
+; GCN: ds_add_u32
+; GCN: s_load_dword s
+; GCN-NOT: global_load_dword
+; GCN: global_store_dword
+define protected amdgpu_kernel void @no_alias_atomic_rmw_then_no_alias_store(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i32 addrspace(1)* noalias %noalias) {
+; CHECK-LABEL: @no_alias_atomic_rmw_then_no_alias_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    store i32 2, i32 addrspace(1)* [[NOALIAS:%.*]], align 4
+; CHECK-NEXT:    [[UNUSED:%.*]] = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst, align 4
+; CHECK-NEXT:    fence syncscope("workgroup") release
+; CHECK-NEXT:    tail call void @llvm.amdgcn.s.barrier()
+; CHECK-NEXT:    fence syncscope("workgroup") acquire
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32 addrspace(1)* [[IN:%.*]], i64 0, !amdgpu.uniform !0, !amdgpu.noclobber !0
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32 addrspace(1)* [[GEP]], align 4
+; CHECK-NEXT:    store i32 [[LD]], i32 addrspace(1)* [[OUT:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  store i32 2, i32 addrspace(1)* %noalias, align 4
+  %unused = atomicrmw add i32 addrspace(3)* @LDS, i32 5 seq_cst
+  fence syncscope("workgroup") release
+  tail call void @llvm.amdgcn.s.barrier()
+  fence syncscope("workgroup") acquire
+  %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 0
+  %ld = load i32, i32 addrspace(1)* %gep, align 4
+  store i32 %ld, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 declare void @llvm.amdgcn.s.barrier()
 declare void @llvm.amdgcn.wave.barrier()
