[AMDGPU] Correctly merge noalias scopes during lowering of LDS data. #131664
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-amdgpu

Author: Sirish Pande (srpande)

Changes

Currently, if there is already noalias metadata present on loads and stores, the lower-module-LDS pass generates a more conservative aliasing set. This inhibits scheduling intrinsics that would otherwise have produced a better-pipelined instruction.

The fix is not to always intersect already existing noalias metadata with the noalias metadata created for LDS lowering, but to intersect only if the noalias scopes are from the same domain; otherwise, concatenate the existing noalias sets with the LDS noalias set.

A few patches touching scoped-AA have landed in the past; the following three should be enough background information:

https://reviews.llvm.org/D91576
https://reviews.llvm.org/D108315
https://reviews.llvm.org/D110049

Essentially, after a pass that might change aliasing info, one should check whether that pass changes the number of MayAlias or ModRef results, using the following:

opt -S -aa-pipeline=basic-aa,scoped-noalias-aa -passes=aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output

Full diff: https://github.com/llvm/llvm-project/pull/131664.diff

8 Files Affected:
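As background for the diff below, here is a minimal, hypothetical IR sketch (function name and metadata numbering invented) of the structure scoped-noalias metadata uses: every scope names the domain it belongs to, and this patch compares the domains behind two !noalias lists to decide between intersecting and concatenating them.

define void @sketch(ptr %p, ptr %q) {
  ; %p is read in scope X and promises not to alias scope Y; both scopes
  ; live in domain A, so intersecting two domain-A noalias lists is sound.
  %v = load i32, ptr %p, !alias.scope !5, !noalias !6
  ; %q's noalias scope lives in a different domain (B). Intersecting !7
  ; with a domain-A list would yield the empty list !{} and lose all
  ; information, which is why the patch concatenates across domains.
  store i32 %v, ptr %q, !noalias !7
  ret void
}

!0 = distinct !{!0}      ; domain A
!1 = distinct !{!1}      ; domain B
!2 = distinct !{!2, !0}  ; scope X, member of domain A
!3 = distinct !{!3, !0}  ; scope Y, member of domain A
!4 = distinct !{!4, !1}  ; scope Z, member of domain B
!5 = !{!2}               ; alias.scope list: {X}
!6 = !{!3}               ; noalias list: {Y}
!7 = !{!4}               ; noalias list: {Z}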
diff --git a/llvm/include/llvm/Analysis/ScopedNoAliasAA.h b/llvm/include/llvm/Analysis/ScopedNoAliasAA.h
index f6ade7c83a61a..4f658028d5e86 100644
--- a/llvm/include/llvm/Analysis/ScopedNoAliasAA.h
+++ b/llvm/include/llvm/Analysis/ScopedNoAliasAA.h
@@ -43,6 +43,9 @@ class ScopedNoAliasAAResult : public AAResultBase {
ModRefInfo getModRefInfo(const CallBase *Call1, const CallBase *Call2,
AAQueryInfo &AAQI);
+ //SmallPtrSet <const MDNode *, 16> getScopedDomains(const MDNode *NoAlias) const;
+ void collectScopedDomains(const MDNode *NoAlias,
+ SmallPtrSetImpl<const MDNode*> &Domains) const;
private:
bool mayAliasInScopes(const MDNode *Scopes, const MDNode *NoAlias) const;
};
diff --git a/llvm/lib/Analysis/ScopedNoAliasAA.cpp b/llvm/lib/Analysis/ScopedNoAliasAA.cpp
index 3815bdf49d59c..4792837206c85 100644
--- a/llvm/lib/Analysis/ScopedNoAliasAA.cpp
+++ b/llvm/lib/Analysis/ScopedNoAliasAA.cpp
@@ -114,6 +114,18 @@ static void collectMDInDomain(const MDNode *List, const MDNode *Domain,
Nodes.insert(MD);
}
+// Collect the set of scope domains relevant to the noalias scopes.
+void ScopedNoAliasAAResult::collectScopedDomains(const MDNode *NoAlias,
+ SmallPtrSetImpl<const MDNode*> &Domains) const {
+ if (!NoAlias)
+ return;
+ assert(Domains.empty() && "Domains should be empty");
+ for (const MDOperand &MDOp : NoAlias->operands())
+ if (const MDNode *NAMD = dyn_cast<MDNode>(MDOp))
+ if (const MDNode *Domain = AliasScopeNode(NAMD).getDomain())
+ Domains.insert(Domain);
+}
+
bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes,
const MDNode *NoAlias) const {
if (!Scopes || !NoAlias)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 55497c837ee23..4ec1c3f20e0d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -185,6 +185,7 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
@@ -244,6 +245,7 @@ template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
class AMDGPULowerModuleLDS {
const AMDGPUTargetMachine &TM;
+ friend class ScopedNoAliasAAResult;
static void
removeLocalVarsFromUsedLists(Module &M,
@@ -1424,14 +1426,12 @@ class AMDGPULowerModuleLDS {
Align A =
commonAlignment(Replacement.SGV->getAlign().valueOrOne(), Offset);
-
if (I)
NoAliasList[I - 1] = AliasScopes[I - 1];
MDNode *NoAlias =
NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
MDNode *AliasScope =
AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
-
refineUsesAlignmentAndAA(GEP, A, DL, AliasScope, NoAlias);
}
}
@@ -1442,16 +1442,27 @@ class AMDGPULowerModuleLDS {
if (!MaxDepth || (A == 1 && !AliasScope))
return;
+ ScopedNoAliasAAResult ScopedNoAlias;
+
for (User *U : Ptr->users()) {
if (auto *I = dyn_cast<Instruction>(U)) {
if (AliasScope && I->mayReadOrWriteMemory()) {
MDNode *AS = I->getMetadata(LLVMContext::MD_alias_scope);
- AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope)
- : AliasScope);
+ AS = (AS ? MDNode::getMostGenericAliasScope(AS, AliasScope) : AliasScope);
I->setMetadata(LLVMContext::MD_alias_scope, AS);
MDNode *NA = I->getMetadata(LLVMContext::MD_noalias);
- NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
+ // If domain of NoAlias (domain of LDS structure) is different
+ // than existing NA, we need to preserve exising !NoAlias
+ SmallPtrSet<const MDNode *, 16> ExistingDomains, LDSDomains;
+ ScopedNoAlias.collectScopedDomains(NA, ExistingDomains);
+ ScopedNoAlias.collectScopedDomains(NoAlias, LDSDomains);
+ auto Diff = set_difference(ExistingDomains, LDSDomains);
+ if (Diff.empty()) {
+ NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias);
+ } else {
+ NA = (NA ? MDNode::concatenate(NA, NoAlias) : NoAlias);
+ }
I->setMetadata(LLVMContext::MD_noalias, NA);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
index eefa0b23d0c08..92d0a05f35732 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-kernel-and-module-lds.ll
@@ -84,7 +84,7 @@ define amdgpu_kernel void @calls_f0() {
define void @f0() {
; CHECK-LABEL: define void @f0()
; CHECK-NEXT: store i8 1, ptr addrspace(3) getelementptr inbounds (%llvm.amdgcn.module.lds.t, ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 8, !noalias !24
-; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !24
+; CHECK-NEXT: store i8 8, ptr addrspace(3) @llvm.amdgcn.module.lds, align 8, !noalias !29
; CHECK-NEXT: ret void
store i8 1, ptr addrspace(3) @lds.size.1.align.1, align 1
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
index bb09d3a670bc9..154c798a44f93 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll
@@ -12,9 +12,9 @@ define amdgpu_kernel void @no_clobber_ds_load_stores_x2_preexisting_aa(ptr addrs
; CHECK-NEXT: store i32 1, ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, align 16, !tbaa [[TBAA1:![0-9]+]], !noalias !6
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 [[I]]
; CHECK-NEXT: [[VAL_A:%.*]] = load i32, ptr addrspace(3) [[GEP_A]], align 4, !tbaa [[TBAA1]], !noalias !6
-; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa [[TBAA1]], !noalias !6
+; CHECK-NEXT: store i32 2, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), align 16, !tbaa [[TBAA1]], !noalias !11
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_NO_CLOBBER_DS_LOAD_STORES_X2_PREEXISTING_AA_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.no_clobber_ds_load_stores_x2_preexisting_aa.lds, i32 0, i32 1), i32 0, i32 [[I]]
-; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !noalias !6
+; CHECK-NEXT: [[VAL_B:%.*]] = load i32, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !noalias !11
; CHECK-NEXT: [[VAL:%.*]] = add i32 [[VAL_A]], [[VAL_B]]
; CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[ARG]], align 4
; CHECK-NEXT: ret void
@@ -48,4 +48,11 @@ bb:
; CHECK:!3 = !{!"int", !4, i64 0}
; CHECK:!4 = !{!"omnipotent char", !5, i64 0}
; CHECK:!5 = !{!"Simple C++ TBAA"}
-; CHECK:!6 = !{}
+; CHECK:!6 = !{!7, !9}
+; CHECK:!7 = distinct !{!7, !8}
+; CHECK:!8 = distinct !{!8}
+; CHECK:!9 = distinct !{!9, !10}
+; CHECK:!10 = distinct !{!10}
+; CHECK:!11 = !{!12, !13}
+; CHECK:!12 = distinct !{!12, !8}
+; CHECK:!13 = distinct !{!13, !10}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll
new file mode 100644
index 0000000000000..357f42646617f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
+
+@a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+define amdgpu_kernel void @ds_load_stores_aainfo(ptr addrspace(1) %arg, i32 %i) {
+; GCN-LABEL: ds_load_stores_aainfo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s0, s0, 2
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: ds_read2_b32 v[2:3], v4 offset1:1
+; GCN-NEXT: ds_write_b64 v1, v[0:1] offset:512
+; GCN-NEXT: ds_read2_b32 v[4:5], v4 offset0:64 offset1:65
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GCN-NEXT: s_endpgm
+bb:
+ %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i
+ %gep.b = getelementptr inbounds [64 x i32], ptr addrspace(3) @b, i32 0, i32 %i
+
+ %val.a = load i64, ptr addrspace(3) %gep.a, align 4, !tbaa !0, !alias.scope !6, !noalias !5
+ %val.b = load i64, ptr addrspace(3) %gep.b, align 4, !tbaa !0, !alias.scope !6, !noalias !5
+
+ store i64 1, ptr addrspace(3) @c, align 4, !tbaa !0, !noalias !2
+
+ %val = add i64 %val.a, %val.b
+ store i64 %val, ptr addrspace(1) %arg, align 4
+
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+ ret void
+}
+
+ !0 = !{!"omnipotent char", !1, i64 0}
+ !1 = !{!1}
+ !2 = !{!3}
+ !3 = distinct !{!3, !4}
+ !4 = distinct !{!4}
+ !5 = !{!3}
+ !6 = !{!7}
+ !7 = !{!7, !4}
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-with-noalias.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-with-noalias.ll
new file mode 100644
index 0000000000000..b90fa756b2970
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-with-noalias.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 --amdgpu-lower-module-lds-strategy=module < %s | FileCheck -check-prefix=GCN %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
+
+@a = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@b = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+@c = internal unnamed_addr addrspace(3) global [64 x i32] undef, align 4
+
+define amdgpu_kernel void @ds_load_stores_aainfo(ptr addrspace(1) %arg, i32 %i) {
+; GCN-LABEL: ds_load_stores_aainfo:
+; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GCN-NEXT: v_mov_b32_e32 v0, 1
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: s_lshl_b32 s0, s0, 2
+; GCN-NEXT: v_mov_b32_e32 v4, s0
+; GCN-NEXT: ds_read2_b32 v[2:3], v4 offset1:1
+; GCN-NEXT: ds_write_b64 v1, v[0:1] offset:512
+; GCN-NEXT: ds_read2_b32 v[4:5], v4 offset0:64 offset1:65
+; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
+; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4
+; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GCN-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1]
+; GCN-NEXT: s_endpgm
+; CHECK-LABEL: define amdgpu_kernel void @ds_load_stores_aainfo(
+; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[BB:.*:]]
+; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 [[I]]
+; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_DS_LOAD_STORES_AAINFO_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 1), i32 0, i32 [[I]]
+; CHECK-NEXT: [[VAL_A:%.*]] = load i64, ptr addrspace(3) [[GEP_A]], align 4, !tbaa [[TBAA1:![0-9]+]], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT: [[VAL_B:%.*]] = load i64, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !alias.scope [[META12:![0-9]+]], !noalias [[META13:![0-9]+]]
+; CHECK-NEXT: store i64 1, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_DS_LOAD_STORES_AAINFO_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 2), align 16, !tbaa [[TBAA1]], !alias.scope [[META14:![0-9]+]], !noalias [[META15:![0-9]+]]
+; CHECK-NEXT: [[VAL:%.*]] = add i64 [[VAL_A]], [[VAL_B]]
+; CHECK-NEXT: store i64 [[VAL]], ptr addrspace(1) [[ARG]], align 4
+; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+; CHECK-NEXT: ret void
+;
+bb:
+ %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i
+ %gep.b = getelementptr inbounds [64 x i32], ptr addrspace(3) @b, i32 0, i32 %i
+
+ %val.a = load i64, ptr addrspace(3) %gep.a, align 4, !tbaa !0, !noalias !5
+ %val.b = load i64, ptr addrspace(3) %gep.b, align 4, !tbaa !0, !noalias !5
+
+ store i64 1, ptr addrspace(3) @c, align 4, !tbaa !0, !noalias !2
+
+ %val = add i64 %val.a, %val.b
+ store i64 %val, ptr addrspace(1) %arg, align 4
+
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+ tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+ ret void
+}
+
+ !0 = !{!"omnipotent char", !1, i64 0}
+ !1 = !{!1}
+ !2 = !{!3}
+ !3 = distinct !{!3, !4}
+ !4 = distinct !{!4}
+ !5 = !{!3}
+;.
+; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0, i64 0}
+; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]}
+; CHECK: [[META3]] = distinct !{[[META3]]}
+; CHECK: [[META4]] = !{[[META5:![0-9]+]]}
+; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]}
+; CHECK: [[META6]] = distinct !{[[META6]]}
+; CHECK: [[META7]] = !{[[META8:![0-9]+]], [[META10:![0-9]+]], [[META11:![0-9]+]]}
+; CHECK: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]]}
+; CHECK: [[META9]] = distinct !{[[META9]]}
+; CHECK: [[META10]] = distinct !{[[META10]], [[META6]]}
+; CHECK: [[META11]] = distinct !{[[META11]], [[META6]]}
+; CHECK: [[META12]] = !{[[META10]]}
+; CHECK: [[META13]] = !{[[META8]], [[META5]], [[META11]]}
+; CHECK: [[META14]] = !{[[META11]]}
+; CHECK: [[META15]] = !{[[META8]], [[META5]], [[META10]]}
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll
index 96e8099ed59e1..e7f78b4c6897a 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-single-var-unambiguous.ll
@@ -60,7 +60,7 @@ define void @f0() {
define amdgpu_kernel void @k_f0() {
; MODULE-LABEL: @k_f0(
-; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META5:![0-9]+]], !noalias [[META1]]
+; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META10:![0-9]+]], !noalias [[META1]]
; MODULE-NEXT: call void @f0()
; MODULE-NEXT: ret void
;
@@ -83,9 +83,9 @@ define amdgpu_kernel void @k_f0() {
@both.lds = addrspace(3) global i32 poison
define void @f_both() {
; MODULE-LABEL: @f_both(
-; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]]
+; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META10]], !noalias [[META11:![0-9]+]]
; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 4
-; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META4]]
+; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META10]], !noalias [[META11]]
; MODULE-NEXT: ret void
;
; TABLE-LABEL: @f_both(
@@ -116,9 +116,9 @@ define void @f_both() {
define amdgpu_kernel void @k0_both() {
; MODULE-LABEL: @k0_both(
; MODULE-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
-; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]]
+; MODULE-NEXT: [[LD:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META10]], !noalias [[META1]]
; MODULE-NEXT: [[MUL:%.*]] = mul i32 [[LD]], 5
-; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META5]], !noalias [[META1]]
+; MODULE-NEXT: store i32 [[MUL]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !alias.scope [[META10]], !noalias [[META1]]
; MODULE-NEXT: call void @f_both()
; MODULE-NEXT: ret void
;
✅ With the latest revision this PR passed the C/C++ code formatter.
✅ With the latest revision this PR passed the undef deprecator.
!verify
Does upstream Buildbot work that way as well?
@@ -244,6 +245,7 @@ template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {

class AMDGPULowerModuleLDS {
  const AMDGPUTargetMachine &TM;
  friend class ScopedNoAliasAAResult;
Don't see how this could be used
No need. I was playing with something else. It isn't needed anymore.
The premise is reasonable - the aliasing information in LDS lowering might be suboptimal. The variables all go in different slots of a struct, though, so no LDS variable aliases any other, outside of the externally allocated hack where all of them alias every other externally allocated one. So I'm a little confused how more elaborate aliasing information helps.

edit: better to rebase on main than to merge; that gives a single commit with the content to look at.
No, it does not.
if (!NoAlias)
  return;
I wouldn't include this null check. Either make this a reference, or instead have the instruction as the argument to query the scope metadata
NoAlias is null-checked at line 131. Changing this here would require changing multiple other places unrelated to this patch.
for (const MDOperand &MDOp : NoAlias->operands())
  if (const MDNode *NAMD = dyn_cast<MDNode>(MDOp))
    if (const MDNode *Domain = AliasScopeNode(NAMD).getDomain())
      Domains.insert(Domain);
Does this really need to be a set? I would hope the verifier enforces these entries must be unique to begin with
Domains were already collected into a set prior to this change. This patch just outlines that code into a separate function so that it can be used in multiple places in the compiler.
auto Diff = set_difference(ExistingDomains, LDSDomains);
if (Diff.empty()) {
  NA = NA ? MDNode::intersect(NA, NoAlias) : NoAlias;
} else {
  NA = NA ? MDNode::concatenate(NA, NoAlias) : NoAlias;
}
I don't understand the set logic here. Why do you need to pre-compute a set difference? You're then using that to perform another set operation. Just perform the set operation directly in MDNode?
I could do something like this to be clearer:

auto Intersection = set_intersection(ExistingDomains, LDSDomains);
if (Intersection.empty()) {
  NA = NA ? MDNode::concatenate(NA, NoAlias) : NoAlias;
} else {
  NA = NA ? MDNode::intersect(NA, NoAlias) : NoAlias;
}

We only concatenate the existing noalias scopes with the LDS noalias scopes if they do not come from the same domain. The first set operation is on domains; depending on its result, we perform set operations on the alias sets, not on the domain sets.
if (Diff.empty()) {
  NA = NA ? MDNode::intersect(NA, NoAlias) : NoAlias;
} else {
  NA = NA ? MDNode::concatenate(NA, NoAlias) : NoAlias;
}
The code computes the existing domains, but the resulting NA computation does not depend on them: it still uses the same NA and does not really use Diff or ExistingDomains. It just replaces the intersection with a concatenation, not with the pre-existing info. What am I missing?
The idea is to do a set operation on domains first. Depending on its result, we perform set operations on the alias sets, in order to have correct alias sets on the loads/stores of the new elements of the LDS struct.

You are right that all the LDS variables go into different slots and do not alias each other. However, sometimes alias.scope or noalias metadata lingers from previous optimizations. In the presence of that metadata, the current lowering pass creates an empty set (see llvm/test/CodeGen/AMDGPU/lower-lds-struct-aa-merge.ll, line 51). This makes aliasing more conservative and adds aliasing between loads and stores where it does not need to. This patch attempts to remove that ambiguity and make the aliasing information more precise, using the following:

opt -S -aa-pipeline=basic-aa,scoped-noalias-aa -passes=aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output

I could possibly add more tests to show how the LDS lowering pass is making aliasing more conservative.
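As an illustration, here is a minimal, hypothetical check in that style (file contents and metadata numbering invented): running aa-eval with and without the lowering pass in the pipeline and comparing the number of MayAlias/ModRef results shows whether the merge lost information.

; RUN: opt -S -aa-pipeline=basic-aa,scoped-noalias-aa -passes=aa-eval \
; RUN:     -evaluate-aa-metadata -print-all-alias-modref-info -disable-output %s

define amdgpu_kernel void @check(ptr addrspace(3) %p, ptr addrspace(3) %q, i32 %v) {
  ; The store's !noalias list contains the load's scope, so scoped-noalias
  ; AA reports NoAlias for this pair; a merge that degrades either list to
  ; the empty !{} turns the pair back into MayAlias.
  %l = load i32, ptr addrspace(3) %p, align 4, !alias.scope !0
  store i32 %v, ptr addrspace(3) %q, align 4, !noalias !0
  ret void
}

!0 = !{!1}
!1 = distinct !{!1, !2}  ; scope, member of domain !2
!2 = distinct !{!2}      ; domain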
Yes, should have coverage showing the better / worse cases with merge
if (I)
  NoAliasList[I - 1] = AliasScopes[I - 1];
MDNode *NoAlias =
    NoAliasList.empty() ? nullptr : MDNode::get(Ctx, NoAliasList);
MDNode *AliasScope =
    AliasScopes.empty() ? nullptr : MDNode::get(Ctx, {AliasScopes[I]});
Unrelated whitespace change
A little more context seems to be needed here. Consider the following test:

All LDS variables have a scope of !3 and a domain of !4. Clearly they do not alias each other, and the following check would show that the LDS variables do not alias one another. After running the LDS lowering pass, this IR has the problem of being more conservative: the load of the LDS global a, which as a separate LDS entry did not alias the store into c, is now aliased. Similarly, the load from b is also now MayAlias. The reason is that the metadata !7 = !{} is the empty set, whereas the alias scopes !4 and !8 (from the loads) are both in domain !6. The current LDS lowering could not disambiguate the domains. This patch looks at the domains of all the metadata and produces more precise aliasing information, as shown below.
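That improvement is visible in the lower-lds-struct-aa-merge.ll hunk earlier in this diff; an annotated recap of its CHECK metadata (annotations mine):

; Before this patch, the merged noalias list on the accesses with
; pre-existing AA metadata collapsed to the empty list, i.e. no
; information at all:
;   !6 = !{}
; After this patch, each access keeps one scope from each domain --
; the pre-existing scopes and the LDS scopes are concatenated:
;   !6 = !{!7, !9}     ; !7 in domain !8, !9 in domain !10
;   !11 = !{!12, !13}  ; !12 in domain !8, !13 in domain !10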
Currently, if there is already noalias metadata present on loads and stores, the lower-module-LDS pass generates a more conservative aliasing set. This results in inhibiting scheduling intrinsics that would otherwise have generated a better-pipelined instruction. The fix is not to always intersect already existing noalias metadata with the noalias metadata created for LDS lowering, but to intersect only if the noalias scopes are from the same domain (e.g., in the case of a memcpy across LDS); otherwise, concatenate the existing noalias sets with the LDS noalias set.
LGTM
Currently, if there is already noalias metadata present on loads and stores, the lower-module-LDS pass generates a more conservative aliasing set. This results in inhibiting scheduling intrinsics that would otherwise have generated a better-pipelined instruction.

The fix is not to always intersect already existing noalias metadata with the noalias metadata created for LDS lowering, but to intersect only if the noalias scopes are from the same domain; otherwise, concatenate the existing noalias sets with the LDS noalias set.

A few patches touching scoped-AA have landed in the past; the following three should be enough background information:

https://reviews.llvm.org/D91576
https://reviews.llvm.org/D108315
https://reviews.llvm.org/D110049

Essentially, after a pass that might change aliasing info, one should check whether that pass changes the number of MayAlias or ModRef results, using the following:

opt -S -aa-pipeline=basic-aa,scoped-noalias-aa -passes=aa-eval -evaluate-aa-metadata -print-all-alias-modref-info -disable-output