-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Correctly merge noalias scopes during lowering of LDS data. #131664
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -114,17 +114,26 @@ static void collectMDInDomain(const MDNode *List, const MDNode *Domain, | |
Nodes.insert(MD); | ||
} | ||
|
||
/// Collect the set of scoped domains relevant to the noalias scopes. | ||
void ScopedNoAliasAAResult::collectScopedDomains( | ||
const MDNode *NoAlias, SmallPtrSetImpl<const MDNode *> &Domains) const { | ||
if (!NoAlias) | ||
return; | ||
assert(Domains.empty() && "Domains should be empty"); | ||
for (const MDOperand &MDOp : NoAlias->operands()) | ||
if (const MDNode *NAMD = dyn_cast<MDNode>(MDOp)) | ||
if (const MDNode *Domain = AliasScopeNode(NAMD).getDomain()) | ||
Domains.insert(Domain); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this really need to be a set? I would hope the verifier enforces these entries must be unique to begin with There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Domain is defined as a set prior to this change. This patch just outlines this code into a different function such that it could be used in multiple places in compiler. |
||
} | ||
|
||
bool ScopedNoAliasAAResult::mayAliasInScopes(const MDNode *Scopes, | ||
const MDNode *NoAlias) const { | ||
if (!Scopes || !NoAlias) | ||
return true; | ||
|
||
// Collect the set of scope domains relevant to the noalias scopes. | ||
SmallPtrSet<const MDNode *, 16> Domains; | ||
for (const MDOperand &MDOp : NoAlias->operands()) | ||
if (const MDNode *NAMD = dyn_cast<MDNode>(MDOp)) | ||
if (const MDNode *Domain = AliasScopeNode(NAMD).getDomain()) | ||
Domains.insert(Domain); | ||
collectScopedDomains(NoAlias, Domains); | ||
|
||
// We alias unless, for some domain, the set of noalias scopes in that domain | ||
// is a superset of the set of alias scopes in that domain. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -186,6 +186,7 @@ | |
#include "llvm/ADT/STLExtras.h" | ||
#include "llvm/ADT/SetOperations.h" | ||
#include "llvm/Analysis/CallGraph.h" | ||
#include "llvm/Analysis/ScopedNoAliasAA.h" | ||
#include "llvm/CodeGen/TargetPassConfig.h" | ||
#include "llvm/IR/Constants.h" | ||
#include "llvm/IR/DerivedTypes.h" | ||
|
@@ -1441,6 +1442,8 @@ class AMDGPULowerModuleLDS { | |
if (!MaxDepth || (A == 1 && !AliasScope)) | ||
return; | ||
|
||
ScopedNoAliasAAResult ScopedNoAlias; | ||
|
||
for (User *U : Ptr->users()) { | ||
if (auto *I = dyn_cast<Instruction>(U)) { | ||
if (AliasScope && I->mayReadOrWriteMemory()) { | ||
|
@@ -1450,7 +1453,34 @@ class AMDGPULowerModuleLDS { | |
I->setMetadata(LLVMContext::MD_alias_scope, AS); | ||
|
||
MDNode *NA = I->getMetadata(LLVMContext::MD_noalias); | ||
NA = (NA ? MDNode::intersect(NA, NoAlias) : NoAlias); | ||
|
||
// Scoped aliases can originate from two different domains. | ||
// First domain would be from LDS domain (created by this pass). | ||
// All entries (LDS vars) into LDS struct will have same domain. | ||
|
||
// Second domain could be existing scoped aliases that are the | ||
// results of noalias params and subsequent optimizations that | ||
// may alter these sets. | ||
|
||
// We need to be careful how we create new alias sets, and | ||
// have right scopes and domains for loads/stores of these new | ||
// LDS variables. We intersect NoAlias set if alias sets belong | ||
// to the same domain. This is the case if we have memcpy using | ||
// LDS variables. Both src and dst of memcpy would belong to | ||
// LDS struct, they do not alias. | ||
// On the other hand, if one of the domains is LDS and other is | ||
// existing domain prior to LDS, we need to have a union of all | ||
// these aliases set to preserve existing aliasing information. | ||
|
||
SmallPtrSet<const MDNode *, 16> ExistingDomains, LDSDomains; | ||
ScopedNoAlias.collectScopedDomains(NA, ExistingDomains); | ||
ScopedNoAlias.collectScopedDomains(NoAlias, LDSDomains); | ||
auto Intersection = set_intersection(ExistingDomains, LDSDomains); | ||
if (Intersection.empty()) { | ||
NA = NA ? MDNode::concatenate(NA, NoAlias) : NoAlias; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The code computes existing domains, but then resulting NA computation does not depend on it, it still uses the same NA and does not really use Diff or ExistingDomains. It is just replacing intersection with the concatenation, but not with the pre-existing info. What am I missing? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The idea is the set operation on domains. Depending on the result of set operation of domain, we perform set operations on alias set in order to have correct alias sets on loads/store of new elements of LDS structs. |
||
} else { | ||
NA = NA ? MDNode::intersect(NA, NoAlias) : NoAlias; | ||
} | ||
I->setMetadata(LLVMContext::MD_noalias, NA); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s | ||
|
||
@a = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 | ||
@b = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 | ||
@c = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 | ||
|
||
define amdgpu_kernel void @ds_load_stores_aainfo(ptr addrspace(1) %arg, i32 %i) { | ||
; GCN-LABEL: ds_load_stores_aainfo: | ||
; GCN: ; %bb.0: ; %bb | ||
; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c | ||
; GCN-NEXT: v_mov_b32_e32 v0, 1 | ||
; GCN-NEXT: v_mov_b32_e32 v1, 0 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_lshl_b32 s0, s0, 2 | ||
; GCN-NEXT: v_mov_b32_e32 v4, s0 | ||
; GCN-NEXT: ds_read2_b32 v[2:3], v4 offset1:1 | ||
; GCN-NEXT: ds_write_b64 v1, v[0:1] offset:512 | ||
; GCN-NEXT: ds_read2_b32 v[4:5], v4 offset0:64 offset1:65 | ||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 | ||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) | ||
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) | ||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 | ||
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc | ||
; GCN-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] | ||
; GCN-NEXT: s_endpgm | ||
bb: | ||
%gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i | ||
%gep.b = getelementptr inbounds [64 x i32], ptr addrspace(3) @b, i32 0, i32 %i | ||
|
||
%val.a = load i64, ptr addrspace(3) %gep.a, align 4, !tbaa !0, !alias.scope !6, !noalias !5 | ||
%val.b = load i64, ptr addrspace(3) %gep.b, align 4, !tbaa !0, !alias.scope !6, !noalias !5 | ||
|
||
store i64 1, ptr addrspace(3) @c, align 4, !tbaa !0, !noalias !2 | ||
|
||
%val = add i64 %val.a, %val.b | ||
store i64 %val, ptr addrspace(1) %arg, align 4 | ||
|
||
tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) | ||
tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) | ||
tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) | ||
ret void | ||
} | ||
|
||
!0 = !{!"omnipotent char", !1, i64 0} | ||
!1 = !{!1} | ||
!2 = !{!3} | ||
!3 = distinct !{!3, !4} | ||
!4 = distinct !{!4} | ||
!5 = !{!3} | ||
!6 = !{!7} | ||
!7 = !{!7, !4} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 --amdgpu-lower-module-lds-strategy=module < %s | FileCheck -check-prefix=GCN %s | ||
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s | ||
; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s | ||
|
||
@a = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 | ||
@b = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 | ||
@c = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4 | ||
|
||
define amdgpu_kernel void @ds_load_stores_aainfo(ptr addrspace(1) %arg, i32 %i) { | ||
; GCN-LABEL: ds_load_stores_aainfo: | ||
; GCN: ; %bb.0: ; %bb | ||
; GCN-NEXT: s_load_dword s0, s[4:5], 0x2c | ||
; GCN-NEXT: v_mov_b32_e32 v0, 1 | ||
; GCN-NEXT: v_mov_b32_e32 v1, 0 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_lshl_b32 s0, s0, 2 | ||
; GCN-NEXT: v_mov_b32_e32 v4, s0 | ||
; GCN-NEXT: ds_read2_b32 v[2:3], v4 offset1:1 | ||
; GCN-NEXT: ds_write_b64 v1, v[0:1] offset:512 | ||
; GCN-NEXT: ds_read2_b32 v[4:5], v4 offset0:64 offset1:65 | ||
; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 | ||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) | ||
; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0) | ||
; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0) | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 | ||
; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc | ||
; GCN-NEXT: global_store_dwordx2 v1, v[2:3], s[0:1] | ||
; GCN-NEXT: s_endpgm | ||
; CHECK-LABEL: define amdgpu_kernel void @ds_load_stores_aainfo( | ||
; CHECK-SAME: ptr addrspace(1) [[ARG:%.*]], i32 [[I:%.*]]) #[[ATTR0:[0-9]+]] { | ||
; CHECK-NEXT: [[BB:.*:]] | ||
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 [[I]] | ||
; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds [64 x i32], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_DS_LOAD_STORES_AAINFO_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 1), i32 0, i32 [[I]] | ||
; CHECK-NEXT: [[VAL_A:%.*]] = load i64, ptr addrspace(3) [[GEP_A]], align 4, !tbaa [[TBAA1:![0-9]+]], !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] | ||
; CHECK-NEXT: [[VAL_B:%.*]] = load i64, ptr addrspace(3) [[GEP_B]], align 4, !tbaa [[TBAA1]], !alias.scope [[META12:![0-9]+]], !noalias [[META13:![0-9]+]] | ||
; CHECK-NEXT: store i64 1, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_KERNEL_DS_LOAD_STORES_AAINFO_LDS_T]], ptr addrspace(3) @llvm.amdgcn.kernel.ds_load_stores_aainfo.lds, i32 0, i32 2), align 16, !tbaa [[TBAA1]], !alias.scope [[META14:![0-9]+]], !noalias [[META15:![0-9]+]] | ||
; CHECK-NEXT: [[VAL:%.*]] = add i64 [[VAL_A]], [[VAL_B]] | ||
; CHECK-NEXT: store i64 [[VAL]], ptr addrspace(1) [[ARG]], align 4 | ||
; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) | ||
; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) | ||
; CHECK-NEXT: tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) | ||
; CHECK-NEXT: ret void | ||
; | ||
bb: | ||
%gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i | ||
%gep.b = getelementptr inbounds [64 x i32], ptr addrspace(3) @b, i32 0, i32 %i | ||
|
||
%val.a = load i64, ptr addrspace(3) %gep.a, align 4, !tbaa !0, !noalias !5 | ||
%val.b = load i64, ptr addrspace(3) %gep.b, align 4, !tbaa !0, !noalias !5 | ||
|
||
store i64 1, ptr addrspace(3) @c, align 4, !tbaa !0, !noalias !2 | ||
|
||
%val = add i64 %val.a, %val.b | ||
store i64 %val, ptr addrspace(1) %arg, align 4 | ||
|
||
tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) | ||
tail call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0) | ||
tail call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0) | ||
ret void | ||
} | ||
|
||
!0 = !{!"omnipotent char", !1, i64 0} | ||
!1 = !{!1} | ||
!2 = !{!3} | ||
!3 = distinct !{!3, !4} | ||
!4 = distinct !{!4} | ||
!5 = !{!3} | ||
;. | ||
; CHECK: [[TBAA1]] = !{[[META2:![0-9]+]], [[META2]], i64 0, i64 0} | ||
; CHECK: [[META2]] = !{!"omnipotent char", [[META3:![0-9]+]]} | ||
; CHECK: [[META3]] = distinct !{[[META3]]} | ||
; CHECK: [[META4]] = !{[[META5:![0-9]+]]} | ||
; CHECK: [[META5]] = distinct !{[[META5]], [[META6:![0-9]+]]} | ||
; CHECK: [[META6]] = distinct !{[[META6]]} | ||
; CHECK: [[META7]] = !{[[META8:![0-9]+]], [[META10:![0-9]+]], [[META11:![0-9]+]]} | ||
; CHECK: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]]} | ||
; CHECK: [[META9]] = distinct !{[[META9]]} | ||
; CHECK: [[META10]] = distinct !{[[META10]], [[META6]]} | ||
; CHECK: [[META11]] = distinct !{[[META11]], [[META6]]} | ||
; CHECK: [[META12]] = !{[[META10]]} | ||
; CHECK: [[META13]] = !{[[META8]], [[META5]], [[META11]]} | ||
; CHECK: [[META14]] = !{[[META11]]} | ||
; CHECK: [[META15]] = !{[[META8]], [[META5]], [[META10]]} | ||
;. |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wouldn't include this null check. Either make this a reference, or instead have the instruction as the argument to query the scope metadata
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
NoAlias is checked as a null check in line 131. Changing this here would require to change multiple other places unrelated to this patch.