AMDGPU: Remove the s_buffer workaround for GFX9 chips

Marek Olsak · Marek Olsak · commit ea06ecf3436e · 2018-02-07T16:00:40.000Z
Summary: I checked the AMD closed source compiler and the workaround is only needed when x3 is emulated as x4, which we don't do in LLVM. SMEM x3 opcodes don't exist, and instead there is a possibility to use x4 with the last component being unused. If the last component is out of buffer bounds and falls on the next 4K page, the hw hangs. Reviewers: arsenm, nhaehnle Subscribers: kzhuravl, wdng, yaxunl, dstuttard, tpr, llvm-commits, t-tye Differential Revision: https://reviews.llvm.org/D42756 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@324486 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -333,14 +333,6 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {
     return HasMadMixInsts;
   }
 
-  bool hasSBufferLoadStoreAtomicDwordxN() const {
-    // Only use the "x1" variants on GFX9 or don't use the buffer variants.
-    // For x2 and higher variants, if the accessed region spans 2 VM pages and
-    // the second page is unmapped, the hw hangs.
-    // TODO: There is one future GFX9 chip that doesn't have this bug.
-    return getGeneration() != GFX9;
-  }
-
   bool hasCARRY() const {
     return (getGeneration() >= EVERGREEN);
   }
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -853,9 +853,8 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
 
       continue;
     }
-    if (STM->hasSBufferLoadStoreAtomicDwordxN() &&
-        (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
-         Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM)) {
+    if (Opc == AMDGPU::S_BUFFER_LOAD_DWORD_IMM ||
+        Opc == AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM) {
       // EltSize is in units of the offset encoding.
       CI.InstClass = S_BUFFER_LOAD_IMM;
       CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
@@ -217,14 +217,8 @@ main_body:
 ; GCN-NEXT: %bb.
 ; SICI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1
 ; SICI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x7
-; VI-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
-; VI-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
-; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
-; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
-; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
-; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
-; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
-; GFX9-NEXT: s_buffer_load_dword s{{[0-9]}}
+; VIGFX9-NEXT: s_buffer_load_dwordx4 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x4
+; VIGFX9-NEXT: s_buffer_load_dwordx2 s[{{[0-9]}}:{{[0-9]}}], s[0:3], 0x1c
 define amdgpu_ps void @smrd_imm_merged(<4 x i32> inreg %desc) #0 {
 main_body:
   %r1 = call float @llvm.SI.load.const.v4i32(<4 x i32> %desc, i32 4)

Original file line number	Diff line number	Diff line change
`@@ -333,14 +333,6 @@ class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo {`
`333`	`333`	`return HasMadMixInsts;`
`334`	`334`	`}`
`335`	`335`
`336`		`- bool hasSBufferLoadStoreAtomicDwordxN() const {`
`337`		`- // Only use the "x1" variants on GFX9 or don't use the buffer variants.`
`338`		`- // For x2 and higher variants, if the accessed region spans 2 VM pages and`
`339`		`- // the second page is unmapped, the hw hangs.`
`340`		`- // TODO: There is one future GFX9 chip that doesn't have this bug.`
`341`		`- return getGeneration() != GFX9;`
`342`		`- }`
`343`		`-`
`344`	`336`	`bool hasCARRY() const {`
`345`	`337`	`return (getGeneration() >= EVERGREEN);`
`346`	`338`	`}`