[AMDGPU] Simplify use of hasMovrel and hasVGPRIndexMode #105680
Conversation
The generic subtarget has neither of these features. Rather than forcing HasMovrel on, it is simpler to expand dynamic vector indexing to a sequence of compare/select instructions. NFC for real subtargets.
@llvm/pr-subscribers-backend-amdgpu
Author: Jay Foad (jayfoad)
Changes: The generic subtarget has neither of these features. Rather than forcing HasMovrel on, it is simpler to expand dynamic vector indexing to a sequence of compare/select instructions. NFC for real subtargets.
Patch is 87.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/105680.diff
3 Files Affected:
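For illustration, a minimal scalar C++ analogue of the compare/select expansion (a hypothetical sketch; the real lowering operates on SelectionDAG nodes). It mirrors the chain of v_cmp/v_cndmask instructions visible in the GENERIC check lines of the updated test:

// Hypothetical scalar analogue of expanding a dynamic vector index into a
// chain of compares and selects: each step keeps the running result unless
// the index matches that element's position.
float extract_dyn(const float Vec[4], unsigned Idx) {
  float R = (Idx == 1) ? Vec[1] : Vec[0];
  R = (Idx == 2) ? Vec[2] : R;
  R = (Idx == 3) ? Vec[3] : R;
  return R;
}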
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index b3872a6374261b..352994e541fc88 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -143,14 +143,8 @@ GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
if (LDSBankCount == 0)
LDSBankCount = 32;
- if (TT.getArch() == Triple::amdgcn) {
- if (LocalMemorySize == 0)
- LocalMemorySize = 32768;
-
- // Do something sensible for unspecified target.
- if (!HasMovrel && !HasVGPRIndexMode)
- HasMovrel = true;
- }
+ if (TT.getArch() == Triple::amdgcn && LocalMemorySize == 0)
+ LocalMemorySize = 32768;
AddressableLocalMemorySize = LocalMemorySize;
@@ -366,7 +360,7 @@ bool GCNSubtarget::hasMadF16() const {
}
bool GCNSubtarget::useVGPRIndexMode() const {
- return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode());
+ return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
}
bool GCNSubtarget::useAA() const { return UseAA; }
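With neither feature set (the generic subtarget), the old form "!hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode())" evaluated to true even though VGPR index mode was absent, which is why HasMovrel had to be forced on. Restated as a free function with hypothetical parameter names, the new predicate only reports a mode the hardware actually has:

// Sketch of the updated predicate; returns false when neither feature is
// present, so callers fall back to the compare/select expansion.
bool useVGPRIndexMode(bool HasVGPRIndexMode, bool HasMovrel,
                      bool EnableVGPRIndexMode) {
  return HasVGPRIndexMode && (!HasMovrel || EnableVGPRIndexMode);
}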
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d02d0bbb52e567..c0312e082bf367 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -13354,12 +13354,15 @@ bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
// On some architectures (GFX9) movrel is not available and it's better
// to expand.
- if (!Subtarget->hasMovrel())
+ if (Subtarget->useVGPRIndexMode())
return NumInsts <= 16;
// If movrel is available, use it instead of expanding for vector of 8
// elements.
- return NumInsts <= 15;
+ if (Subtarget->hasMovrel())
+ return NumInsts <= 15;
+
+ return true;
}
bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
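Restated as a standalone helper (a sketch with hypothetical parameter names, not LLVM's API), the updated decision now covers all three feature combinations explicitly:

// Expand dynamic vector indexing to compare/select when it is cheap enough,
// or unconditionally when the subtarget has no indexing instruction at all.
bool shouldExpandVectorDynExt(bool UseVGPRIndexMode, bool HasMovrel,
                              unsigned NumInsts) {
  if (UseVGPRIndexMode)
    return NumInsts <= 16; // expansion preferred up to 16 instructions
  if (HasMovrel)
    return NumInsts <= 15; // movrel available: expand only small sequences
  return true;             // neither feature (generic subtarget): must expand
}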
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index c130eb04d02370..a33142fd0ab1f3 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,4 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GENERIC %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 < %s | FileCheck -check-prefix=NOOPT %s
; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI-MOVREL %s
; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=VI,VI-MOVREL %s
@@ -8,6 +9,75 @@
; Tests for indirect addressing on SI, which is implemented using dynamic
; indexing of vectors.
define amdgpu_kernel void @extract_w_offset(ptr addrspace(1) %out, i32 %in) {
+; GENERIC-LABEL: extract_w_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v2, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41500000
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41800000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s6, s4, 1
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 1
+; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 10
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 14
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_w_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -201,6 +271,65 @@ entry:
; XXX: Could do v_or_b32 directly
define amdgpu_kernel void @extract_w_offset_salu_use_vector(ptr addrspace(1) %out, i32 %in, <16 x i32> %or.val) {
+; GENERIC-LABEL: extract_w_offset_salu_use_vector:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dword s20, s[2:3], 0xb
+; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s20, s20, 1
+; GENERIC-NEXT: s_or_b32 s2, s19, 16
+; GENERIC-NEXT: s_or_b32 s18, s18, 15
+; GENERIC-NEXT: s_or_b32 s17, s17, 14
+; GENERIC-NEXT: s_or_b32 s16, s16, 13
+; GENERIC-NEXT: s_or_b32 s15, s15, 12
+; GENERIC-NEXT: s_or_b32 s14, s14, 11
+; GENERIC-NEXT: s_or_b32 s13, s13, 10
+; GENERIC-NEXT: s_or_b32 s12, s12, 9
+; GENERIC-NEXT: s_or_b32 s11, s11, 8
+; GENERIC-NEXT: s_or_b32 s10, s10, 7
+; GENERIC-NEXT: s_or_b32 s9, s9, 6
+; GENERIC-NEXT: s_or_b32 s8, s8, 5
+; GENERIC-NEXT: s_or_b32 s7, s7, 4
+; GENERIC-NEXT: s_or_b32 s6, s6, 3
+; GENERIC-NEXT: s_or_b32 s4, s4, 1
+; GENERIC-NEXT: s_or_b32 s5, s5, 2
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 1
+; GENERIC-NEXT: s_cselect_b32 s4, s5, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 2
+; GENERIC-NEXT: s_cselect_b32 s4, s6, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 3
+; GENERIC-NEXT: s_cselect_b32 s4, s7, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 4
+; GENERIC-NEXT: s_cselect_b32 s4, s8, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 5
+; GENERIC-NEXT: s_cselect_b32 s4, s9, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 6
+; GENERIC-NEXT: s_cselect_b32 s4, s10, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 7
+; GENERIC-NEXT: s_cselect_b32 s4, s11, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 8
+; GENERIC-NEXT: s_cselect_b32 s4, s12, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 9
+; GENERIC-NEXT: s_cselect_b32 s4, s13, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 10
+; GENERIC-NEXT: s_cselect_b32 s4, s14, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 11
+; GENERIC-NEXT: s_cselect_b32 s4, s15, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 12
+; GENERIC-NEXT: s_cselect_b32 s4, s16, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 13
+; GENERIC-NEXT: s_cselect_b32 s4, s17, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 14
+; GENERIC-NEXT: s_cselect_b32 s4, s18, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 15
+; GENERIC-NEXT: s_cselect_b32 s4, s2, s4
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_w_offset_salu_use_vector:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -494,6 +623,74 @@ entry:
}
define amdgpu_kernel void @extract_wo_offset(ptr addrspace(1) %out, i32 %in) {
+; GENERIC-LABEL: extract_wo_offset:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_load_dword s6, s[2:3], 0xb
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, 0x40400000
+; GENERIC-NEXT: v_mov_b32_e32 v1, 0x40a00000
+; GENERIC-NEXT: v_mov_b32_e32 v2, 0x40c00000
+; GENERIC-NEXT: v_mov_b32_e32 v3, 0x40e00000
+; GENERIC-NEXT: v_mov_b32_e32 v4, 0x41000000
+; GENERIC-NEXT: v_mov_b32_e32 v5, 0x41100000
+; GENERIC-NEXT: v_mov_b32_e32 v6, 0x41200000
+; GENERIC-NEXT: v_mov_b32_e32 v7, 0x41300000
+; GENERIC-NEXT: v_mov_b32_e32 v8, 0x41400000
+; GENERIC-NEXT: v_mov_b32_e32 v9, 0x41500000
+; GENERIC-NEXT: v_mov_b32_e32 v10, 0x41600000
+; GENERIC-NEXT: v_mov_b32_e32 v11, 0x41700000
+; GENERIC-NEXT: v_mov_b32_e32 v12, 0x41800000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_cmp_eq_u32 s6, 1
+; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e64 v13, 1.0, 2.0, s[4:5]
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 2
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v0, v13, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 3
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 4.0, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 4
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 5
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 6
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 7
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 8
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v5, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 9
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 10
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 11
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 12
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v9, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 13
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v10, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 14
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v11, v0, vcc
+; GENERIC-NEXT: s_cmp_lg_u32 s6, 15
+; GENERIC-NEXT: s_cselect_b64 vcc, -1, 0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, v12, v0, vcc
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_wo_offset:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -679,6 +876,50 @@ entry:
}
define amdgpu_kernel void @extract_neg_offset_sgpr(ptr addrspace(1) %out, i32 %offset) {
+; GENERIC-LABEL: extract_neg_offset_sgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dword s4, s[2:3], 0xb
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_add_i32 s2, s4, 0xfffffe00
+; GENERIC-NEXT: s_cmp_eq_u32 s2, 1
+; GENERIC-NEXT: s_cselect_b64 s[4:5], -1, 0
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 2
+; GENERIC-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GENERIC-NEXT: v_readfirstlane_b32 s4, v0
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 2
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 3
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 3
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 4
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 5
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 5
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 6
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 6
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 7
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 7
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 8
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 8
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 9
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 9
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 10
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 10
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 11
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 11
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 12
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 12
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 13
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 13
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 14
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 14
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 15
+; GENERIC-NEXT: s_cmp_lg_u32 s2, 15
+; GENERIC-NEXT: s_cselect_b32 s4, s4, 16
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_neg_offset_sgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -866,6 +1107,66 @@ entry:
}
define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(ptr addrspace(1) %out, <16 x i32> %vec0, <16 x i32> %vec1, i32 %offset) {
+; GENERIC-LABEL: extract_neg_offset_sgpr_loaded:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx16 s[4:19], s[2:3], 0x19
+; GENERIC-NEXT: s_load_dword s20, s[2:3], 0x39
+; GENERIC-NEXT: s_load_dwordx16 s[36:51], s[2:3], 0x29
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_addk_i32 s20, 0xfe00
+; GENERIC-NEXT: s_or_b32 s2, s19, s51
+; GENERIC-NEXT: s_or_b32 s18, s18, s50
+; GENERIC-NEXT: s_or_b32 s17, s17, s49
+; GENERIC-NEXT: s_or_b32 s16, s16, s48
+; GENERIC-NEXT: s_or_b32 s15, s15, s47
+; GENERIC-NEXT: s_or_b32 s14, s14, s46
+; GENERIC-NEXT: s_or_b32 s13, s13, s45
+; GENERIC-NEXT: s_or_b32 s12, s12, s44
+; GENERIC-NEXT: s_or_b32 s11, s11, s43
+; GENERIC-NEXT: s_or_b32 s10, s10, s42
+; GENERIC-NEXT: s_or_b32 s9, s9, s41
+; GENERIC-NEXT: s_or_b32 s8, s8, s40
+; GENERIC-NEXT: s_or_b32 s7, s7, s39
+; GENERIC-NEXT: s_or_b32 s6, s6, s38
+; GENERIC-NEXT: s_or_b32 s4, s4, s36
+; GENERIC-NEXT: s_or_b32 s5, s5, s37
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 1
+; GENERIC-NEXT: s_cselect_b32 s4, s5, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 2
+; GENERIC-NEXT: s_cselect_b32 s4, s6, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 3
+; GENERIC-NEXT: s_cselect_b32 s4, s7, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 4
+; GENERIC-NEXT: s_cselect_b32 s4, s8, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 5
+; GENERIC-NEXT: s_cselect_b32 s4, s9, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 6
+; GENERIC-NEXT: s_cselect_b32 s4, s10, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 7
+; GENERIC-NEXT: s_cselect_b32 s4, s11, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 8
+; GENERIC-NEXT: s_cselect_b32 s4, s12, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 9
+; GENERIC-NEXT: s_cselect_b32 s4, s13, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 10
+; GENERIC-NEXT: s_cselect_b32 s4, s14, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 11
+; GENERIC-NEXT: s_cselect_b32 s4, s15, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 12
+; GENERIC-NEXT: s_cselect_b32 s4, s16, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 13
+; GENERIC-NEXT: s_cselect_b32 s4, s17, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 14
+; GENERIC-NEXT: s_cselect_b32 s4, s18, s4
+; GENERIC-NEXT: s_cmp_eq_u32 s20, 15
+; GENERIC-NEXT: s_cselect_b32 s4, s2, s4
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_mov_b32_e32 v0, s4
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_neg_offset_sgpr_loaded:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
@@ -1161,6 +1462,46 @@ entry:
}
define amdgpu_kernel void @extract_neg_offset_vgpr(ptr addrspace(1) %out) {
+; GENERIC-LABEL: extract_neg_offset_vgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s3, 0xf000
+; GENERIC-NEXT: s_mov_b32 s2, -1
+; GENERIC-NEXT: v_add_i32_e32 v0, vcc, 0xfffffe00, v0
+; GENERIC-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GENERIC-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 2, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 3, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 3, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 4, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 5, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 5, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 6, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 6, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 7, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 7, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 8, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 8, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 9, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 9, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 10, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 10, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 11, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 11, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 12, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 12, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 13, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 13, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 14, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 14, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v1, 15, v1, vcc
+; GENERIC-NEXT: v_cmp_ne_u32_e32 vcc, 15, v0
+; GENERIC-NEXT: v_cndmask_b32_e32 v0, 16, v1, vcc
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: buffer_store_dword v0, off, s[0:3], 0
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_neg_offset_vgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_mov_b32 s20, SCRATCH_RSRC_DWORD0
@@ -1458,6 +1799,18 @@ entry:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @extract_undef_offset_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GENERIC-LABEL: extract_undef_offset_sgpr:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GENERIC-NEXT: s_mov_b32 s7, 0xf000
+; GENERIC-NEXT: s_mov_b32 s6, -1
+; GENERIC-NEXT: s_waitcnt lgkmcnt(0)
+; GENERIC-NEXT: s_mov_b32 s4, s2
+; GENERIC-NEXT: s_mov_b32 s5, s3
+; GENERIC-NEXT: buffer_load_dwordx4 v[0:3], off, s[4:7], 0 glc
+; GENERIC-NEXT: s_waitcnt vmcnt(0)
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: extract_undef_offset_sgpr:
; NOOPT: ; %bb.0: ; %entry
; NOOPT-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xb
@@ -1513,6 +1866,10 @@ entry:
; undefined behavior, but shouldn't crash compiler
define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(ptr addrspace(1) %out, ptr addrspace(1) %in) {
+; GENERIC-LABEL: insert_undef_offset_sgpr_vector_src:
+; GENERIC: ; %bb.0: ; %entry
+; GENERIC-NEXT: s_endpgm
+;
; NOOPT-LABEL: insert_undef_offset_sgpr_v...
[truncated]
LGTM