-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[llvm][AMDGPU] Enable FWD_PROGRESS bit for GFX10+ #128367
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-llvm-binary-utilities @llvm/pr-subscribers-backend-amdgpu Author: Alex Voicu (AlexVlx) ChangesFrom GFX10 onwards it is possible to employ benevolent scheduling of waves. This patch unconditionally enables the bit which controls that capability, as it is beneficial for algorithms that rely on more complex concurrent coordination and it is generally performance neutral otherwise. Patch is 41.84 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128367.diff 14 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index a8d0bb746d2ef..7aae0cb69f10a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -1196,6 +1196,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
if (getIsaVersion(getGlobalSTI()->getCPU()).Major >= 10) {
ProgInfo.WgpMode = STM.isCuModeEnabled() ? 0 : 1;
ProgInfo.MemOrdered = 1;
+ ProgInfo.FwdProgress = 1;
}
// 0 = X, 1 = XY, 2 = XYZ
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp
index 14b3cdf37650c..b467dbb2cd519 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCKernelDescriptor.cpp
@@ -70,6 +70,11 @@ MCKernelDescriptor::getDefaultAmdhsaKernelDescriptor(const MCSubtargetInfo *STI,
KD.compute_pgm_rsrc1, OneMCExpr,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED_SHIFT,
amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Ctx);
+
+ MCKernelDescriptor::bits_set(
+ KD.compute_pgm_rsrc1, OneMCExpr,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS_SHIFT,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Ctx);
}
if (AMDGPU::isGFX90A(*STI) && STI->getFeatureBits().test(FeatureTgSplit))
MCKernelDescriptor::bits_set(
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index b4d740422b94a..2fbb6283c28d3 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -39,6 +39,7 @@ void SIProgramInfo::reset(const MachineFunction &MF) {
IEEEMode = 0;
WgpMode = 0;
MemOrdered = 0;
+ FwdProgress = 0;
RrWgMode = 0;
ScratchSize = ZeroExpr;
@@ -84,7 +85,8 @@ static uint64_t getComputePGMRSrc1Reg(const SIProgramInfo &ProgInfo,
S_00B848_PRIV(ProgInfo.Priv) |
S_00B848_DEBUG_MODE(ProgInfo.DebugMode) |
S_00B848_WGP_MODE(ProgInfo.WgpMode) |
- S_00B848_MEM_ORDERED(ProgInfo.MemOrdered);
+ S_00B848_MEM_ORDERED(ProgInfo.MemOrdered) |
+ S_00B848_FWD_PROGRESS(ProgInfo.FwdProgress);
if (ST.hasDX10ClampMode())
Reg |= S_00B848_DX10_CLAMP(ProgInfo.DX10Clamp);
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index d7087436ae758..2b3dfc9efd057 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -41,9 +41,10 @@ struct LLVM_EXTERNAL_VISIBILITY SIProgramInfo {
uint32_t DX10Clamp = 0;
uint32_t DebugMode = 0;
uint32_t IEEEMode = 0;
- uint32_t WgpMode = 0; // GFX10+
- uint32_t MemOrdered = 0; // GFX10+
- uint32_t RrWgMode = 0; // GFX12+
+ uint32_t WgpMode = 0; // GFX10+
+ uint32_t MemOrdered = 0; // GFX10+
+ uint32_t FwdProgress = 0; // GFX10+
+ uint32_t RrWgMode = 0; // GFX12+
const MCExpr *ScratchSize = nullptr;
// State used to calculate fields set in PGM_RSRC2 pm4 packet.
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c521d0dd3ad2d..0d636655202a9 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1319,7 +1319,7 @@ void initDefaultAMDKernelCodeT(AMDGPUMCKernelCodeT &KernelCode,
if (Version.Major >= 10) {
KernelCode.compute_pgm_resource_registers |=
S_00B848_WGP_MODE(STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1) |
- S_00B848_MEM_ORDERED(1);
+ S_00B848_MEM_ORDERED(1) | S_00B848_FWD_PROGRESS(1);
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index c136028f2de43..870520d820878 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3208,7 +3208,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 12
; GFX10-NEXT: enable_trap_handler = 0
@@ -3300,7 +3300,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
@@ -4209,7 +4209,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 12
; GFX10-NEXT: enable_trap_handler = 0
@@ -4294,7 +4294,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
@@ -4560,7 +4560,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_ieee_mode = 1
; GFX10-NEXT: enable_wgp_mode = 1
; GFX10-NEXT: enable_mem_ordered = 1
-; GFX10-NEXT: enable_fwd_progress = 0
+; GFX10-NEXT: enable_fwd_progress = 1
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX10-NEXT: user_sgpr_count = 12
; GFX10-NEXT: enable_trap_handler = 0
@@ -4648,7 +4648,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX11-NEXT: enable_ieee_mode = 1
; GFX11-NEXT: enable_wgp_mode = 1
; GFX11-NEXT: enable_mem_ordered = 1
-; GFX11-NEXT: enable_fwd_progress = 0
+; GFX11-NEXT: enable_fwd_progress = 1
; GFX11-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
; GFX11-NEXT: user_sgpr_count = 13
; GFX11-NEXT: enable_trap_handler = 0
diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
index d4826a22db795..6044f6e354ee0 100644
--- a/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdpal-msgpack-ieee.ll
@@ -7,7 +7,7 @@
; SI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
; VI-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf02c0{{$}}
; GFX9-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xaf0000{{$}}
-; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0x600f0000{{$}}
+; GFX12-DAG: '0x2e12 (COMPUTE_PGM_RSRC1)': 0xe00f0000{{$}}
define amdgpu_cs half @cs_amdpal(half %arg0) #0 {
%add = fadd half %arg0, 1.0
ret half %add
diff --git a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
index e90a976008229..1ad2510422f76 100644
--- a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
+++ b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s
@@ -29,7 +29,7 @@
// OBJDUMP-NEXT: 0000 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 0010 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 0020 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 0030 00000c60 80000000 00040000 00000000
+// OBJDUMP-NEXT: 0030 00000ce0 80000000 00040000 00000000
// complete
// OBJDUMP-NEXT: 0040 01000000 01000000 08000000 00000000
// OBJDUMP-NEXT: 0050 00000000 00000000 00000000 00000000
@@ -39,12 +39,12 @@
// OBJDUMP-NEXT: 0080 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 0090 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 00a0 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 00b0 00000060 80000000 00040000 00000000
+// OBJDUMP-NEXT: 00b0 000000e0 80000000 00040000 00000000
// disabled_user_sgpr
// OBJDUMP-NEXT: 00c0 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 00d0 00000000 00000000 00000000 00000000
// OBJDUMP-NEXT: 00e0 00000000 00000000 00000000 00000000
-// OBJDUMP-NEXT: 00f0 00000c60 80000000 00040000 00000000
+// OBJDUMP-NEXT: 00f0 00000ce0 80000000 00040000 00000000
.text
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
index bec717e4137df..7f5240d649b7f 100644
--- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx10.s
@@ -126,16 +126,16 @@ expr_defined:
// ASM-NEXT: .amdhsa_reserve_vcc defined_boolean
// ASM-NEXT: .amdhsa_reserve_flat_scratch defined_boolean
// ASM-NEXT: .amdhsa_reserve_xnack_mask 1
-// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12
-// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14
-// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16
-// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18
-// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21
-// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23
-// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26
-// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29
-// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30
-// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31
+// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12
+// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14
+// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16
+// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&786432)>>18
+// ASM-NEXT: .amdhsa_dx10_clamp (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2097152)>>21
+// ASM-NEXT: .amdhsa_ieee_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&8388608)>>23
+// ASM-NEXT: .amdhsa_fp16_overflow (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&67108864)>>26
+// ASM-NEXT: .amdhsa_workgroup_processor_mode (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&536870912)>>29
+// ASM-NEXT: .amdhsa_memory_ordered (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&1073741824)>>30
+// ASM-NEXT: .amdhsa_forward_progress (((((((((((((((((((3769368576|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&2147483648)>>31
// ASM-NEXT: .amdhsa_shared_vgpr_count 0
// ASM-NEXT: .amdhsa_exception_fp_ieee_invalid_op (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&16777216)>>24
// ASM-NEXT: .amdhsa_exception_fp_denorm_src (((((((((((((((((((((((((128|(defined_2_bits<<11))&(~128))|(defined_boolean<<7))&(~256))|(defined_boolean<<8))&(~512))|(defined_boolean<<9))&(~1024))|(defined_boolean<<10))&(~16777216))|(defined_boolean<<24))&(~33554432))|(defined_boolean<<25))&(~67108864))|(defined_boolean<<26))&(~134217728))|(defined_boolean<<27))&(~268435456))|(defined_boolean<<28))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~62))&33554432)>>25
diff --git a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
index 68cf28f2ac49d..b5b8a58b09a7f 100644
--- a/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
+++ b/llvm/test/MC/AMDGPU/hsa-sym-exprs-gfx11.s
@@ -122,16 +122,16 @@ expr_defined:
// ASM-NEXT: .amdhsa_next_free_vgpr defined_value+4
// ASM-NEXT: .amdhsa_next_free_sgpr defined_value+5
// ASM-NEXT: .amdhsa_reserve_vcc defined_boolean
-// ASM-NEXT: .amdhsa_float_round_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&12288)>>12
-// ASM-NEXT: .amdhsa_float_round_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&49152)>>14
-// ASM-NEXT: .amdhsa_float_denorm_mode_32 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_boolean<<29))&(~1073741824))|(defined_boolean<<30))&(~2147483648))|(defined_boolean<<31))&(~63))|(((alignto(max(defined_value+4, 1), 8))/8)-1))&(~960))&196608)>>16
-// ASM-NEXT: .amdhsa_float_denorm_mode_16_64 (((((((((((((((((((1621884928|(defined_2_bits<<12))&(~49152))|(defined_2_bits<<14))&(~196608))|(defined_2_bits<<16))&(~786432))|(defined_2_bits<<18))&(~67108864))|(defined_boolean<<26))&(~536870912))|(defined_bo...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should add a note to the release notes and mention this in AMDGPUUsage
Will do, thanks; re: AMDGPUUsage, are you thinking about anything other than updating https://llvm.org/docs/AMDGPUUsage.html#amdhsa-kernel-directives-table to reflect the new default? The bit itself is already documented (kindof). |
Does this scheduling work with gfx942 and gfx950 as well? |
There's both the list in the assembler, and then the description in https://llvm.org/docs/AMDGPUUsage.html#amdgpu-amdhsa-compute-pgm-rsrc1-gfx6-gfx12-table |
I don't believe this bit exists on either |
This patch only does it for HSA, right? Could you make that clear in the description please? |
We should universally enable it |
We will not at this stage, there isn't enough prelim test coverage for PAL. |
Sure, thanks. Although, now that you mention it, this is actually accidentally enabling it for PAL as well, see https://github.com/llvm/llvm-project/pull/128367/files#diff-d051a060290eb246b00d46eb788c6377afd18a5c4c86772b6632c2bab5063c52L180. That's re-using |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
With routine performance testing, I don't see we need to specifically pre-check this the potential impact of this case on graphics. Any random change could cause a regression and be noticed, not sure why this case is special. But OK for now I suppose, it's not great for providing a consistent platform. We have too many asterisks on answers for "does amdgpu support X style questions"
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this scheduling work with gfx942 and gfx950 as well?
No, this is GFX10+ only.
From GFX10 onwards it is possible to employ benevolent scheduling of waves. This patch unconditionally enables, for the
amdhsa
OS, the bit which controls that capability, as it is beneficial for algorithms that rely on more complex concurrent coordination and it is generally performance neutral otherwise.