AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics #107291

arsenm · 2024-09-04T19:15:01Z

Trying to codegen these on targets without the instructions should
fail to select. Not sure if all the predicates are correct. The old
predicate was a fake one, not connected to any subtarget feature, so it was always true.

Fixes: SWDEV-482274

arsenm · 2024-09-04T19:15:17Z

AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics #107291 👈
main

This stack of pull requests is managed by Graphite. Learn more about stacking.

Join @arsenm and the rest of your teammates on Graphite

llvmbot · 2024-09-04T19:15:33Z

@llvm/pr-subscribers-backend-amdgpu

Author: Matt Arsenault (arsenm)

Changes

Trying to codegen these on targets without the instructions should
fail to select. Not sure if all the predicates are correct. The old
predicate was a fake one, not connected to any subtarget feature, so it was always true.

Fixes: SWDEV-482274

Full diff: https://github.com/llvm/llvm-project/pull/107291.diff

4 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPU.td (+9-2)
(modified) llvm/lib/Target/AMDGPU/GCNSubtarget.h (+1)
(modified) llvm/lib/Target/AMDGPU/VOP1Instructions.td (+7-4)
(added) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll (+100)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 5757ac0d4454d0..248e3ba3e32c56 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -685,6 +685,13 @@ def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts",
   "Has fp8 and bf8 conversion instructions"
 >;
 
+def FeatureCvtFP8VOP1Bug : SubtargetFeature<"cvt-fp8-vop1-bug",
+  "HasCvtFP8Vop1Bug",
+  "true",
+  "FP8/BF8 VOP1 form of conversion to F32 is unreliable",
+  [FeatureFP8ConversionInsts]
+>;
+
 def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
   "HasPkFmacF16Inst",
   "true",
@@ -1444,7 +1451,7 @@ def FeatureISAVersion9_4_Common : FeatureSet<
    FeaturePackedFP32Ops,
    FeatureMAIInsts,
    FeatureFP8Insts,
-   FeatureFP8ConversionInsts,
+   FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug,
    FeaturePkFmacF16Inst,
    FeatureAtomicFaddRtnInsts,
    FeatureAtomicFaddNoRtnInsts,
@@ -1657,7 +1664,7 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureFlatAtomicFaddF32Inst,
    FeatureImageInsts,
    FeatureExtendedImageInsts,
-   FeatureFP8ConversionInsts,
+   FeatureFP8ConversionInsts, FeatureCvtFP8VOP1Bug,
    FeaturePackedTID,
    FeatureVcmpxPermlaneHazard,
    FeatureSALUFloatInsts,
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 7b74eab96c5677..8a909ad7be26f6 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -158,6 +158,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasMAIInsts = false;
   bool HasFP8Insts = false;
   bool HasFP8ConversionInsts = false;
+  bool HasCvtFP8Vop1Bug = false;
   bool HasPkFmacF16Inst = false;
   bool HasAtomicFMinFMaxF32GlobalInsts = false;
   bool HasAtomicFMinFMaxF64GlobalInsts = false;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 03e4cb9fcf49b7..bc86dc7b94dec7 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -599,8 +599,8 @@ class Cvt_F32_F8_Pat<SDPatternOperator node, int index,
     (inst_sdwa 0, $src, 0, 0, index)
 >;
 
-let SubtargetPredicate = isGFX9Only in {
-let OtherPredicates = [HasCvtFP8VOP1Bug] in {
+let SubtargetPredicate = HasFP8ConversionInsts in {
+let OtherPredicates = [HasCvtFP8VOP1Bug, HasSDWA] in {
   def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)),
                (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>;
   def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)),
@@ -614,11 +614,14 @@ let OtherPredicates = [HasNoCvtFP8VOP1Bug] in {
                (V_CVT_F32_BF8_e32 $src)>;
 }
 
+let OtherPredicates = [HasSDWA] in {
 foreach Index = [1, 2, 3] in {
   def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index, V_CVT_F32_FP8_sdwa>;
   def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index, V_CVT_F32_BF8_sdwa>;
 }
-} // End SubtargetPredicate = isGFX9Only
+} // End OtherPredicates = [HasSDWA]
+
+} // End SubtargetPredicate = HasFP8ConversionInsts
 
 class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
     VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
@@ -628,7 +631,7 @@ class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
          (inst_e32 $src))
 >;
 
-let SubtargetPredicate = isGFX9Only in {
+let SubtargetPredicate = HasFP8ConversionInsts, OtherPredicates = [HasSDWA] in {
   foreach Index = [0, -1] in {
     def : Cvt_PK_F32_F8_Pat<int_amdgcn_cvt_pk_f32_fp8, Index,
                             V_CVT_PK_F32_FP8_e32, V_CVT_PK_F32_FP8_sdwa>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll
new file mode 100644
index 00000000000000..29812993d541e2
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f32.fp8.err.ll
@@ -0,0 +1,100 @@
+; RUN: split-file %s %t
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR %s
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR %s
+
+
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR-GISEL %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR-GISEL %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR-GISEL %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR-GISEL %s
+
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE0-ERR-GISEL %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/fp8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-FP8-BYTE1-ERR-GISEL %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte0-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE0-ERR-GISEL %s
+; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/bf8-byte1-err.ll 2>&1 | FileCheck -check-prefix=ERR-BF8-BYTE1-ERR-GISEL %s
+
+
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-fp8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD0-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-fp8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD1-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-bf8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD0-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx908 -filetype=null %t/pk-bf8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD1-ERR %s
+
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-fp8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD0-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-fp8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-FP8-WORD1-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-bf8-word0-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD0-ERR %s
+; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -filetype=null %t/pk-bf8-word1-err.ll 2>&1 | FileCheck -check-prefix=ERR-PK-BF8-WORD1-ERR %s
+
+
+;--- fp8-byte0-err.ll
+; ERR-FP8-BYTE0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.fp8
+; ERR-FP8-BYTE0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.fp8), %{{[0-9]+}}:vgpr(s32), 0
+
+define float @test_cvt_f32_fp8_byte0(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0)
+  ret float %ret
+}
+
+;--- fp8-byte1-err.ll
+; ERR-FP8-BYTE1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.fp8
+; ERR-FP8-BYTE1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.fp8), %{{[0-9]+}}:vgpr(s32), 1
+define float @test_cvt_f32_fp8_byte1(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
+  ret float %ret
+}
+
+;--- bf8-byte0-err.ll
+; ERR-BF8-BYTE0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.bf8
+; ERR-BF8-BYTE0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.bf8), %{{[0-9]+}}:vgpr(s32), 0
+define float @test_cvt_f32_bf8_byte0(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
+  ret float %ret
+}
+
+;--- bf8-byte1-err.ll
+; ERR-BF8-BYTE1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.f32.bf8
+; ERR-BF8-BYTE1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.f32.bf8), %{{[0-9]+}}:vgpr(s32), 1
+define float @test_cvt_f32_bf8_byte1(i32 %a) {
+  %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
+  ret float %ret
+}
+
+;--- pk-fp8-word0-err.ll
+; ERR-PK-FP8-WORD0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.fp8
+; ERR-PK-FP8-WORD0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.fp8), %{{[0-9]+}}:vgpr(s32), 0
+define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) {
+  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false)
+  ret <2 x float> %ret
+}
+
+;--- pk-fp8-word1-err.ll
+; ERR-PK-FP8-WORD1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.fp8
+; ERR-PK-FP8-WORD1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.fp8), %{{[0-9]+}}:vgpr(s32), 1
+define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) {
+  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
+  ret <2 x float> %ret
+}
+
+;--- pk-bf8-word0-err.ll
+; ERR-PK-BF8-WORD0-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.bf8
+; ERR-PK-BF8-WORD0-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.bf8), %{{[0-9]+}}:vgpr(s32), 0
+define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) {
+  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
+  ret <2 x float> %ret
+}
+
+;--- pk-bf8-word1-err.ll
+; ERR-PK-BF8-WORD1-ERR: LLVM ERROR: Cannot select: intrinsic %llvm.amdgcn.cvt.pk.f32.bf8
+; ERR-PK-BF8-WORD1-ERR-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.cvt.pk.f32.bf8), %{{[0-9]+}}:vgpr(s32), 1
+define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) {
+  %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true)
+  ret <2 x float> %ret
+}

llvm/lib/Target/AMDGPU/AMDGPU.td

llvm/lib/Target/AMDGPU/VOP1Instructions.td

krzysz00

Seems fine to me, but I can't approve this because I don't know the hardware context well enough to be sure it is correct

llvm/lib/Target/AMDGPU/AMDGPU.td

Trying to codegen these on targets without the instructions should fail to select. Not sure if all the predicates are correct. The old predicate was a fake one, not connected to any subtarget feature, so it was always true. Fixes: SWDEV-482274

…uiring the op_sel form

arsenm · 2024-10-09T06:01:10Z

ping

rampitec

LGTM

…#107291) Trying to codegen these on targets without the instructions should fail to select. Not sure if all the predicates are correct. We had a fake one disconnected to a feature which was always true. Fixes: SWDEV-482274

arsenm added the backend:AMDGPU label Sep 4, 2024 — with Graphite App

arsenm requested review from jayfoad, kosarev, mariusz-sikora-at-amd, rampitec, Sisyph and krzysz00 September 4, 2024 19:15

arsenm marked this pull request as ready for review September 4, 2024 19:16

rampitec reviewed Sep 4, 2024

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPU.td Outdated Show resolved Hide resolved

llvm/lib/Target/AMDGPU/AMDGPU.td Outdated Show resolved Hide resolved

rampitec reviewed Sep 4, 2024

View reviewed changes

llvm/lib/Target/AMDGPU/VOP1Instructions.td Show resolved Hide resolved

krzysz00 reviewed Sep 5, 2024

View reviewed changes

arsenm force-pushed the users/arsenm/amdgpu-fix-wrong-predicates-fp8-conversion-intrinsics branch from 2d3ae1b to 394a284 Compare September 15, 2024 11:26

rampitec reviewed Sep 16, 2024

View reviewed changes

llvm/lib/Target/AMDGPU/AMDGPU.td Outdated Show resolved Hide resolved

arsenm added 6 commits October 1, 2024 18:54

AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics

d85f641

Trying to codegen these on targets without the instructions should fail to select. Not sure if all the predicates are correct. The old predicate was a fake one, not connected to any subtarget feature, so it was always true. Fixes: SWDEV-482274

Formatting

7f8fd29

Remove from gfx12

cb93a89

Remove HasSDWA OtherPredicate

5c72cf0

Fix hasCvtFP8VOP1Bug predicate

e5063d7

Avoid selecting e32 case on gfx12; not sure how to best check for req…

cc9a3b9

…uiring the op_sel form

arsenm force-pushed the users/arsenm/amdgpu-fix-wrong-predicates-fp8-conversion-intrinsics branch from 394a284 to cc9a3b9 Compare October 1, 2024 15:06

rampitec approved these changes Oct 9, 2024

View reviewed changes

arsenm merged commit a075e78 into main Oct 9, 2024
8 checks passed

arsenm deleted the users/arsenm/amdgpu-fix-wrong-predicates-fp8-conversion-intrinsics branch October 9, 2024 17:38

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics #107291

AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics #107291

Uh oh!

arsenm commented Sep 4, 2024

Uh oh!

arsenm commented Sep 4, 2024

Uh oh!

llvmbot commented Sep 4, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

krzysz00 left a comment

Uh oh!

Uh oh!

arsenm commented Oct 9, 2024

Uh oh!

rampitec left a comment

Uh oh!

Uh oh!

Uh oh!

AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics #107291

AMDGPU: Fix incorrectly selecting fp8/bf8 conversion intrinsics #107291

Uh oh!

Conversation

arsenm commented Sep 4, 2024

Uh oh!

arsenm commented Sep 4, 2024

Uh oh!

llvmbot commented Sep 4, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

krzysz00 left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

arsenm commented Oct 9, 2024

Uh oh!

rampitec left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!