Skip to content

Commit 94316dd

Browse files
authored
[AMDGPU][True16][CodeGen] saddsat/ssubsat sdag for true16 format (#118245)
Add SelectionDAG (SDAG) codegen patterns for saddsat and ssubsat using the True16 instruction format.
1 parent de6d0d2 commit 94316dd

File tree

3 files changed

+96
-28
lines changed

3 files changed

+96
-28
lines changed

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -763,6 +763,10 @@ let SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts in {
763763
def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
764764
def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
765765
} // End SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts
766+
let True16Predicate = UseRealTrue16Insts in {
767+
def : OpSelBinOpClampPat<saddsat, V_ADD_I16_t16_e64>;
768+
def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_t16_e64>;
769+
} // End True16Predicate = UseRealTrue16Insts
766770
let True16Predicate = UseFakeTrue16Insts in {
767771
def : OpSelBinOpClampPat<saddsat, V_ADD_I16_fake16_e64>;
768772
def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_fake16_e64>;

llvm/test/CodeGen/AMDGPU/saddsat.ll

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
44
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
55
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
6-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
78

89
define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
910
; GFX6-LABEL: v_saddsat_i8:
@@ -34,14 +35,32 @@ define i8 @v_saddsat_i8(i8 %lhs, i8 %rhs) {
3435
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
3536
; GFX9-NEXT: s_setpc_b64 s[30:31]
3637
;
37-
; GFX10PLUS-LABEL: v_saddsat_i8:
38-
; GFX10PLUS: ; %bb.0:
39-
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40-
; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
41-
; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
42-
; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp
43-
; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
44-
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
38+
; GFX10-LABEL: v_saddsat_i8:
39+
; GFX10: ; %bb.0:
40+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41+
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
42+
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
43+
; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
44+
; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
45+
; GFX10-NEXT: s_setpc_b64 s[30:31]
46+
;
47+
; GFX11-TRUE16-LABEL: v_saddsat_i8:
48+
; GFX11-TRUE16: ; %bb.0:
49+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50+
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
51+
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
52+
; GFX11-TRUE16-NEXT: v_add_nc_i16 v0.l, v0.l, v0.h clamp
53+
; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v0.l
54+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
55+
;
56+
; GFX11-FAKE16-LABEL: v_saddsat_i8:
57+
; GFX11-FAKE16: ; %bb.0:
58+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59+
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
60+
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0
61+
; GFX11-FAKE16-NEXT: v_add_nc_i16 v0, v0, v1 clamp
62+
; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0
63+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
4564
%result = call i8 @llvm.sadd.sat.i8(i8 %lhs, i8 %rhs)
4665
ret i8 %result
4766
}
@@ -76,11 +95,24 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) {
7695
; GFX9-NEXT: v_add_i16 v0, v0, v1 clamp
7796
; GFX9-NEXT: s_setpc_b64 s[30:31]
7897
;
79-
; GFX10PLUS-LABEL: v_saddsat_i16:
80-
; GFX10PLUS: ; %bb.0:
81-
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82-
; GFX10PLUS-NEXT: v_add_nc_i16 v0, v0, v1 clamp
83-
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
98+
; GFX10-LABEL: v_saddsat_i16:
99+
; GFX10: ; %bb.0:
100+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101+
; GFX10-NEXT: v_add_nc_i16 v0, v0, v1 clamp
102+
; GFX10-NEXT: s_setpc_b64 s[30:31]
103+
;
104+
; GFX11-TRUE16-LABEL: v_saddsat_i16:
105+
; GFX11-TRUE16: ; %bb.0:
106+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
108+
; GFX11-TRUE16-NEXT: v_add_nc_i16 v0.l, v0.l, v0.h clamp
109+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
110+
;
111+
; GFX11-FAKE16-LABEL: v_saddsat_i16:
112+
; GFX11-FAKE16: ; %bb.0:
113+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114+
; GFX11-FAKE16-NEXT: v_add_nc_i16 v0, v0, v1 clamp
115+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
84116
%result = call i16 @llvm.sadd.sat.i16(i16 %lhs, i16 %rhs)
85117
ret i16 %result
86118
}

llvm/test/CodeGen/AMDGPU/ssubsat.ll

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=fiji < %s | FileCheck --check-prefix=GFX8 %s
44
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
55
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX10 %s
6-
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11 %s
6+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s
7+
; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s
78

89
define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
910
; GFX6-LABEL: v_ssubsat_i8:
@@ -34,14 +35,32 @@ define i8 @v_ssubsat_i8(i8 %lhs, i8 %rhs) {
3435
; GFX9-NEXT: v_ashrrev_i16_e32 v0, 8, v0
3536
; GFX9-NEXT: s_setpc_b64 s[30:31]
3637
;
37-
; GFX10PLUS-LABEL: v_ssubsat_i8:
38-
; GFX10PLUS: ; %bb.0:
39-
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
40-
; GFX10PLUS-NEXT: v_lshlrev_b16 v1, 8, v1
41-
; GFX10PLUS-NEXT: v_lshlrev_b16 v0, 8, v0
42-
; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
43-
; GFX10PLUS-NEXT: v_ashrrev_i16 v0, 8, v0
44-
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
38+
; GFX10-LABEL: v_ssubsat_i8:
39+
; GFX10: ; %bb.0:
40+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
41+
; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1
42+
; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0
43+
; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
44+
; GFX10-NEXT: v_ashrrev_i16 v0, 8, v0
45+
; GFX10-NEXT: s_setpc_b64 s[30:31]
46+
;
47+
; GFX11-TRUE16-LABEL: v_ssubsat_i8:
48+
; GFX11-TRUE16: ; %bb.0:
49+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
50+
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v1.l
51+
; GFX11-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l
52+
; GFX11-TRUE16-NEXT: v_sub_nc_i16 v0.l, v0.l, v0.h clamp
53+
; GFX11-TRUE16-NEXT: v_ashrrev_i16 v0.l, 8, v0.l
54+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
55+
;
56+
; GFX11-FAKE16-LABEL: v_ssubsat_i8:
57+
; GFX11-FAKE16: ; %bb.0:
58+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
59+
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1
60+
; GFX11-FAKE16-NEXT: v_lshlrev_b16 v0, 8, v0
61+
; GFX11-FAKE16-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
62+
; GFX11-FAKE16-NEXT: v_ashrrev_i16 v0, 8, v0
63+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
4564
%result = call i8 @llvm.ssub.sat.i8(i8 %lhs, i8 %rhs)
4665
ret i8 %result
4766
}
@@ -76,11 +95,24 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) {
7695
; GFX9-NEXT: v_sub_i16 v0, v0, v1 clamp
7796
; GFX9-NEXT: s_setpc_b64 s[30:31]
7897
;
79-
; GFX10PLUS-LABEL: v_ssubsat_i16:
80-
; GFX10PLUS: ; %bb.0:
81-
; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
82-
; GFX10PLUS-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
83-
; GFX10PLUS-NEXT: s_setpc_b64 s[30:31]
98+
; GFX10-LABEL: v_ssubsat_i16:
99+
; GFX10: ; %bb.0:
100+
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
101+
; GFX10-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
102+
; GFX10-NEXT: s_setpc_b64 s[30:31]
103+
;
104+
; GFX11-TRUE16-LABEL: v_ssubsat_i16:
105+
; GFX11-TRUE16: ; %bb.0:
106+
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
107+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
108+
; GFX11-TRUE16-NEXT: v_sub_nc_i16 v0.l, v0.l, v0.h clamp
109+
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
110+
;
111+
; GFX11-FAKE16-LABEL: v_ssubsat_i16:
112+
; GFX11-FAKE16: ; %bb.0:
113+
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
114+
; GFX11-FAKE16-NEXT: v_sub_nc_i16 v0, v0, v1 clamp
115+
; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
84116
%result = call i16 @llvm.ssub.sat.i16(i16 %lhs, i16 %rhs)
85117
ret i16 %result
86118
}

0 commit comments

Comments
 (0)