@@ -12,18 +12,124 @@ define <2 x half> @v_test_cvt_v2f32_v2f16(<2 x float> %src) {
12
12
ret <2 x half > %res
13
13
}
14
14
15
- define half @fptrunc_v2f32_v2f16_then_extract (< 2 x float > %src ) {
16
- ; GFX950-LABEL: fptrunc_v2f32_v2f16_then_extract :
15
; Truncates a <4 x float> argument to <4 x half> and returns the vector.
; The added checks expect two v_cvt_pk_f16_f32 instructions, one per pair of
; input registers (v0,v1 -> v0 and v2,v3 -> v1), followed by the return.
+ define < 4 x half > @v_test_cvt_v4f32_v4f16 (< 4 x float > %src ) {
16
+ ; GFX950-LABEL: v_test_cvt_v4f32_v4f16 :
17
17
; GFX950: ; %bb.0:
18
18
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
19
19
; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
20
- ; GFX950-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
20
+ ; GFX950-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
21
+ ; GFX950-NEXT: s_setpc_b64 s[30:31]
22
+ %res = fptrunc <4 x float > %src to <4 x half >
23
+ ret <4 x half > %res
24
+ }
25
+
26
; Truncates an <8 x float> argument to <8 x half> and returns the vector.
; The checks expect four v_cvt_pk_f16_f32 instructions, one per consecutive
; pair of input registers (v0..v7), writing results to v0..v3.
; NOTE(review): the function name ends in "_v2f16" although the result type is
; <8 x half>; the sibling test is named v_test_cvt_v4f32_v4f16, so "_v8f16" was
; probably intended - confirm before renaming (the LABEL check line must be
; renamed together with the function).
+ define <8 x half > @v_test_cvt_v8f32_v2f16 (<8 x float > %src ) {
27
+ ; GFX950-LABEL: v_test_cvt_v8f32_v2f16:
28
+ ; GFX950: ; %bb.0:
29
+ ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
30
+ ; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
31
+ ; GFX950-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
32
+ ; GFX950-NEXT: v_cvt_pk_f16_f32 v2, v4, v5
33
+ ; GFX950-NEXT: v_cvt_pk_f16_f32 v3, v6, v7
34
+ ; GFX950-NEXT: s_setpc_b64 s[30:31]
35
+ %res = fptrunc <8 x float > %src to <8 x half >
36
+ ret <8 x half > %res
37
+ }
38
+
39
; Truncates <2 x float> to <2 x half>, extracts both half elements and returns
; their sum. The checks expect a single v_cvt_pk_f16_f32 followed directly by
; one v_add_f16_sdwa that reads v0 for both operands (src0_sel:DWORD,
; src1_sel:WORD_1), with no other instruction before the return.
+ define half @fptrunc_v2f32_v2f16_extract_uses (<2 x float > %src ) {
40
+ ; GFX950-LABEL: fptrunc_v2f32_v2f16_extract_uses:
41
+ ; GFX950: ; %bb.0:
42
+ ; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
43
+ ; GFX950-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
44
+ ; GFX950-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
21
45
; GFX950-NEXT: s_setpc_b64 s[30:31]
22
46
%vec_half = fptrunc <2 x float > %src to <2 x half >
23
- %first = extractelement <2 x half > %vec_half , i64 1
24
- %second = extractelement <2 x half > %vec_half , i64 0
25
- %res = fadd half %first , %second
26
- ret half %res
47
+ %f0 = extractelement <2 x half > %vec_half , i64 0
48
+ %f1 = extractelement <2 x half > %vec_half , i64 1
49
+ %rslt = fadd half %f0 , %f1
50
+ ret half %rslt
51
+ }
52
+
53
; Truncates <4 x float> to <4 x half>, extracts all four elements and sums them
; pairwise: (f0 + f1) + (f2 + f3).
; Both check-prefix variants expect two v_cvt_pk_f16_f32, two v_add_f16_sdwa
; (one per converted register) and a final v_add_f16_e32; they differ only in
; the order of the conversions and in register assignment.
; NOTE(review): the -SDAG / -GISEL prefixes presumably correspond to
; SelectionDAG and GlobalISel RUN lines, which are not visible in this
; excerpt - confirm against the file header.
+ define half @fptrunc_v4f32_v4f16_extract_uses (<4 x float > %vec_float ) {
54
+ ; GFX950-SDAG-LABEL: fptrunc_v4f32_v4f16_extract_uses:
55
+ ; GFX950-SDAG: ; %bb.0:
56
+ ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
57
+ ; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v2, v2, v3
58
+ ; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
59
+ ; GFX950-SDAG-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
60
+ ; GFX950-SDAG-NEXT: v_add_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
61
+ ; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
62
+ ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
63
+ ;
64
+ ; GFX950-GISEL-LABEL: fptrunc_v4f32_v4f16_extract_uses:
65
+ ; GFX950-GISEL: ; %bb.0:
66
+ ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
67
+ ; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
68
+ ; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
69
+ ; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
70
+ ; GFX950-GISEL-NEXT: v_add_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
71
+ ; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
72
+ ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
73
+ %vec_half = fptrunc <4 x float > %vec_float to <4 x half >
74
+ %f0 = extractelement <4 x half > %vec_half , i64 0
75
+ %f1 = extractelement <4 x half > %vec_half , i64 1
76
+ %f2 = extractelement <4 x half > %vec_half , i64 2
77
+ %f3 = extractelement <4 x half > %vec_half , i64 3
78
+ %sum0 = fadd half %f0 , %f1
79
+ %sum1 = fadd half %f2 , %f3
80
+ %rslt = fadd half %sum0 , %sum1
81
+ ret half %rslt
82
+ }
83
+
84
; Truncates <8 x float> to <8 x half>, extracts all eight elements and reduces
; them with a pairwise add tree:
; ((f0 + f1) + (f2 + f3)) + ((f4 + f5) + (f6 + f7)).
; Both check-prefix variants expect four v_cvt_pk_f16_f32, four v_add_f16_sdwa
; (one per converted register) and three v_add_f16_e32; they differ only in
; the order of the conversions and in register assignment.
; NOTE(review): the -SDAG / -GISEL prefixes presumably correspond to
; SelectionDAG and GlobalISel RUN lines, which are not visible in this
; excerpt - confirm against the file header.
+ define half @fptrunc_v8f32_v8f16_extract_uses (<8 x float > %vec_float ) {
85
+ ; GFX950-SDAG-LABEL: fptrunc_v8f32_v8f16_extract_uses:
86
+ ; GFX950-SDAG: ; %bb.0:
87
+ ; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
88
+ ; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v6, v6, v7
89
+ ; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v4, v4, v5
90
+ ; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v2, v2, v3
91
+ ; GFX950-SDAG-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
92
+ ; GFX950-SDAG-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
93
+ ; GFX950-SDAG-NEXT: v_add_f16_sdwa v1, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
94
+ ; GFX950-SDAG-NEXT: v_add_f16_sdwa v2, v4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
95
+ ; GFX950-SDAG-NEXT: v_add_f16_sdwa v3, v6, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
96
+ ; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
97
+ ; GFX950-SDAG-NEXT: v_add_f16_e32 v1, v2, v3
98
+ ; GFX950-SDAG-NEXT: v_add_f16_e32 v0, v0, v1
99
+ ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
100
+ ;
101
+ ; GFX950-GISEL-LABEL: fptrunc_v8f32_v8f16_extract_uses:
102
+ ; GFX950-GISEL: ; %bb.0:
103
+ ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
104
+ ; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v0, v0, v1
105
+ ; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v1, v2, v3
106
+ ; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v2, v4, v5
107
+ ; GFX950-GISEL-NEXT: v_cvt_pk_f16_f32 v3, v6, v7
108
+ ; GFX950-GISEL-NEXT: v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
109
+ ; GFX950-GISEL-NEXT: v_add_f16_sdwa v1, v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
110
+ ; GFX950-GISEL-NEXT: v_add_f16_sdwa v2, v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
111
+ ; GFX950-GISEL-NEXT: v_add_f16_sdwa v3, v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
112
+ ; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
113
+ ; GFX950-GISEL-NEXT: v_add_f16_e32 v1, v2, v3
114
+ ; GFX950-GISEL-NEXT: v_add_f16_e32 v0, v0, v1
115
+ ; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
116
+ %vec_half = fptrunc <8 x float > %vec_float to <8 x half >
117
+ %f0 = extractelement <8 x half > %vec_half , i64 0
118
+ %f1 = extractelement <8 x half > %vec_half , i64 1
119
+ %f2 = extractelement <8 x half > %vec_half , i64 2
120
+ %f3 = extractelement <8 x half > %vec_half , i64 3
121
+ %f4 = extractelement <8 x half > %vec_half , i64 4
122
+ %f5 = extractelement <8 x half > %vec_half , i64 5
123
+ %f6 = extractelement <8 x half > %vec_half , i64 6
124
+ %f7 = extractelement <8 x half > %vec_half , i64 7
125
+ %sum0 = fadd half %f0 , %f1
126
+ %sum1 = fadd half %f2 , %f3
127
+ %sum2 = fadd half %f4 , %f5
128
+ %sum3 = fadd half %f6 , %f7
129
+ %sum4 = fadd half %sum0 , %sum1
130
+ %sum5 = fadd half %sum2 , %sum3
131
+ %rslt = fadd half %sum4 , %sum5
132
+ ret half %rslt
27
133
}
28
134
29
135
define <2 x half > @v_test_cvt_v2f64_v2f16 (<2 x double > %src ) {
0 commit comments