@@ -24,6 +24,166 @@ define amdgpu_kernel void @dpp8_wait_states(ptr addrspace(1) %out, i32 %in) {
24
24
ret void
25
25
}
26
26
27
; i64 overload: the input is passed through mov.dpp8 with selector immediate 1
; and the result is stored to %out. The checks expect one v_mov_b32_dpp per
; 32-bit register (v0, v1) followed by a single 64-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_i64:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_i64(i64 %in, ptr addrspace(1) %out) {
  %moved = call i64 @llvm.amdgcn.mov.dpp8.i64(i64 %in, i32 1)
  store i64 %moved, ptr addrspace(1) %out
  ret void
}
36
+
37
; <2 x i32> overload: the input is passed through mov.dpp8 with selector
; immediate 1 and the result is stored to %out. The checks expect one
; v_mov_b32_dpp per element (v0, v1) followed by a single 64-bit global store.
; The intrinsic suffix is .v2i32 so that the mangled name matches the
; <2 x i32> operand type used in the call.
; GFX10PLUS-LABEL: {{^}}dpp8_v2i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v2i32(<2 x i32> %in, ptr addrspace(1) %out) {
  %tmp0 = call <2 x i32> @llvm.amdgcn.mov.dpp8.v2i32(<2 x i32> %in, i32 1)
  store <2 x i32> %tmp0, ptr addrspace(1) %out
  ret void
}
46
+
47
; <3 x i32> overload: the input is passed through mov.dpp8 with selector
; immediate 1 and the result is stored to %out. The checks expect one
; v_mov_b32_dpp per element (v0..v2) followed by a single 96-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_v3i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3i32(<3 x i32> %in, ptr addrspace(1) %out) {
  %moved = call <3 x i32> @llvm.amdgcn.mov.dpp8.v3i32(<3 x i32> %in, i32 1)
  store <3 x i32> %moved, ptr addrspace(1) %out
  ret void
}
57
+
58
; <4 x i32> overload: the input is passed through mov.dpp8 with selector
; immediate 1 and the result is stored to %out. The checks expect one
; v_mov_b32_dpp per element (v0..v3) followed by a single 128-bit global store.
; The intrinsic suffix is .v4i32 so that the mangled name matches the
; <4 x i32> operand type used in the call.
; GFX10PLUS-LABEL: {{^}}dpp8_v4i32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v3, v3 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx4|b128}} v[4:5], v[0:3], off
define amdgpu_ps void @dpp8_v4i32(<4 x i32> %in, ptr addrspace(1) %out) {
  %tmp0 = call <4 x i32> @llvm.amdgcn.mov.dpp8.v4i32(<4 x i32> %in, i32 1)
  store <4 x i32> %tmp0, ptr addrspace(1) %out
  ret void
}
69
+
70
; Default-address-space pointer overload (p0): the pointer value itself is
; passed through mov.dpp8 with selector immediate 1 and stored to %out. The
; checks expect two v_mov_b32_dpp (v0, v1) and a single 64-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_p0:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_p0(ptr %in, ptr addrspace(1) %out) {
  %moved = call ptr @llvm.amdgcn.mov.dpp8.p0(ptr %in, i32 1)
  store ptr %moved, ptr addrspace(1) %out
  ret void
}
79
+
80
; Address-space-3 pointer overload: the pointer value is passed through
; mov.dpp8 with selector immediate 1 and stored to %out. The checks expect a
; single v_mov_b32_dpp on v0 and a 32-bit global store.
; The intrinsic suffix is .p3 so that the mangled name matches the scalar
; ptr addrspace(3) operand type used in the call.
; GFX10PLUS-LABEL: {{^}}dpp8_p3:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
define amdgpu_ps void @dpp8_p3(ptr addrspace(3) %in, ptr addrspace(1) %out) {
  %tmp0 = call ptr addrspace(3) @llvm.amdgcn.mov.dpp8.p3(ptr addrspace(3) %in, i32 1)
  store ptr addrspace(3) %tmp0, ptr addrspace(1) %out
  ret void
}
88
+
89
; <3 x ptr addrspace(3)> overload: the vector of pointers is passed through
; mov.dpp8 with selector immediate 1 and stored to %out. The checks expect one
; v_mov_b32_dpp per element (v0..v2) followed by a single 96-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_v3p3:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3p3(<3 x ptr addrspace(3)> %in, ptr addrspace(1) %out) {
  %moved = call <3 x ptr addrspace(3)> @llvm.amdgcn.mov.dpp8.v3p3(<3 x ptr addrspace(3)> %in, i32 1)
  store <3 x ptr addrspace(3)> %moved, ptr addrspace(1) %out
  ret void
}
99
+
100
; i16 overload: the input is passed through mov.dpp8 with selector immediate 1
; and stored to %out. The checks expect a single v_mov_b32_dpp on v0 and a
; 16-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_i16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_i16(i16 %in, ptr addrspace(1) %out) {
  %moved = call i16 @llvm.amdgcn.mov.dpp8.i16(i16 %in, i32 1)
  store i16 %moved, ptr addrspace(1) %out
  ret void
}
108
+
109
; <4 x i16> overload: the input is passed through mov.dpp8 with selector
; immediate 1 and stored to %out. The checks expect two v_mov_b32_dpp
; (v0, v1) followed by a single 64-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_v4i16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4i16(<4 x i16> %in, ptr addrspace(1) %out) {
  %moved = call <4 x i16> @llvm.amdgcn.mov.dpp8.v4i16(<4 x i16> %in, i32 1)
  store <4 x i16> %moved, ptr addrspace(1) %out
  ret void
}
118
+
119
; <4 x half> overload: the input is passed through mov.dpp8 with selector
; immediate 1 and stored to %out. The checks expect two v_mov_b32_dpp
; (v0, v1) followed by a single 64-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_v4f16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4f16(<4 x half> %in, ptr addrspace(1) %out) {
  %moved = call <4 x half> @llvm.amdgcn.mov.dpp8.v4f16(<4 x half> %in, i32 1)
  store <4 x half> %moved, ptr addrspace(1) %out
  ret void
}
128
+
129
; float overload: the input is passed through mov.dpp8 with selector
; immediate 1 and stored to %out. The checks expect a single v_mov_b32_dpp on
; v0 and a 32-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_float:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dword|b32}} v[1:2], v0, off
define amdgpu_ps void @dpp8_float(float %in, ptr addrspace(1) %out) {
  %moved = call float @llvm.amdgcn.mov.dpp8.f32(float %in, i32 1)
  store float %moved, ptr addrspace(1) %out
  ret void
}
137
+
138
; <3 x float> overload: the input is passed through mov.dpp8 with selector
; immediate 1 and stored to %out. The checks expect one v_mov_b32_dpp per
; element (v0..v2) followed by a single 96-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_v3f32:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v2, v2 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx3|b96}} v[3:4], v[0:2], off
define amdgpu_ps void @dpp8_v3f32(<3 x float> %in, ptr addrspace(1) %out) {
  %moved = call <3 x float> @llvm.amdgcn.mov.dpp8.v3f32(<3 x float> %in, i32 1)
  store <3 x float> %moved, ptr addrspace(1) %out
  ret void
}
148
+
149
; half overload: the input is passed through mov.dpp8 with selector
; immediate 1 and stored to %out. The checks expect a single v_mov_b32_dpp on
; v0 and a 16-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_half:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_half(half %in, ptr addrspace(1) %out) {
  %moved = call half @llvm.amdgcn.mov.dpp8.f16(half %in, i32 1)
  store half %moved, ptr addrspace(1) %out
  ret void
}
157
+
158
; bfloat overload: the input is passed through mov.dpp8 with selector
; immediate 1 and stored to %out. The checks expect a single v_mov_b32_dpp on
; v0 and a 16-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_bfloat:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{short|b16}} v[1:2], v0, off
define amdgpu_ps void @dpp8_bfloat(bfloat %in, ptr addrspace(1) %out) {
  %moved = call bfloat @llvm.amdgcn.mov.dpp8.bf16(bfloat %in, i32 1)
  store bfloat %moved, ptr addrspace(1) %out
  ret void
}
166
+
167
; <4 x bfloat> overload: the input is passed through mov.dpp8 with selector
; immediate 1 and stored to %out. The checks expect two v_mov_b32_dpp
; (v0, v1) followed by a single 64-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_v4bf16:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_v4bf16(<4 x bfloat> %in, ptr addrspace(1) %out) {
  %moved = call <4 x bfloat> @llvm.amdgcn.mov.dpp8.v4bf16(<4 x bfloat> %in, i32 1)
  store <4 x bfloat> %moved, ptr addrspace(1) %out
  ret void
}
176
+
177
; double overload: the input is passed through mov.dpp8 with selector
; immediate 1 and stored to %out. The checks expect two v_mov_b32_dpp
; (v0, v1) followed by a single 64-bit global store.
; GFX10PLUS-LABEL: {{^}}dpp8_double:
; GFX10PLUS-DAG: v_mov_b32_dpp v0, v0 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: v_mov_b32_dpp v1, v1 dpp8:[1,0,0,0,0,0,0,0]
; GFX10PLUS-DAG: global_store_{{dwordx2|b64}} v[2:3], v[0:1], off
define amdgpu_ps void @dpp8_double(double %in, ptr addrspace(1) %out) {
  %moved = call double @llvm.amdgcn.mov.dpp8.f64(double %in, i32 1)
  store double %moved, ptr addrspace(1) %out
  ret void
}
186
+
27
187
; Explicit declaration of the i32 overload of the mov.dpp8 intrinsic
; (takes two i32 operands, returns i32); tagged with attribute group #0.
declare i32 @llvm.amdgcn.mov.dpp8.i32 (i32 , i32 ) #0
28
188
29
189
; Attribute group #0, applied to the llvm.amdgcn.mov.dpp8.i32 declaration.
attributes #0 = { nounwind readnone convergent }
0 commit comments