# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS)
#endif

+ #pragma push_macro("SM_53")
#pragma push_macro("SM_70")
#pragma push_macro("SM_72")
#pragma push_macro("SM_75")

#pragma push_macro("SM_60")
#define SM_60 "sm_60|sm_61|sm_62|" SM_70
+ #define SM_53 "sm_53|" SM_60

+ #pragma push_macro("PTX42")
#pragma push_macro("PTX60")
#pragma push_macro("PTX61")
#pragma push_macro("PTX63")
#define PTX63 "ptx63|" PTX64
#define PTX61 "ptx61|" PTX63
#define PTX60 "ptx60|" PTX61
+ #define PTX42 "ptx42|" PTX60

#pragma push_macro("AND")
#define AND(a, b) "(" a "),(" b ")"
@@ -110,13 +114,89 @@ BUILTIN(__nvvm_prmt, "UiUiUiUi", "")

// Min Max

- BUILTIN(__nvvm_fmax_ftz_f, "fff", "")
- BUILTIN(__nvvm_fmax_f, "fff", "")
- BUILTIN(__nvvm_fmin_ftz_f, "fff", "")
- BUILTIN(__nvvm_fmin_f, "fff", "")
+ TARGET_BUILTIN(__nvvm_fmin_f16, "hhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_f16, "hhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_nan_f16, "hhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16, "hhh", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f16x2, "V2hV2hV2h", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_bf16, "UsUsUs", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16, "UsUsUs", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
+                AND(SM_86, PTX72))
+ BUILTIN(__nvvm_fmin_f, "fff", "")
+ BUILTIN(__nvvm_fmin_ftz_f, "fff", "")
+ TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_nan_f, "fff", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmin_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmin_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
+ BUILTIN(__nvvm_fmin_d, "ddd", "")

+ TARGET_BUILTIN(__nvvm_fmax_f16, "hhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_f16, "hhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_nan_f16, "hhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16, "hhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16, "hhh", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16, "hhh", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f16x2, "V2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f16x2, "V2hV2hV2h", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f16x2, "V2hV2hV2h", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f16x2, "V2hV2hV2h", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_bf16, "UsUsUs", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_nan_bf16, "UsUsUs", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16, "UsUsUs", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16, "UsUsUs", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_nan_bf16x2, "ZUiZUiZUi", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
+                AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_bf16x2, "ZUiZUiZUi", "",
+                AND(SM_86, PTX72))
+ BUILTIN(__nvvm_fmax_f, "fff", "")
+ BUILTIN(__nvvm_fmax_ftz_f, "fff", "")
+ TARGET_BUILTIN(__nvvm_fmax_nan_f, "fff", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_nan_f, "fff", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fmax_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
+ TARGET_BUILTIN(__nvvm_fmax_ftz_nan_xorsign_abs_f, "fff", "", AND(SM_86, PTX72))
BUILTIN(__nvvm_fmax_d, "ddd", "")
- BUILTIN(__nvvm_fmin_d, "ddd", "")

// Multiplication

@@ -182,11 +262,6 @@ BUILTIN(__nvvm_fabs_ftz_f, "ff", "")
BUILTIN(__nvvm_fabs_f, "ff", "")
BUILTIN(__nvvm_fabs_d, "dd", "")

- // Neg
-
- TARGET_BUILTIN(__nvvm_neg_bf16, "ZUsZUs", "", AND(SM_80,PTX70))
- TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))
-
// Round

BUILTIN(__nvvm_round_ftz_f, "ff", "")
@@ -210,6 +285,8 @@ BUILTIN(__nvvm_saturate_d, "dd", "")
BUILTIN(__nvvm_ex2_approx_ftz_f, "ff", "")
BUILTIN(__nvvm_ex2_approx_f, "ff", "")
BUILTIN(__nvvm_ex2_approx_d, "dd", "")
+ TARGET_BUILTIN(__nvvm_ex2_approx_f16, "hh", "", AND(SM_75, PTX70))
+ TARGET_BUILTIN(__nvvm_ex2_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))

BUILTIN(__nvvm_lg2_approx_ftz_f, "ff", "")
BUILTIN(__nvvm_lg2_approx_f, "ff", "")
@@ -223,8 +300,30 @@ BUILTIN(__nvvm_sin_approx_f, "ff", "")
BUILTIN(__nvvm_cos_approx_ftz_f, "ff", "")
BUILTIN(__nvvm_cos_approx_f, "ff", "")

+ // Tanh
+
+ TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70))
+ TARGET_BUILTIN(__nvvm_tanh_approx_f16, "hh", "", AND(SM_75, PTX70))
+ TARGET_BUILTIN(__nvvm_tanh_approx_f16x2, "V2hV2h", "", AND(SM_75, PTX70))
+
// Fma

+ TARGET_BUILTIN(__nvvm_fma_rn_f16, "hhhh", "", AND(SM_53, PTX42))
+ TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16, "hhhh", "", AND(SM_53, PTX42))
+ TARGET_BUILTIN(__nvvm_fma_rn_sat_f16, "hhhh", "", AND(SM_53, PTX42))
+ TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16, "hhhh", "", AND(SM_53, PTX42))
+ TARGET_BUILTIN(__nvvm_fma_rn_relu_f16, "hhhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16, "hhhh", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fma_rn_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
+ TARGET_BUILTIN(__nvvm_fma_rn_ftz_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
+ TARGET_BUILTIN(__nvvm_fma_rn_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
+ TARGET_BUILTIN(__nvvm_fma_rn_ftz_sat_f16x2, "V2hV2hV2hV2h", "", AND(SM_53, PTX42))
+ TARGET_BUILTIN(__nvvm_fma_rn_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fma_rn_ftz_relu_f16x2, "V2hV2hV2hV2h", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fma_rn_bf16, "UsUsUsUs", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16, "UsUsUsUs", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fma_rn_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70))
+ TARGET_BUILTIN(__nvvm_fma_rn_relu_bf16x2, "ZUiZUiZUiZUi", "", AND(SM_80, PTX70))
BUILTIN(__nvvm_fma_rn_ftz_f, "ffff", "")
BUILTIN(__nvvm_fma_rn_f, "ffff", "")
BUILTIN(__nvvm_fma_rz_ftz_f, "ffff", "")
@@ -2306,15 +2405,24 @@ TARGET_BUILTIN(__nvvm_cp_async_commit_group, "v", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_wait_group, "vIi", "", AND(SM_80,PTX70))
TARGET_BUILTIN(__nvvm_cp_async_wait_all, "v", "", AND(SM_80,PTX70))

+
+ // bf16, bf16x2 abs, neg
+ TARGET_BUILTIN(__nvvm_abs_bf16, "UsUs", "", AND(SM_80,PTX70))
+ TARGET_BUILTIN(__nvvm_abs_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))
+ TARGET_BUILTIN(__nvvm_neg_bf16, "UsUs", "", AND(SM_80,PTX70))
+ TARGET_BUILTIN(__nvvm_neg_bf16x2, "ZUiZUi", "", AND(SM_80,PTX70))
+
#undef BUILTIN
#undef TARGET_BUILTIN
#pragma pop_macro("AND")
+ #pragma pop_macro("SM_53")
#pragma pop_macro("SM_60")
#pragma pop_macro("SM_70")
#pragma pop_macro("SM_72")
#pragma pop_macro("SM_75")
#pragma pop_macro("SM_80")
#pragma pop_macro("SM_86")
+ #pragma pop_macro("PTX42")
#pragma pop_macro("PTX60")
#pragma pop_macro("PTX61")
#pragma pop_macro("PTX63")
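
For context, a minimal usage sketch, not part of this change: it shows how two of the single-precision builtins declared above might be called from CUDA device code built with Clang. The kernel name `demo` and the bounds check are illustrative; it assumes a GPU target of at least sm_80 with PTX 7.0 so that the AND(SM_80, PTX70) and AND(SM_75,PTX70) feature guards on these builtins are satisfied.

__global__ void demo(float *out, const float *a, const float *b, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;  // one thread per element
  if (i < n) {
    // NaN-propagating minimum, declared above as
    // TARGET_BUILTIN(__nvvm_fmin_nan_f, "fff", "", AND(SM_80, PTX70)).
    float m = __nvvm_fmin_nan_f(a[i], b[i]);
    // Approximate hardware tanh, declared above as
    // TARGET_BUILTIN(__nvvm_tanh_approx_f, "ff", "", AND(SM_75,PTX70)).
    out[i] = __nvvm_tanh_approx_f(m);
  }
}

When such code is compiled with, say, clang++ -x cuda --cuda-gpu-arch=sm_80, the feature strings built from the SM_* and PTX* macros are what let Sema reject calls to these builtins on targets whose architecture or PTX version is too old.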