Skip to content

Commit 5a7f694

Browse files
committed
further work
1 parent 0c107c0 commit 5a7f694

File tree

1 file changed

+209
-17
lines changed

1 file changed

+209
-17
lines changed

clang/lib/Headers/avx10_2convertintrin.h

Lines changed: 209 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -106,13 +106,13 @@ _mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
106106
/// \code{.operation}
107107
/// FOR i := 0 to 7
108108
/// IF mask[i]
109-
/// dst.fp16[i] := 0
110-
/// ELSE
111109
/// IF i < 4
112110
/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
113111
/// ELSE
114112
/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
115113
/// FI
114+
/// ELSE
115+
/// dst.fp16[i] := 0
116116
/// FI
117117
/// ENDFOR
118118
/// \endcode
@@ -130,8 +130,8 @@ _mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
130130
/// \returns
131131
/// A 128-bit vector of [8 x fp16]. Lower 4 elements correspond to the
132132
/// (converted) elements from \a __B; higher order elements correspond to the
133-
/// (converted) elements from \a __A. If corresponding mask bit is set, then
134-
/// zero is taken instead.
133+
/// (converted) elements from \a __A. If corresponding mask bit is not set,
134+
/// then zero is taken instead.
135135
static __inline__ __m128h __DEFAULT_FN_ATTRS128
136136
_mm_maskz_cvtx2ps_ph(__mmask8 __U, __m128 __A, __m128 __B) {
137137
return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
@@ -222,13 +222,13 @@ _mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
222222
/// \code{.operation}
223223
/// FOR i := 0 to 15
224224
/// IF mask[i]
225-
/// dst.fp16[i] := 0
226-
/// ELSE
227225
/// IF i < 8
228226
/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
229227
/// ELSE
230228
/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
231229
/// FI
230+
/// ELSE
231+
/// dst.fp16[i] := 0
232232
/// FI
233233
/// ENDFOR
234234
/// \endcode
@@ -238,16 +238,16 @@ _mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
238238
/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
239239
///
240240
/// \param __U
241-
/// A 8-bit merging mask.
241+
/// A 16-bit zeroing mask.
242242
/// \param __A
243243
/// A 256-bit vector of [8 x float].
244244
/// \param __B
245245
/// A 256-bit vector of [8 x float].
246246
/// \returns
247247
/// A 256-bit vector of [16 x fp16]. Lower 8 elements correspond to the
248248
/// (converted) elements from \a __B; higher order elements correspond to the
249-
/// (converted) elements from \a __A. If corresponding mask bit is set, then
250-
/// zero is taken instead.
249+
/// (converted) elements from \a __A. If corresponding mask bit is not set,
250+
/// then zero is taken instead.
251251
static __inline__ __m256h __DEFAULT_FN_ATTRS256
252252
_mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
253253
return (__m256h)__builtin_ia32_vcvt2ps2phx256_mask(
@@ -279,7 +279,7 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
279279
/// A 256-bit vector of [8 x float].
280280
/// \param __R
281281
/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
282-
/// result bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
282+
/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
283283
/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
284284
/// _MM_FROUND_TO_ZERO.
285285
/// \returns
@@ -325,7 +325,7 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
325325
/// A 256-bit vector of [8 x float].
326326
/// \param __R
327327
/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
328-
/// result bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
328+
/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
329329
/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
330330
/// _MM_FROUND_TO_ZERO.
331331
/// \returns
@@ -345,13 +345,13 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
345345
/// \code{.operation}
346346
/// FOR i := 0 to 15
347347
/// IF mask[i]
348-
/// dst.fp16[i] := 0
349-
/// ELSE
350348
/// IF i < 8
351349
/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
352350
/// ELSE
353351
/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
354352
/// FI
353+
/// ELSE
354+
/// dst.fp16[i] := 0
355355
/// FI
356356
/// ENDFOR
357357
/// \endcode
@@ -361,21 +361,21 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
361361
/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
362362
///
363363
/// \param __U
364-
/// A 8-bit merging mask.
364+
/// A 16-bit zeroing mask.
365365
/// \param __A
366366
/// A 256-bit vector of [8 x float].
367367
/// \param __B
368368
/// A 256-bit vector of [8 x float].
369369
/// \param __R
370370
/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
371-
/// result bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
371+
/// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
372372
/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
373373
/// _MM_FROUND_TO_ZERO.
374374
/// \returns
375375
/// A 256-bit vector of [16 x fp16]. Lower 8 elements correspond to the
376376
/// (converted) elements from \a __B; higher order elements correspond to the
377-
/// (converted) elements from \a __A. If corresponding mask bit is set, then
378-
/// zero is taken instead.
377+
/// (converted) elements from \a __A. If corresponding mask bit is not set,
378+
/// then zero is taken instead.
379379
#define _mm256_maskz_cvtx_round2ps_ph(U, A, B, R) \
380380
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
381381
(__v8sf)(A), (__v8sf)(B), (__v16hf)(_mm256_setzero_ph()), \
@@ -537,37 +537,229 @@ _mm256_maskz_cvtbiassph_phf8(__mmask16 __U, __m256i __A, __m256h __B) {
537537
(__mmask16)__U);
538538
}
539539

540+
/// Convert two 128-bit vectors, \a __A and \a __B, containing packed
541+
/// FP16 floating-point elements to a 128-bit vector
542+
/// containing E5M2 FP8 elements.
543+
///
544+
/// \code{.operation}
545+
/// FOR i := 0 to 15
546+
/// IF i < 8
547+
/// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
548+
/// ELSE
549+
/// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 8])
550+
/// FI
551+
/// ENDFOR
552+
/// \endcode
553+
///
554+
/// \headerfile <immintrin.h>
555+
///
556+
/// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
557+
///
558+
/// \param __A
559+
/// A 128-bit vector of [8 x fp16].
560+
/// \param __B
561+
/// A 128-bit vector of [8 x fp16].
562+
/// \returns
563+
/// A 128-bit vector of [16 x fp8]. Lower 8 elements correspond to the
564+
/// (converted) elements from \a __B; higher order elements correspond to the
565+
/// (converted) elements from \a __A.
540566
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtne2ph_pbf8(__m128h __A,
541567
__m128h __B) {
542568
return (__m128i)__builtin_ia32_vcvtne2ph2bf8_128((__v8hf)(__A),
543569
(__v8hf)(__B));
544570
}
545571

572+
/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
573+
/// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
574+
/// Merging mask \a __U is used to determine if given element should be taken
575+
/// from \a __W instead.
576+
///
577+
/// \code{.operation}
578+
/// FOR i := 0 to 15
579+
/// IF __U[i]
580+
/// IF i < 8
581+
/// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
582+
/// ELSE
583+
/// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 8])
584+
/// FI
585+
/// ELSE
586+
/// dst.fp8[i] := __W.fp8[i]
587+
/// FI
588+
/// ENDFOR
589+
/// \endcode
590+
///
591+
/// \headerfile <immintrin.h>
592+
///
593+
/// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
594+
///
595+
/// \param __W
596+
/// A 128-bit vector of [16 x fp8].
597+
/// \param __U
598+
/// A 16-bit merging mask.
599+
/// \param __A
600+
/// A 128-bit vector of [8 x fp16].
601+
/// \param __B
602+
/// A 128-bit vector of [8 x fp16].
603+
/// \returns
604+
/// A 128-bit vector of [16 x fp8]. Lower 8 elements correspond to the
605+
/// (converted) elements from \a __B; higher order elements correspond to the
606+
/// (converted) elements from \a __A. If corresponding mask bit is not set,
607+
/// then element from \a __W is taken instead.
546608
static __inline__ __m128i __DEFAULT_FN_ATTRS128
547609
_mm_mask_cvtne2ph_pbf8(__m128i __W, __mmask16 __U, __m128h __A, __m128h __B) {
548610
return (__m128i)__builtin_ia32_selectb_128(
549611
(__mmask16)__U, (__v16qi)_mm_cvtne2ph_pbf8(__A, __B), (__v16qi)__W);
550612
}
551613

614+
/// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
615+
/// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
616+
/// Zeroing mask \a __U is used to determine if given element should be zeroed
617+
/// instead.
618+
///
619+
/// \code{.operation}
620+
/// FOR i := 0 to 15
621+
/// IF __U[i]
622+
/// IF i < 8
623+
/// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
624+
/// ELSE
625+
/// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 8])
626+
/// FI
627+
/// ELSE
628+
/// dst.fp8[i] := 0
629+
/// FI
630+
/// ENDFOR
631+
/// \endcode
632+
///
633+
/// \headerfile <immintrin.h>
634+
///
635+
/// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
636+
///
637+
/// \param __U
638+
/// A 16-bit zeroing mask.
639+
/// \param __A
640+
/// A 128-bit vector of [8 x fp16].
641+
/// \param __B
642+
/// A 128-bit vector of [8 x fp16].
643+
/// \returns
644+
/// A 128-bit vector of [16 x fp8]. Lower 8 elements correspond to the
645+
/// (converted) elements from \a __B; higher order elements correspond to the
646+
/// (converted) elements from \a __A. If corresponding mask bit is not set, then
647+
/// zero is taken instead.
552648
static __inline__ __m128i __DEFAULT_FN_ATTRS128
553649
_mm_maskz_cvtne2ph_pbf8(__mmask16 __U, __m128h __A, __m128h __B) {
554650
return (__m128i)__builtin_ia32_selectb_128(
555651
(__mmask16)__U, (__v16qi)_mm_cvtne2ph_pbf8(__A, __B),
556652
(__v16qi)(__m128i)_mm_setzero_si128());
557653
}
558654

655+
/// Convert two 256-bit vectors, \a __A and \a __B, containing packed
656+
/// FP16 floating-point elements to a 256-bit vector
657+
/// containing E5M2 FP8 elements.
658+
///
659+
/// \code{.operation}
660+
/// FOR i := 0 to 31
661+
/// IF i < 16
662+
/// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
663+
/// ELSE
664+
/// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 16])
665+
/// FI
666+
/// ENDFOR
667+
/// \endcode
668+
///
669+
/// \headerfile <immintrin.h>
670+
///
671+
/// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
672+
///
673+
/// \param __A
674+
/// A 256-bit vector of [16 x fp16].
675+
/// \param __B
676+
/// A 256-bit vector of [16 x fp16].
677+
/// \returns
678+
/// A 256-bit vector of [32 x fp8]. Lower 16 elements correspond to the
679+
/// (converted) elements from \a __B; higher order elements correspond to the
680+
/// (converted) elements from \a __A.
559681
static __inline__ __m256i __DEFAULT_FN_ATTRS256
560682
_mm256_cvtne2ph_pbf8(__m256h __A, __m256h __B) {
561683
return (__m256i)__builtin_ia32_vcvtne2ph2bf8_256((__v16hf)(__A),
562684
(__v16hf)(__B));
563685
}
564686

687+
/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
688+
/// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
689+
/// Merging mask \a __U is used to determine if given element should be taken
690+
/// from \a __W instead.
691+
///
692+
/// \code{.operation}
693+
/// FOR i := 0 to 31
694+
/// IF __U[i]
695+
/// IF i < 16
696+
/// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
697+
/// ELSE
698+
/// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 16])
699+
/// FI
700+
/// ELSE
701+
/// dst.fp8[i] := __W.fp8[i]
702+
/// FI
703+
/// ENDFOR
704+
/// \endcode
705+
///
706+
/// \headerfile <immintrin.h>
707+
///
708+
/// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
709+
///
710+
/// \param __W
711+
/// A 256-bit vector of [32 x fp8].
712+
/// \param __U
713+
/// A 32-bit merging mask.
714+
/// \param __A
715+
/// A 256-bit vector of [16 x fp16].
716+
/// \param __B
717+
/// A 256-bit vector of [16 x fp16].
718+
/// \returns
719+
/// A 256-bit vector of [32 x fp8]. Lower 16 elements correspond to the
720+
/// (converted) elements from \a __B; higher order elements correspond to the
721+
/// (converted) elements from \a __A. If corresponding mask bit is not set,
722+
/// then element from \a __W is taken instead.
565723
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ph_pbf8(
566724
__m256i __W, __mmask32 __U, __m256h __A, __m256h __B) {
567725
return (__m256i)__builtin_ia32_selectb_256(
568726
(__mmask32)__U, (__v32qi)_mm256_cvtne2ph_pbf8(__A, __B), (__v32qi)__W);
569727
}
570728

729+
/// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
730+
/// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
731+
/// Zeroing mask \a __U is used to determine if given element should be zeroed
732+
/// instead.
733+
///
734+
/// \code{.operation}
735+
/// FOR i := 0 to 31
736+
/// IF __U[i]
737+
/// IF i < 16
738+
/// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
739+
/// ELSE
740+
/// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 16])
741+
/// FI
742+
/// ELSE
743+
/// dst.fp8[i] := 0
744+
/// FI
745+
/// ENDFOR
746+
/// \endcode
747+
///
748+
/// \headerfile <immintrin.h>
749+
///
750+
/// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
751+
///
752+
/// \param __U
753+
/// A 32-bit zeroing mask.
754+
/// \param __A
755+
/// A 256-bit vector of [16 x fp16].
756+
/// \param __B
757+
/// A 256-bit vector of [16 x fp16].
758+
/// \returns
759+
/// A 256-bit vector of [32 x fp8]. Lower 16 elements correspond to the
760+
/// (converted) elements from \a __B; higher order elements correspond to the
761+
/// (converted) elements from \a __A. If corresponding mask bit is not set,
762+
/// then zero is taken instead.
571763
static __inline__ __m256i __DEFAULT_FN_ATTRS256
572764
_mm256_maskz_cvtne2ph_pbf8(__mmask32 __U, __m256h __A, __m256h __B) {
573765
return (__m256i)__builtin_ia32_selectb_256(

0 commit comments

Comments
 (0)