@@ -106,13 +106,13 @@ _mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
106
106
/// \code{.operation}
107
107
/// FOR i := 0 to 7
108
108
/// IF mask[i]
109
- /// dst.fp16[i] := 0
110
- /// ELSE
111
109
/// IF i < 4
112
110
/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
113
111
/// ELSE
114
112
/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 4])
115
113
/// FI
114
+ /// ELSE
115
+ /// dst.fp16[i] := 0
116
116
/// FI
117
117
/// ENDFOR
118
118
/// \endcode
@@ -130,8 +130,8 @@ _mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
130
130
/// \returns
131
131
/// A 128-bit vector of [8 x fp16]. Lower 4 elements correspond to the
132
132
/// (converted) elements from \a __B; higher order elements correspond to the
133
- /// (converted) elements from \a __A. If corresponding mask bit is set, then
134
- /// zero is taken instead.
133
+ /// (converted) elements from \a __A. If corresponding mask bit is not set,
134
+ /// then zero is taken instead.
135
135
static __inline__ __m128h __DEFAULT_FN_ATTRS128
136
136
_mm_maskz_cvtx2ps_ph (__mmask8 __U , __m128 __A , __m128 __B ) {
137
137
return (__m128h )__builtin_ia32_vcvt2ps2phx128_mask (
@@ -222,13 +222,13 @@ _mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
222
222
/// \code{.operation}
223
223
/// FOR i := 0 to 15
224
224
/// IF mask[i]
225
- /// dst.fp16[i] := 0
226
- /// ELSE
227
225
/// IF i < 8
228
226
/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
229
227
/// ELSE
230
228
/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
231
229
/// FI
230
+ /// ELSE
231
+ /// dst.fp16[i] := 0
232
232
/// FI
233
233
/// ENDFOR
234
234
/// \endcode
@@ -238,16 +238,16 @@ _mm256_mask_cvtx2ps_ph(__m256h __W, __mmask16 __U, __m256 __A, __m256 __B) {
238
238
/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
239
239
///
240
240
/// \param __U
241
- /// A 8-bit merging mask.
241
+ /// A 16-bit zeroing mask.
242
242
/// \param __A
243
243
/// A 256-bit vector of [8 x float].
244
244
/// \param __B
245
245
/// A 256-bit vector of [8 x float].
246
246
/// \returns
247
247
/// A 256-bit vector of [16 x fp16]. Lower 8 elements correspond to the
248
248
/// (converted) elements from \a __B; higher order elements correspond to the
249
- /// (converted) elements from \a __A. If corresponding mask bit is set, then
250
- /// zero is taken instead.
249
+ /// (converted) elements from \a __A. If corresponding mask bit is not set,
250
+ /// then zero is taken instead.
251
251
static __inline__ __m256h __DEFAULT_FN_ATTRS256
252
252
_mm256_maskz_cvtx2ps_ph (__mmask16 __U , __m256 __A , __m256 __B ) {
253
253
return (__m256h )__builtin_ia32_vcvt2ps2phx256_mask (
@@ -279,7 +279,7 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
279
279
/// A 256-bit vector of [8 x float].
280
280
/// \param __R
281
281
/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
282
- /// result bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
282
+ /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
283
283
/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
284
284
/// _MM_FROUND_TO_ZERO.
285
285
/// \returns
@@ -325,7 +325,7 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
325
325
/// A 256-bit vector of [8 x float].
326
326
/// \param __R
327
327
/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
328
- /// result bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
328
+ /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
329
329
/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
330
330
/// _MM_FROUND_TO_ZERO.
331
331
/// \returns
@@ -345,13 +345,13 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
345
345
/// \code{.operation}
346
346
/// FOR i := 0 to 15
347
347
/// IF mask[i]
348
- /// dst.fp16[i] := 0
349
- /// ELSE
350
348
/// IF i < 8
351
349
/// dst.fp16[i] := convert_fp32_to_fp16(__B.fp32[i])
352
350
/// ELSE
353
351
/// dst.fp16[i] := convert_fp32_to_fp16(__A.fp32[i - 8])
354
352
/// FI
353
+ /// ELSE
354
+ /// dst.fp16[i] := 0
355
355
/// FI
356
356
/// ENDFOR
357
357
/// \endcode
@@ -361,21 +361,21 @@ _mm256_maskz_cvtx2ps_ph(__mmask16 __U, __m256 __A, __m256 __B) {
361
361
/// This intrinsic corresponds to the \c VCVT2PS2PHX instruction.
362
362
///
363
363
/// \param __U
364
- /// A 8-bit merging mask.
364
+ /// A 16-bit zeroing mask.
365
365
/// \param __A
366
366
/// A 256-bit vector of [8 x float].
367
367
/// \param __B
368
368
/// A 256-bit vector of [8 x float].
369
369
/// \param __R
370
370
/// Rounding mode. Valid inputs are: _MM_FROUND_CUR_DIRECTION or
371
- /// result bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
371
+ /// result of bitwise or of _MM_FROUND_NO_EXC with at most one of the following:
372
372
/// _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_TO_NEG_INF, _MM_FROUND_TO_POS_INF,
373
373
/// _MM_FROUND_TO_ZERO.
374
374
/// \returns
375
375
/// A 256-bit vector of [16 x fp16]. Lower 8 elements correspond to the
376
376
/// (converted) elements from \a __B; higher order elements correspond to the
377
- /// (converted) elements from \a __A. If corresponding mask bit is set, then
378
- /// zero is taken instead.
377
+ /// (converted) elements from \a __A. If corresponding mask bit is not set,
378
+ /// then zero is taken instead.
379
379
#define _mm256_maskz_cvtx_round2ps_ph (U , A , B , R ) \
380
380
((__m256h)__builtin_ia32_vcvt2ps2phx256_mask( \
381
381
(__v8sf)(A), (__v8sf)(B), (__v16hf)(_mm256_setzero_ph()), \
@@ -537,37 +537,229 @@ _mm256_maskz_cvtbiassph_phf8(__mmask16 __U, __m256i __A, __m256h __B) {
537
537
(__mmask16 )__U );
538
538
}
539
539
540
+ /// Convert two 128-bit vectors, \a __A and \a __B, containing packed
541
+ /// FP16 floating-point elements to a 128-bit vector
542
+ /// containing E5M2 FP8 elements.
543
+ ///
544
+ /// \code{.operation]
545
+ /// FOR i := 0 to 16
546
+ /// IF i < 8
547
+ /// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
548
+ /// ELSE
549
+ /// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 8])
550
+ /// FI
551
+ /// ENDFOR
552
+ /// \endcode
553
+ ///
554
+ /// \headerfile <immintrin.h>
555
+ ///
556
+ /// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
557
+ ///
558
+ /// \param __A
559
+ /// A 128-bit vector of [8 x fp16].
560
+ /// \param __B
561
+ /// A 128-bit vector of [8 x fp16].
562
+ /// \returns
563
+ /// A 128-bit vector of [16 x fp8]. Lower 8 elements correspond to the
564
+ /// (converted) elements from \a __B; higher order elements correspond to the
565
+ /// (converted) elements from \a __A.
540
566
static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtne2ph_pbf8 (__m128h __A ,
541
567
__m128h __B ) {
542
568
return (__m128i )__builtin_ia32_vcvtne2ph2bf8_128 ((__v8hf )(__A ),
543
569
(__v8hf )(__B ));
544
570
}
545
571
572
+ /// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
573
+ /// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
574
+ /// Merging mask \a __U is used to determine if given element should be taken
575
+ /// from \a __W instead.
576
+ ///
577
+ /// \code{.operation]
578
+ /// FOR i := 0 to 16
579
+ /// IF __U[i]
580
+ /// dst.fp8[i] := __W[i]
581
+ /// ELSE
582
+ /// IF i < 8
583
+ /// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
584
+ /// ELSE
585
+ /// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 8])
586
+ /// FI
587
+ /// FI
588
+ /// ENDFOR
589
+ /// \endcode
590
+ ///
591
+ /// \headerfile <immintrin.h>
592
+ ///
593
+ /// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
594
+ ///
595
+ /// \param __W
596
+ /// A 128-bit vector of [16 x fp8].
597
+ /// \param __U
598
+ /// A 16-bit merging mask.
599
+ /// \param __A
600
+ /// A 128-bit vector of [8 x fp16].
601
+ /// \param __B
602
+ /// A 128-bit vector of [8 x fp16].
603
+ /// \returns
604
+ /// A 128-bit vector of [16 x fp8]. Lower 8 elements correspond to the
605
+ /// (converted) elements from \a __B; higher order elements correspond to the
606
+ /// (converted) elements from \a __A. If corresponding mask bit is set, then
607
+ /// element from \a __W is taken instead.
546
608
static __inline__ __m128i __DEFAULT_FN_ATTRS128
547
609
_mm_mask_cvtne2ph_pbf8 (__m128i __W , __mmask16 __U , __m128h __A , __m128h __B ) {
548
610
return (__m128i )__builtin_ia32_selectb_128 (
549
611
(__mmask16 )__U , (__v16qi )_mm_cvtne2ph_pbf8 (__A , __B ), (__v16qi )__W );
550
612
}
551
613
614
+ /// Convert two 128-bit vectors, \a __A and \a __B, containing packed FP16
615
+ /// floating-point elements to a 128-bit vector containing E5M2 FP8 elements.
616
+ /// Zeroing mask \a __U is used to determine if given element should be zeroed
617
+ /// instead.
618
+ ///
619
+ /// \code{.operation]
620
+ /// FOR i := 0 to 16
621
+ /// IF __U[i]
622
+ /// IF i < 8
623
+ /// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
624
+ /// ELSE
625
+ /// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 8])
626
+ /// FI
627
+ /// ELSE
628
+ /// dst.fp8[i] := 0
629
+ /// FI
630
+ /// ENDFOR
631
+ /// \endcode
632
+ ///
633
+ /// \headerfile <immintrin.h>
634
+ ///
635
+ /// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
636
+ ///
637
+ /// \param __U
638
+ /// A 16-bit zeroing mask.
639
+ /// \param __A
640
+ /// A 128-bit vector of [8 x fp16].
641
+ /// \param __B
642
+ /// A 128-bit vector of [8 x fp16].
643
+ /// \returns
644
+ /// A 128-bit vector of [16 x fp8]. Lower 8 elements correspond to the
645
+ /// (converted) elements from \a __B; higher order elements correspond to the
646
+ /// (converted) elements from \a __A. If corresponding mask bit is not set, then
647
+ /// zero is taken instead.
552
648
static __inline__ __m128i __DEFAULT_FN_ATTRS128
553
649
_mm_maskz_cvtne2ph_pbf8 (__mmask16 __U , __m128h __A , __m128h __B ) {
554
650
return (__m128i )__builtin_ia32_selectb_128 (
555
651
(__mmask16 )__U , (__v16qi )_mm_cvtne2ph_pbf8 (__A , __B ),
556
652
(__v16qi )(__m128i )_mm_setzero_si128 ());
557
653
}
558
654
655
+ /// Convert two 256-bit vectors, \a __A and \a __B, containing packed
656
+ /// FP16 floating-point elements to a 256-bit vector
657
+ /// containing E5M2 FP8 elements.
658
+ ///
659
+ /// \code{.operation]
660
+ /// FOR i := 0 to 32
661
+ /// IF i < 16
662
+ /// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
663
+ /// ELSE
664
+ /// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 16])
665
+ /// FI
666
+ /// ENDFOR
667
+ /// \endcode
668
+ ///
669
+ /// \headerfile <immintrin.h>
670
+ ///
671
+ /// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
672
+ ///
673
+ /// \param __A
674
+ /// A 256-bit vector of [16 x fp16].
675
+ /// \param __B
676
+ /// A 256-bit vector of [16 x fp16].
677
+ /// \returns
678
+ /// A 256-bit vector of [32 x fp8]. Lower 16 elements correspond to the
679
+ /// (converted) elements from \a __B; higher order elements correspond to the
680
+ /// (converted) elements from \a __A.
559
681
static __inline__ __m256i __DEFAULT_FN_ATTRS256
560
682
_mm256_cvtne2ph_pbf8 (__m256h __A , __m256h __B ) {
561
683
return (__m256i )__builtin_ia32_vcvtne2ph2bf8_256 ((__v16hf )(__A ),
562
684
(__v16hf )(__B ));
563
685
}
564
686
687
+ /// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
688
+ /// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
689
+ /// Merging mask \a __U is used to determine if given element should be taken
690
+ /// from \a __W instead.
691
+ ///
692
+ /// \code{.operation]
693
+ /// FOR i := 0 to 32
694
+ /// IF __U[i]
695
+ /// dst.fp8[i] := __W.fp8[i]
696
+ /// ELSE
697
+ /// IF i < 16
698
+ /// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
699
+ /// ELSE
700
+ /// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 16])
701
+ /// FI
702
+ /// FI
703
+ /// ENDFOR
704
+ /// \endcode
705
+ ///
706
+ /// \headerfile <immintrin.h>
707
+ ///
708
+ /// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
709
+ ///
710
+ /// \param __W
711
+ /// A 256-bit vector of [32 x fp8].
712
+ /// \param __U
713
+ /// A 32-bit merging mask.
714
+ /// \param __A
715
+ /// A 256-bit vector of [16 x fp16].
716
+ /// \param __B
717
+ /// A 256-bit vector of [16 x fp16].
718
+ /// \returns
719
+ /// A 256-bit vector of [32 x fp8]. Lower 16 elements correspond to the
720
+ /// (converted) elements from \a __B; higher order elements correspond to the
721
+ /// (converted) elements from \a __A. If corresponding mask bit is set, then
722
+ /// element from \a __W is taken instead.
565
723
static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_cvtne2ph_pbf8 (
566
724
__m256i __W , __mmask32 __U , __m256h __A , __m256h __B ) {
567
725
return (__m256i )__builtin_ia32_selectb_256 (
568
726
(__mmask16 )__U , (__v32qi )_mm256_cvtne2ph_pbf8 (__A , __B ), (__v32qi )__W );
569
727
}
570
728
729
+ /// Convert two 256-bit vectors, \a __A and \a __B, containing packed FP16
730
+ /// floating-point elements to a 256-bit vector containing E5M2 FP8 elements.
731
+ /// Zeroing mask \a __U is used to determine if given element should be zeroed
732
+ /// instead.
733
+ ///
734
+ /// \code{.operation}
735
+ /// FOR i := 0 to 31
736
+ /// IF __U[i]
737
+ /// IF i < 16
738
+ /// dst.fp8[i] := convert_fp16_to_fp8(__B.fp16[i])
739
+ /// ELSE
740
+ /// dst.fp8[i] := convert_fp16_to_fp8(__A.fp16[i - 16])
741
+ /// FI
742
+ /// ELSE
743
+ /// dst.fp8[i] := 0
744
+ /// FI
745
+ /// ENDFOR
746
+ /// \endcode
747
+ ///
748
+ /// \headerfile <immintrin.h>
749
+ ///
750
+ /// This intrinsic corresponds to the \c VCVTNE2PH2BF8 instruction.
751
+ ///
752
+ /// \param __U
753
+ /// A 32-bit zeroing mask.
754
+ /// \param __A
755
+ /// A 256-bit vector of [16 x fp16].
756
+ /// \param __B
757
+ /// A 256-bit vector of [16 x fp16].
758
+ /// \returns
759
+ /// A 256-bit vector of [32 x fp8]. Lower 16 elements correspond to the
760
+ /// (converted) elements from \a __B; higher order elements correspond to the
761
+ /// (converted) elements from \a __A. If corresponding mask bit is not set,
762
+ /// then zero is taken instead.
571
763
static __inline__ __m256i __DEFAULT_FN_ATTRS256
572
764
_mm256_maskz_cvtne2ph_pbf8 (__mmask32 __U , __m256h __A , __m256h __B ) {
573
765
return (__m256i )__builtin_ia32_selectb_256 (
0 commit comments