@@ -619,3 +619,123 @@ void test_cvt_scalef32_pk_f32_bf8(global float2* out, unsigned int src, float sc
619
619
* out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8 (src , scale , true);
620
620
* out = __builtin_amdgcn_cvt_scalef32_pk_f32_bf8 (src , scale , false);
621
621
}
622
+
623
+ // CHECK-LABEL: @test_cvt_scalef32_pk_fp8_f16(
624
+ // CHECK-NEXT: entry:
625
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
626
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
627
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
628
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
629
+ // CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
630
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
631
+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
632
+ // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
633
+ // CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
634
+ // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
635
+ // CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
636
+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
637
+ // CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
638
+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
639
+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
640
+ // CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
641
+ // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
642
+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
643
+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
644
+ // CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
645
+ // CHECK-NEXT: ret void
646
+ //
647
+ void test_cvt_scalef32_pk_fp8_f16 (global short2 * out , half2 src , float scale ) // Codegen test: invokes the pk_fp8_f16 builtin twice, differing only in the final boolean argument.
648
+ {
649
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16 (* out , src , scale , true); // *out is loaded as the first operand and then overwritten with the result; the expected IR passes i1 true.
650
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp8_f16 (* out , src , scale , false); // Same call with the final argument false; *out is reloaded, so it sees the value stored by the previous statement.
651
+ }
652
+
653
+ // CHECK-LABEL: @test_cvt_scalef32_pk_fp8_bf16(
654
+ // CHECK-NEXT: entry:
655
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
656
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
657
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
658
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
659
+ // CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
660
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
661
+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
662
+ // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
663
+ // CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
664
+ // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
665
+ // CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
666
+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
667
+ // CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
668
+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
669
+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
670
+ // CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
671
+ // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
672
+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.fp8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
673
+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
674
+ // CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
675
+ // CHECK-NEXT: ret void
676
+ //
677
+ void test_cvt_scalef32_pk_fp8_bf16 (global short2 * out , bfloat2 src , float scale ) // Codegen test: invokes the pk_fp8_bf16 builtin twice, differing only in the final boolean argument.
678
+ {
679
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16 (* out , src , scale , true); // *out is loaded as the first operand and then overwritten with the result; the expected IR passes i1 true.
680
+ * out = __builtin_amdgcn_cvt_scalef32_pk_fp8_bf16 (* out , src , scale , false); // Same call with the final argument false; *out is reloaded, so it sees the value stored by the previous statement.
681
+ }
682
+
683
+ // CHECK-LABEL: @test_cvt_scalef32_pk_bf8_f16(
684
+ // CHECK-NEXT: entry:
685
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
686
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x half>, align 4, addrspace(5)
687
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
688
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
689
+ // CHECK-NEXT: store <2 x half> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
690
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
691
+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
692
+ // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
693
+ // CHECK-NEXT: [[TMP2:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
694
+ // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
695
+ // CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP1]], <2 x half> [[TMP2]], float [[TMP3]], i1 true)
696
+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
697
+ // CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
698
+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
699
+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
700
+ // CHECK-NEXT: [[TMP8:%.*]] = load <2 x half>, ptr addrspace(5) [[SRC_ADDR]], align 4
701
+ // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
702
+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.f16(<2 x i16> [[TMP7]], <2 x half> [[TMP8]], float [[TMP9]], i1 false)
703
+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
704
+ // CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
705
+ // CHECK-NEXT: ret void
706
+ //
707
+ void test_cvt_scalef32_pk_bf8_f16 (global short2 * out , half2 src , float scale ) // Codegen test: invokes the pk_bf8_f16 builtin twice, differing only in the final boolean argument.
708
+ {
709
+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16 (* out , src , scale , true); // *out is loaded as the first operand and then overwritten with the result; the expected IR passes i1 true.
710
+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf8_f16 (* out , src , scale , false); // Same call with the final argument false; *out is reloaded, so it sees the value stored by the previous statement.
711
+ }
712
+
713
+ // CHECK-LABEL: @test_cvt_scalef32_pk_bf8_bf16(
714
+ // CHECK-NEXT: entry:
715
+ // CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
716
+ // CHECK-NEXT: [[SRC_ADDR:%.*]] = alloca <2 x bfloat>, align 4, addrspace(5)
717
+ // CHECK-NEXT: [[SCALE_ADDR:%.*]] = alloca float, align 4, addrspace(5)
718
+ // CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr addrspace(5) [[OUT_ADDR]], align 8
719
+ // CHECK-NEXT: store <2 x bfloat> [[SRC:%.*]], ptr addrspace(5) [[SRC_ADDR]], align 4
720
+ // CHECK-NEXT: store float [[SCALE:%.*]], ptr addrspace(5) [[SCALE_ADDR]], align 4
721
+ // CHECK-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
722
+ // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP0]], align 4
723
+ // CHECK-NEXT: [[TMP2:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
724
+ // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
725
+ // CHECK-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP1]], <2 x bfloat> [[TMP2]], float [[TMP3]], i1 true)
726
+ // CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
727
+ // CHECK-NEXT: store <2 x i16> [[TMP4]], ptr addrspace(1) [[TMP5]], align 4
728
+ // CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
729
+ // CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(1) [[TMP6]], align 4
730
+ // CHECK-NEXT: [[TMP8:%.*]] = load <2 x bfloat>, ptr addrspace(5) [[SRC_ADDR]], align 4
731
+ // CHECK-NEXT: [[TMP9:%.*]] = load float, ptr addrspace(5) [[SCALE_ADDR]], align 4
732
+ // CHECK-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.amdgcn.cvt.scalef32.pk.bf8.bf16(<2 x i16> [[TMP7]], <2 x bfloat> [[TMP8]], float [[TMP9]], i1 false)
733
+ // CHECK-NEXT: [[TMP11:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
734
+ // CHECK-NEXT: store <2 x i16> [[TMP10]], ptr addrspace(1) [[TMP11]], align 4
735
+ // CHECK-NEXT: ret void
736
+ //
737
+ void test_cvt_scalef32_pk_bf8_bf16 (global short2 * out , bfloat2 src , float scale ) // Codegen test: invokes the pk_bf8_bf16 builtin twice, differing only in the final boolean argument.
738
+ {
739
+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16 (* out , src , scale , true); // *out is loaded as the first operand and then overwritten with the result; the expected IR passes i1 true.
740
+ * out = __builtin_amdgcn_cvt_scalef32_pk_bf8_bf16 (* out , src , scale , false); // Same call with the final argument false; *out is reloaded, so it sees the value stored by the previous statement.
741
+ }
0 commit comments