[SYCL][Matrix] Add GNR matrix combinations to spec and runtime query (#11867)

dkhaldi · web-flow · commit c0291b2bf18e · 2023-11-14T11:46:50.000+01:00
As a minor addition, I also added tf32 combination to PVC as part of
this PR
diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
@@ -855,24 +855,32 @@ XMX hardware. Note that these can be returned using
 
 ==== Intel AMX Supported Combinations
 This is currently available in devices with the architecture
-`architecture::intel_cpu_spr`. In this architecture's implementation,
-the type of the C matrix must be the same as the type of the D
-matrix. Therefore, that common type is shown in a single column in the
-table below.
+`architecture::intel_cpu_spr` and `architecture::intel_cpu_gnr`. In
+these architectures' implementations, the type of the C matrix must be
+the same as the type of the D matrix. Therefore, that common type is
+shown in a single column in the table below.
 
 [frame="none",options="header"]
 |======================
-| A type | B type | C and D type | M | N | K
+| A type | B type | C and D type | M | N | K | device
 | `matrix_type::uint8`  | `matrix_type::uint8` |
 `matrix_type::sint32`  |  +<=+ 16 |  +<=+ 16 |  +<=+ 64
+|`architecture::intel_cpu_spr`, `architecture::intel_cpu_gnr`
 | `matrix_type::uint8`  | `matrix_type::sint8` |
 `matrix_type::sint32`  |  +<=+ 16 |  +<=+ 16 |  +<=+ 64
+|`architecture::intel_cpu_spr`, `architecture::intel_cpu_gnr`
 | `matrix_type::sint8`  | `matrix_type::uint8` |
 `matrix_type::sint32`  |  +<=+ 16 |  +<=+ 16 |  +<=+ 64
+|`architecture::intel_cpu_spr`, `architecture::intel_cpu_gnr`
 | `matrix_type::sint8`  | `matrix_type::sint8` |
 `matrix_type::sint32`  |  +<=+ 16 |  +<=+ 16 |  +<=+ 64
+|`architecture::intel_cpu_spr`, `architecture::intel_cpu_gnr`
 |  `matrix_type::bf16`       |  `matrix_type::bf16`   |
 `matrix_type::fp32`   |  +<=+ 16 |  +<=+ 16   |  +<=+ 32
+|`architecture::intel_cpu_spr`, `architecture::intel_cpu_gnr`
+|  `matrix_type::fp16`       |  `matrix_type::fp16`   |
+`matrix_type::fp32`   |  +<=+ 16 |  +<=+ 16   |  +<=+ 32
+|`architecture::intel_cpu_gnr`
 |======================
 
 ==== Intel XMX Supported Combinations
@@ -911,6 +919,9 @@ architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`
 `matrix_type::fp32`   .2+|  +<=+ 8 |  16   .2+|  16 |
 `architecture::intel_gpu_pvc` |8| `architecture::intel_gpu_dg2_g10,
 architecture::intel_gpu_dg2_g11, architecture::intel_gpu_dg2_g12`
+|  `matrix_type::tf32`       |  `matrix_type::tf32`   |
+`matrix_type::fp32`   |  +<=+ 8 |  16   |  8 |
+`architecture::intel_gpu_pvc`
 |======================
 
 ==== Nvidia Tensor Cores Supported Combinations
@@ -1011,7 +1022,7 @@ be used. An attempt to run the compiled code on an unsupported architecture will
 operations support
 |4   |2022-08-25 |Dounia Khaldi |Update the matrix spec by adding the
 new matrix use parameter and remove reference to the AOT AMX initial
-implementation 
+implementation
 |5   |2022-11-07 |Dounia Khaldi |Update the matrix spec by making it
 portable across Intel AMX, Intel XMX and Nvidia Tensor Cores, and move
 the Intel-specifics to a separate extension document
@@ -1020,4 +1031,6 @@ type, runtime query, and supported combinations appendix for Intel AMX
 and Intel XMX
 |7   |2023-04-11 |Jack Kirk |Add Nvidia Tensor Cores supported combinations
 |8   |2023-10-05 |Mahmoud Moadeli |Add AMD Matrix Core supported combinations
+|9   |2023-11-13 |Dounia Khaldi |Add Granite Rapids Intel AMX
+supported combinations
 |======================
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc
@@ -100,6 +100,7 @@ namespace sycl::ext::oneapi::experimental {
 enum class architecture : /* unspecified */ {
   x86_64,
   intel_cpu_spr,
+  intel_cpu_gnr,
   intel_gpu_bdw,
   intel_gpu_skl,
   intel_gpu_kbl,
@@ -202,6 +203,12 @@ of these enumerators, and it provides a brief description of their meanings.
 enumeration is currently limited. See the section "Limitations with
 the experimental version" for details.
 
+|`intel_cpu_gnr`
+|-
+|Intel Xeon processor codenamed Granite Rapids. The utility of this
+enumeration is currently limited. See the section "Limitations with
+the experimental version" for details.
+
 |`intel_gpu_bdw`
 |-
 |Broadwell Intel graphics architecture.
@@ -596,11 +603,11 @@ feature, the application must be compiled in ahead-of-time (AOT) mode using
 description of the `-fsycl-targets` option.  These are the target names of the
 form "intel_gpu_*", "nvidia_gpu_*", or "amd_gpu_*".
 
-The architecture enumeration `intel_cpu_spr` does not currently work
-with any of the APIs described in this extension. It cannot be used
-with the `if_architecture_is` function, the
-`device::ext_oneapi_architecture_is` function, or the
-`info::device::architecture` query descriptor. It currently exists
+The architecture enumerations `intel_cpu_spr` and `intel_cpu_gnr` do
+not currently work with any of the APIs described in this
+extension. They cannot be used with the `if_architecture_is` function,
+the `device::ext_oneapi_architecture_is` function, or the
+`info::device::architecture` query descriptor. They currently exist
 only for use with the
 link:sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc[sycl_ext_oneapi_matrix]
 extension.
diff --git a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp
@@ -15,6 +15,7 @@ namespace ext::oneapi::experimental {
 enum class architecture {
   x86_64,
   intel_cpu_spr,
+  intel_cpu_gnr,
   intel_gpu_bdw,
   intel_gpu_skl,
   intel_gpu_kbl,
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
@@ -622,7 +622,9 @@ struct get_device_info_impl<range<Dimensions>,
 // This macro is only for Intel CPU architectures
 // TODO: extend the macro with other CPU architectures when they will be added
 // to ext_oneapi_device_architecture
-#define INTEL_CPU_ARCHES(X) X(8, oneapi_exp_arch::intel_cpu_spr)
+#define INTEL_CPU_ARCHES(X)                                                    \
+  X(8, oneapi_exp_arch::intel_cpu_spr)                                         \
+  X(9, oneapi_exp_arch::intel_cpu_gnr)
 
 #define CMP_NVIDIA_AMD(s, i)                                                   \
   if (strcmp(s, arch) == 0)                                                    \
@@ -732,6 +734,21 @@ struct get_device_info_impl<
           {16, 16, 32, 0, 0, 0, matrix_type::bf16, matrix_type::bf16,
            matrix_type::fp32, matrix_type::fp32},
       };
+    else if (architecture::intel_cpu_gnr == DeviceArch)
+      return {
+          {16, 16, 64, 0, 0, 0, matrix_type::uint8, matrix_type::uint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {16, 16, 64, 0, 0, 0, matrix_type::uint8, matrix_type::sint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {16, 16, 64, 0, 0, 0, matrix_type::sint8, matrix_type::uint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {16, 16, 64, 0, 0, 0, matrix_type::sint8, matrix_type::sint8,
+           matrix_type::sint32, matrix_type::sint32},
+          {16, 16, 32, 0, 0, 0, matrix_type::bf16, matrix_type::bf16,
+           matrix_type::fp32, matrix_type::fp32},
+          {16, 16, 32, 0, 0, 0, matrix_type::fp16, matrix_type::fp16,
+           matrix_type::fp32, matrix_type::fp32},
+      };
     else if (architecture::intel_gpu_pvc == DeviceArch)
       return {
           {8, 0, 0, 0, 16, 32, matrix_type::uint8, matrix_type::uint8,
@@ -746,6 +763,8 @@ struct get_device_info_impl<
            matrix_type::fp32, matrix_type::fp32},
           {8, 0, 0, 0, 16, 16, matrix_type::bf16, matrix_type::bf16,
            matrix_type::fp32, matrix_type::fp32},
+          {8, 0, 0, 0, 16, 8, matrix_type::tf32, matrix_type::tf32,
+           matrix_type::fp32, matrix_type::fp32},
       };
     else if ((architecture::intel_gpu_dg2_g10 == DeviceArch) ||
              (architecture::intel_gpu_dg2_g11 == DeviceArch) ||