
[BenchGC] attach DLTI for mlir module #312


Merged: 56 commits, Sep 10, 2024

Commits
4772f9a
introduce benchgc for correctness check
WangJialei-A Aug 16, 2024
2124dc2
Merge branch 'main' into xurui/merge_bench_new
xurui1995 Aug 26, 2024
1f5b6ba
merge code
xurui1995 Aug 26, 2024
2cccd04
introduce benchgc for correctness check
WangJialei-A Aug 16, 2024
1cabc2c
remove print
xurui1995 Aug 27, 2024
c3c5441
merge code
xurui1995 Aug 27, 2024
e316a98
fix
xurui1995 Aug 27, 2024
841e81f
simplify
xurui1995 Aug 27, 2024
42a50c2
merge main
xurui1995 Aug 27, 2024
b16a9b6
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Aug 27, 2024
1e8b074
fix format
xurui1995 Aug 27, 2024
1c20184
fix format
xurui1995 Aug 27, 2024
69f2e94
reorg the pattern dir
xurui1995 Aug 27, 2024
8d0953c
improve
xurui1995 Aug 27, 2024
e05d5f0
fix format
xurui1995 Aug 27, 2024
e96d310
fix
xurui1995 Aug 27, 2024
9d03541
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Aug 27, 2024
44d591d
add example
xurui1995 Aug 27, 2024
bae0e8b
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Aug 27, 2024
bc7262d
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Aug 27, 2024
420e3de
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Aug 28, 2024
7923184
fix some comments
xurui1995 Aug 29, 2024
8e85b80
fix
xurui1995 Aug 29, 2024
56f2de6
fix
xurui1995 Aug 29, 2024
b87b2d4
add readme
xurui1995 Aug 29, 2024
8f09ed0
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Aug 29, 2024
4726c81
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Aug 30, 2024
b2597b9
add mlp filling
xurui1995 Sep 2, 2024
248dd12
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Sep 2, 2024
4392974
fix mlp
xurui1995 Sep 2, 2024
3566b83
add case
xurui1995 Sep 2, 2024
8deb44c
remove old bench code
xurui1995 Sep 2, 2024
a0641e9
update readme
xurui1995 Sep 2, 2024
5372bf0
Merge branch 'main' into xurui/merge_into_benchgc
xurui1995 Sep 2, 2024
2448b76
add attch dlti
xurui1995 Sep 2, 2024
d614c40
skip attach dlti when it was already added
xurui1995 Sep 2, 2024
9db3237
update readme
xurui1995 Sep 2, 2024
d714cf0
update readme
xurui1995 Sep 2, 2024
6f34f0f
fix ci
xurui1995 Sep 2, 2024
8cb976f
fix
xurui1995 Sep 2, 2024
15d14f4
fix
xurui1995 Sep 2, 2024
82521bb
fix env name
xurui1995 Sep 3, 2024
fa59611
add test print
xurui1995 Sep 3, 2024
b4aecff
add test print2
xurui1995 Sep 3, 2024
20a0e28
test ci
xurui1995 Sep 3, 2024
67d155c
test ci cpu
xurui1995 Sep 3, 2024
1e7c835
add new cpuinfo
xurui1995 Sep 5, 2024
ed60ede
merge main
xurui1995 Sep 5, 2024
21860b4
fix
xurui1995 Sep 5, 2024
6db244e
fix
xurui1995 Sep 5, 2024
2d8e5ed
Merge branch 'main' into xurui/add_dlti
xurui1995 Sep 5, 2024
1e09121
Merge branch 'main' into xurui/add_dlti
xurui1995 Sep 9, 2024
2ab9a48
fix
xurui1995 Sep 9, 2024
072eed6
Merge branch 'main' into xurui/add_dlti
xurui1995 Sep 9, 2024
e7bfb70
Merge branch 'main' into xurui/add_dlti
xurui1995 Sep 9, 2024
9f5cb89
Merge branch 'main' into xurui/add_dlti
xurui1995 Sep 10, 2024
9 changes: 9 additions & 0 deletions python/CMakeLists.txt
@@ -49,6 +49,8 @@ declare_mlir_python_sources(GcPythonSources.Common
__init__.py
graph_compiler.py
dialects/__init__.py
tools/__init__.py
tools/cpuinfo.py
# init hooks
_mlir_libs/_site_initialize_0.py
)
@@ -86,6 +88,13 @@ declare_mlir_python_extension(GcPythonSources.Extension
GcCAPI
)

declare_mlir_python_extension(GcPythonSources.CpuInfoExtension
MODULE_NAME _cpuinfo
ADD_TO_PARENT GcPythonSources
SOURCES
CPUInfo.cpp
)

################################################################################
# Common CAPI
################################################################################
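Once built, the `_cpuinfo` extension lands under the package's `_mlir_libs` directory. A minimal sketch of importing it directly (values shown are illustrative; the memoized wrappers in `gc_mlir.tools.cpuinfo` further down are the intended entry point):

```python
# Minimal sketch: query the _cpuinfo extension directly once it is built
# and gc_mlir is importable; printed values are illustrative.
from gc_mlir._mlir_libs import _cpuinfo

print(_cpuinfo.get_cache_sizes())       # e.g. [49152, 2097152, 110100480]
print(_cpuinfo.get_max_vector_width())  # e.g. 512 on an AVX-512 machine
```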
87 changes: 87 additions & 0 deletions python/CPUInfo.cpp
@@ -0,0 +1,87 @@
/*
* Copyright (C) 2024 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
*/

#include "mlir/Bindings/Python/PybindAdaptors.h"

#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86)
// x86 or x86_64 specific code
void cpuid(int info[4], int leaf, int subleaf) {
__asm__ __volatile__("cpuid"
: "=a"(info[0]), "=b"(info[1]), "=c"(info[2]),
"=d"(info[3])
: "a"(leaf), "c"(subleaf));
}

std::vector<int> getCacheSizes() {
int info[4];
cpuid(info, 0, 0);
int nIds = info[0];
int caches[3] = {};
for (int i = 0; i <= nIds; ++i) {
cpuid(info, 4, i);
int cacheType = info[0] & 0x1F;
if (cacheType == 0) {
break;
}
if (cacheType == 2) {
// skip instruction cache
continue;
}
int cacheLevel = (info[0] >> 5) & 0x7;
int cacheLinesPerTag = ((info[1] >> 0) & 0xFFF) + 1;
int cacheAssociativity = ((info[1] >> 12) & 0x3FF) + 1;
int cachePartitions = ((info[1] >> 22) & 0x3FF) + 1;
int cacheSets = info[2] + 1;
int cacheSize =
cacheLinesPerTag * cacheAssociativity * cachePartitions * cacheSets;
if (cacheLevel >= 1 && cacheLevel <= 3) {
caches[cacheLevel - 1] = cacheSize;
}
}
return std::vector<int>(std::begin(caches), std::end(caches));
}

bool isFeatureSupported(int function_id, int register_idx, int bit) {
int info[4];
cpuid(info, function_id, 0);
return (info[register_idx] & (1 << bit)) != 0;
}

int getMaxVectorWidth() {
if (isFeatureSupported(7, 1, 16)) { // Check for AVX-512F support
return 512;
} else if (isFeatureSupported(1, 2, 28)) { // Check for AVX support
return 256;
} else if (isFeatureSupported(1, 3, 25)) { // Check for SSE support
return 128;
}
return 64; // Default to 64 if none of the above features are supported
}
#else
std::vector<int> getCacheSizes() { return {}; }

int getMaxVectorWidth() { return 0; }
#endif

PYBIND11_MODULE(_cpuinfo, m) {
m.doc() = "Graph-compiler MLIR Python binding";
m.def("get_cache_sizes", &getCacheSizes, "Get CPU L1,L2,L3 cache size");
m.def("get_max_vector_width", &getMaxVectorWidth,
"Get CPU supported max vector width");
}
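As a sanity check on the leaf-4 decoding above: each field is stored minus one in the registers, and the cache size is the product of the four decoded fields. (The `cacheAssociativity`/`cachePartitions` names appear swapped relative to the Intel SDM field order, which is harmless here since only their product is used.) A hedged worked example with illustrative register values for a 48 KiB, 12-way L1 data cache:

```python
# Hedged worked example of the CPUID leaf-4 size formula; the raw field
# values are illustrative, per the Intel SDM layout.
line_size  = 0x3F + 1   # EBX[11:0]  -> 64-byte lines
partitions = 0x0 + 1    # EBX[21:12] -> 1 partition
ways       = 0xB + 1    # EBX[31:22] -> 12-way associative
sets       = 0x3F + 1   # ECX        -> 64 sets

assert line_size * partitions * ways * sets == 48 * 1024  # 49152 bytes
```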
7 changes: 7 additions & 0 deletions python/gc_mlir/tools/__init__.py
@@ -0,0 +1,7 @@
# ===-- __init__.py - init ------------------------------------*- Python -*-===#
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===-----------------------------------------------------------------------===#
26 changes: 26 additions & 0 deletions python/gc_mlir/tools/cpuinfo.py
@@ -0,0 +1,26 @@
# ===-- cpuinfo.py - Getting the CPU info ---------------------*- Python -*-===#
#
# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
# ===-----------------------------------------------------------------------===#

from .._mlir_libs import _cpuinfo

_cache_sizes = []
_max_vector_width = None


def get_cache_sizes():
global _cache_sizes
if not _cache_sizes:
_cache_sizes = _cpuinfo.get_cache_sizes()
return _cache_sizes


def get_max_vector_width():
global _max_vector_width
if _max_vector_width is None:
_max_vector_width = _cpuinfo.get_max_vector_width()
return _max_vector_width
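A short usage sketch of these memoized wrappers (values are illustrative; on non-x86 builds `get_cache_sizes()` returns an empty list per the C++ fallback above):

```python
# Hedged usage sketch: results are computed once and reused on later calls.
from gc_mlir.tools import cpuinfo

sizes = cpuinfo.get_cache_sizes()       # [L1, L2, L3] in bytes on x86
width = cpuinfo.get_max_vector_width()  # e.g. 512, 256, or 128 (bits)
if len(sizes) == 3:
    l1, l2, l3 = sizes
    print(f"L1={l1} L2={l2} L3={l3} max_vector_width={width}")
```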
8 changes: 8 additions & 0 deletions test/benchgc/README.md
@@ -8,6 +8,8 @@ Benchgc is a tool used to verify the correctness and performance of the graph compiler
* python >= 3.10
* torch >= 2.2
* Enable the MLIR Python binding; refer to [`python/README.md`](../../python/README.md) for details
* Set the environment variables:
* OMP_NUM_THREADS [int]: the `num_threads` value for the DLTI attr, default = 1

## Build
There are two ways to use benchgc
@@ -107,6 +109,12 @@ module {
| Pytorch tensor dump | F | dump filename |
| Benchdnn driver | D | driver_name[:driver filling parameter]* |

### --cpu_cache_sizes, --max_vector_width
* BenchGC automatically obtains the target info and attaches the DLTI attr to the IR
* If the system info obtained by BenchGC is not accurate, you can specify the relevant attributes through these options
* --cpu_cache_sizes: CPU cache sizes in bytes, format: L1:L2:L3, example: `--cpu_cache_sizes 49152:2097152:110100480`
* --max_vector_width: the maximum vector register width in bits available on the CPU, example: `--max_vector_width 512`

#### Benchdnn driver filling

| driver_name | driver filling parameter |
2 changes: 1 addition & 1 deletion test/benchgc/setup.py
@@ -26,5 +26,5 @@
packages=setuptools.find_packages("src")
+ setuptools.find_namespace_packages("../../python_packages/gc_mlir_core"),
package_data={"gc_mlir": ["_mlir_libs/*.so"]},
install_requires=["torch", "numpy", "ml_dtypes"],
install_requires=["torch", "numpy"],
)
16 changes: 16 additions & 0 deletions test/benchgc/src/benchgc/__main__.py
@@ -124,6 +124,20 @@ def add_common_options(parser: argparse.ArgumentParser):
help="if we need print the ir during the pass-pipeline",
)

parser.add_argument(
"--cpu_cache_sizes",
required=False,
help="set the cpu cache sizes, format: L1:L2:L3",
type=str,
)

parser.add_argument(
"--max_vector_width",
required=False,
help="set the cpu max_vector_width",
type=int,
)

if parser.parse_known_args()[0].driver == "linalg":
parser.add_argument(
"--cast",
@@ -269,6 +283,8 @@ def get_module_and_args(flags: argparse.Namespace):
for arg in args:
arg.print_verbose(flags.verbose)

benchgc.mlir.util.attach_dlti(flags, module)

if flags.verbose >= benchgc.util.MODULE_VERBOSE:
print(module)
return module, args
49 changes: 49 additions & 0 deletions test/benchgc/src/benchgc/mlir/util.py
@@ -14,12 +14,15 @@
# limitations under the License.
################################################################################

import argparse
import ctypes
import os
from typing import Any, List

import torch
from gc_mlir import ir
from gc_mlir.dialects import arith, func, memref
from gc_mlir.tools import cpuinfo


# calling python binding consumes a lot of time e.g. get_name()
@@ -152,3 +155,49 @@ def get_kernel_func_from_module(
if type(f) is func.FuncOp and str(f.name).strip('"') == func_name:
return f
raise ValueError("can not find the entry function")


def attach_dlti(flags: argparse.Namespace, module: ir.Module):
# the module already has a DLTI attr
if "dlti.target_system_spec" in module.operation.attributes:
return
if flags.cpu_cache_sizes:
caches_sizes = [int(x) for x in flags.cpu_cache_sizes.strip().split(":")]
else:
caches_sizes = cpuinfo.get_cache_sizes()
if not caches_sizes or len(caches_sizes) != 3:
print(
"Failed to get CPU cache sizes, please add them manually via --cpu_cache_sizes"
)
return
if flags.max_vector_width:
max_vector_width = flags.max_vector_width
else:
max_vector_width = cpuinfo.get_max_vector_width()
if not max_vector_width:
print(
"Failed to get CPU max vector width, please add it manually via --max_vector_width"
)
return
l1_data_cache_size, l2_cache_size, l3_cache_size = caches_sizes
if "OMP_NUM_THREADS" not in os.environ:
print("OMP_NUM_THREADS is not found, using 1 as default")
num_threads = os.environ.get("OMP_NUM_THREADS", 1)

dlti_template = f"""
module attributes {{
dlti.target_system_spec = #dlti.target_system_spec<
"CPU": #dlti.target_device_spec<
#dlti.dl_entry<"L1_cache_size_in_bytes", {l1_data_cache_size} : ui32>,
#dlti.dl_entry<"L2_cache_size_in_bytes", {l2_cache_size} : ui64>,
#dlti.dl_entry<"L3_cache_size_in_bytes", {l3_cache_size} : ui64>,
#dlti.dl_entry<"num_threads", {num_threads} : i32>,
#dlti.dl_entry<"max_vector_width", {max_vector_width} : i64>>
>}} {{}}
"""
print(dlti_template)
with module.context:
template_module = ir.Module.parse(dlti_template)
module.operation.attributes["dlti.target_system_spec"] = (
template_module.operation.attributes["dlti.target_system_spec"]
)
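A hedged sketch of driving `attach_dlti` directly, mirroring the `__main__.py` call site; it assumes the gc_mlir bindings register the DLTI dialect (so the parse inside the helper succeeds), and the flag values are illustrative overrides:

```python
# Hedged usage sketch; assumes the gc_mlir bindings load the DLTI dialect
# and that OMP_NUM_THREADS is set (otherwise the helper defaults to 1).
import argparse

import benchgc.mlir.util
from gc_mlir import ir

flags = argparse.Namespace(
    cpu_cache_sizes="49152:2097152:110100480",  # L1:L2:L3 override
    max_vector_width=512,                       # skip auto-detection
)
with ir.Context():
    module = ir.Module.parse("module {}")
    benchgc.mlir.util.attach_dlti(flags, module)
    assert "dlti.target_system_spec" in module.operation.attributes
```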