Update ExecuTorch for XNNPACK 87ee0b4 #4916

Merged: 1 commit on Aug 29, 2024

2 changes: 1 addition & 1 deletion .gitmodules
@@ -21,7 +21,7 @@
url = https://github.com/Maratyszcza/FXdiv.git
[submodule "backends/xnnpack/third-party/XNNPACK"]
path = backends/xnnpack/third-party/XNNPACK
url = https://github.com/digantdesai/XNNPACK.git
url = https://github.com/google/XNNPACK.git
[submodule "backends/xnnpack/third-party/cpuinfo"]
path = backends/xnnpack/third-party/cpuinfo
url = https://github.com/pytorch/cpuinfo.git
4 changes: 4 additions & 0 deletions backends/xnnpack/cmake/Dependencies.cmake
@@ -36,6 +36,10 @@ set(XNNPACK_ENABLE_AVXVNNI
OFF
CACHE BOOL ""
)
set(XNNPACK_ENABLE_KLEIDIAI
OFF
CACHE BOOL ""
)
add_subdirectory("${XNNPACK_SOURCE_DIR}")
include_directories(SYSTEM ${XNNPACK_INCLUDE_DIR})
list(APPEND xnnpack_third_party XNNPACK)
73 changes: 66 additions & 7 deletions backends/xnnpack/runtime/XNNCompiler.cpp
@@ -21,6 +21,25 @@ namespace executor {
namespace xnnpack {
namespace delegate {

/*
* Provide compile-time allocation.
*/
class CompileAllocator {
public:
/*
* Allocate memory which will be automatically freed at the end
* of the compilation process.
*/
void* allocateTemporary(size_t size) {
auto mem = new uint8_t[size];
temporaries_.emplace_back(mem);
return mem;
}

private:
std::vector<std::unique_ptr<uint8_t[]>> temporaries_;
};

// Flatbuffer types
using ValuePtr = const fb_xnnpack::XValue*;
using NodePtr = const fb_xnnpack::XNode*;
@@ -35,6 +54,23 @@ using DefineNodeFunc = Error (*)(
const std::unordered_map<uint32_t, uint32_t>&,
NodePtr) noexcept;

/*
Convert a tensor from fp32 to bf16.
*/
void convertF32TensorToBF16(
const float* f32_data,
uint16_t* bf16_data_out,
size_t numel) {
for (auto i = 0u; i < numel; i++) {
// Adjust the f32 value such that it rounds properly after truncation.
// Constant factor scales 1+2^-8 to 1+2^-7.
float f32_adjusted = f32_data[i] * 1.00389105f;
uint32_t f32_bits;
memcpy(&f32_bits, &f32_adjusted, sizeof(float));
bf16_data_out[i] = static_cast<uint16_t>(f32_bits >> 16);
}
}

/*
Gets the output min and output max for a given node operator
*/
@@ -152,7 +188,8 @@ Error defineTensor(
GraphPtr flatbuffer_graph,
const uint8_t* constant_data_ptr,
std::vector<uint32_t>& input_ids,
std::vector<uint32_t>& output_ids) {
std::vector<uint32_t>& output_ids,
CompileAllocator& allocator) {
const fb_xnnpack::XNNTensorValue* tensor_value = nullptr;
const fb_xnnpack::XNNQuantizedTensorValue* qtensor_value = nullptr;

@@ -356,12 +393,31 @@ Error defineTensor(
size_t group_size = qparams->group_size();
size_t output_channels = tensor_value->dims()->Get(0);
size_t input_channels = tensor_value->dims()->Get(1);

const uint16_t* scale_data = nullptr;
uint32_t scale_numel = 0;

// Block scales are preferably serialized as bf16 but can also be
// serialized as fp32 for backwards compatibility.
if (qparams->scale_bf16() != nullptr) {
scale_data =
static_cast<const uint16_t*>(qparams->scale_bf16()->data());
scale_numel = qparams->scale_bf16()->size();
} else {
// Read fp32 scales, convert to bf16.
auto conv_buffer = static_cast<uint16_t*>(allocator.allocateTemporary(
qparams->scale()->size() * sizeof(uint16_t)));
scale_numel = qparams->scale()->size();
convertF32TensorToBF16(
qparams->scale()->data(), conv_buffer, scale_numel);
scale_data = conv_buffer;
}

ET_CHECK_OR_RETURN_ERROR(
qparams->scale()->size() ==
output_channels * input_channels / group_size,
scale_numel == output_channels * input_channels / group_size,
Internal,
"scale size %zu != output channels %zu * group size %zu",
(size_t)qparams->scale()->size(),
static_cast<size_t>(scale_numel),
output_channels,
group_size);
int32_t zero_point =
@@ -370,18 +426,19 @@
Debug,
"define quant tensor (per channel group): buffer_ptr: %p, scale.numel(): %u, channel_dim: %u, grpup_size: %zu, output_channels: %zu, dtype: %u, zero_point: %d, datatype: %d\n",
buffer_ptr,
qparams->scale()->size(),
scale_numel,
qparams->channel_dim(),
group_size,
output_channels,
datatype,
zero_point,
datatype);

status = xnn_define_blockwise_quantized_tensor_value(
/*subgraph=*/subgraph_ptr,
/*datatype=*/datatype,
/*zero_point=*/zero_point,
/*scale=*/qparams->scale()->data(),
/*scale=*/scale_data,
/*num_dims=*/tensor_value->num_dims(),
/*channel_dim=*/qparams->channel_dim(),
/*block_size=*/qparams->group_size(),
@@ -1617,6 +1674,7 @@ ET_NODISCARD Error XNNCompiler::compileModel(
Result<XNNHeader> header = XNNHeader::Parse(buffer_pointer, num_bytes);
const uint8_t* flatbuffer_data = nullptr;
const uint8_t* constant_data = nullptr;
CompileAllocator compile_allocator;

// Header status can only either be Error::Ok or Error::NotFound
if (header.ok()) {
@@ -1688,7 +1746,8 @@ ET_NODISCARD Error XNNCompiler::compileModel(
flatbuffer_graph,
constant_data,
input_ids,
output_ids);
output_ids,
compile_allocator);

if (err != Error::Ok) {
return err;
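For readers skimming the diff: the core of the XNNCompiler.cpp change is that blockwise-quantized weight scales are now handed to XNNPACK as bf16, with legacy fp32 scales converted on the fly into memory owned by CompileAllocator. Below is a minimal, self-contained sketch of that conversion; the helper names f32_to_bf16/bf16_to_f32 and the main() driver are illustrative only, not part of the PR, but the pre-scaling constant is the same one convertF32TensorToBF16 uses so that dropping the low 16 bits approximates round-to-nearest instead of truncating toward zero.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Illustrative helper mirroring convertF32TensorToBF16: pre-scale so that
// keeping only the high 16 bits of the fp32 representation rounds rather
// than truncates. The constant maps 1+2^-8 to roughly 1+2^-7.
static std::uint16_t f32_to_bf16(float v) {
  float adjusted = v * 1.00389105f;
  std::uint32_t bits;
  std::memcpy(&bits, &adjusted, sizeof(bits));
  return static_cast<std::uint16_t>(bits >> 16);
}

// The inverse is exact: a bf16 value is just the high 16 bits of an fp32.
static float bf16_to_f32(std::uint16_t v) {
  std::uint32_t bits = static_cast<std::uint32_t>(v) << 16;
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}

int main() {
  const float scales[] = {0.0123f, 0.5f, 1.0f, 3.14159f};
  for (float s : scales) {
    std::uint16_t b = f32_to_bf16(s);
    std::printf("%.6f -> 0x%04x -> %.6f\n", s, static_cast<unsigned>(b), bf16_to_f32(b));
  }
  return 0;
}

Halving the scale storage this way lines up with the scale_bf16 field added to the serialization schemas below, and appears to be what the updated xnn_define_blockwise_quantized_tensor_value expects for its scale argument.
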
1 change: 1 addition & 0 deletions backends/xnnpack/serialization/runtime_schema.fbs
@@ -63,6 +63,7 @@ table PerChannelGroupQuant {
scale:[float];
channel_dim:int;
group_size:int;
scale_bf16:[ushort];
}

table XNNTensorValue {
1 change: 1 addition & 0 deletions backends/xnnpack/serialization/schema.fbs
@@ -48,6 +48,7 @@ table PerChannelGroupQuant {
scale:[float];
channel_dim:int;
group_size:int;
scale_bf16:[ushort];
}

table PerChannelQuant {
12 changes: 6 additions & 6 deletions backends/xnnpack/test/ops/linear.py
@@ -407,8 +407,8 @@ def test_qd8_per_channel_linear_parallel_and_sequential(self):
)
def test_qd8_fp32_per_token_weight_per_channel_group_int4(self):
M_sizes = [1, 2, 17, 31]
K_sizes = [8, 32, 64, 128]
bl_sizes = [8, 16, 16, 32]
K_sizes = [32, 32, 64, 128]
bl_sizes = [32, 32, 32, 64]
N_sizes = [2, 17, 92, 128]

for use_bias in [True, False]:
@@ -430,8 +430,8 @@ def test_qd8_fp32_per_token_weight_per_channel_group_int4(self):
)
def test_qd8_fp16_per_token_weight_per_channel_group_int4(self):
M_sizes = [1, 2, 17, 31]
K_sizes = [8, 32, 64, 128]
bl_sizes = [8, 16, 16, 32]
K_sizes = [32, 32, 64, 128]
bl_sizes = [32, 32, 32, 64]
N_sizes = [2, 17, 92, 128]

for use_bias in [True, False]:
@@ -602,8 +602,8 @@ def _test_groupwise_dq_linear(
use_bias: bool = False,
group_size: int = 8,
num_linears: int = 1,
atol: float = 1e-3,
rtol: float = 1e-3,
atol: float = 5e-3,
rtol: float = 5e-3,
):
quantize_(mod, int8_dynamic_activation_int4_weight(group_size=group_size))
unwrap_tensor_subclass(mod)
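A note on the new test shapes: the changed K_sizes/bl_sizes keep every block (group) size a multiple of 32 that evenly divides its K dimension, which appears to be what the updated XNNPACK blockwise int4 kernels require; this is an inference from the changed values, not a documented constraint. The loosened atol/rtol likewise looks consistent with the scales now being rounded to bf16 in XNNCompiler.cpp. A minimal check of that assumed invariant:

#include <cassert>
#include <cstddef>

int main() {
  // Shapes from the test_qd8_*_per_channel_group_int4 tests above.
  const std::size_t K_sizes[] = {32, 32, 64, 128};
  const std::size_t bl_sizes[] = {32, 32, 32, 64};
  for (std::size_t i = 0; i < 4; ++i) {
    assert(bl_sizes[i] % 32 == 0);          // assumed: block size is a multiple of 32
    assert(K_sizes[i] % bl_sizes[i] == 0);  // assumed: block size divides K
  }
  return 0;
}
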
2 changes: 1 addition & 1 deletion backends/xnnpack/third-party/XNNPACK
Submodule XNNPACK updated 13544 files
27 changes: 21 additions & 6 deletions backends/xnnpack/third-party/generate-xnnpack-wrappers.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3

from __future__ import print_function
from pathlib import Path
import collections
import os
import sys
@@ -36,8 +37,8 @@
"PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
"PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)",
@@ -46,7 +47,7 @@
# add non-prod microkernel sources here:
}

SRC_NAMES = set([
SRC_NAMES = {
"OPERATOR_SRCS",
"SUBGRAPH_SRCS",
"LOGGING_SRCS",
@@ -81,30 +82,42 @@
"PROD_AVX512F_MICROKERNEL_SRCS",
"PROD_AVX512SKX_MICROKERNEL_SRCS",
"PROD_AVX512VBMI_MICROKERNEL_SRCS",
"PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS",
"PROD_AVX512VNNI_MICROKERNEL_SRCS",
"PROD_AVX512VNNIGFNI_MICROKERNEL_SRCS",
"PROD_RVV_MICROKERNEL_SRCS",
"PROD_AVXVNNI_MICROKERNEL_SRCS",
"AARCH32_ASM_MICROKERNEL_SRCS",
"AARCH64_ASM_MICROKERNEL_SRCS",

# add non-prod microkernel sources here:
])
}

def handle_singleline_parse(line):
start_index = line.find("(")
end_index = line.find(")")
line = line[start_index+1:end_index]
key_val = line.split(" ")
return key_val[0], list(map(lambda x: x[4:], key_val[1:]))
return key_val[0], [x[4:] for x in key_val[1:]]

def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
print(f"Updating sources from {cmakefile}")
sources = collections.defaultdict(list)
with open(os.path.join(xnnpack_path, cmakefile)) as cmake:
lines = cmake.readlines()
i = 0
while i < len(lines):
line = lines[i]

if lines[i].startswith("INCLUDE"):
file, _ = handle_singleline_parse(line)
if file.startswith("cmake/gen/"):
path = Path(xnnpack_path) / "XNNPACK" / file
local_sources = update_sources(xnnpack_path, path.absolute().as_posix())
for k,v in local_sources.items():
if k in sources:
sources[k] = sources[k] + local_sources[k]
else:
sources[k] = local_sources[k]

if lines[i].startswith("SET") and "src/" in lines[i]:
name, val = handle_singleline_parse(line)
@@ -132,7 +145,7 @@ def gen_wrappers(xnnpack_path):
xnnpack_sources = collections.defaultdict(list)
sources = update_sources(xnnpack_path)

microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/microkernels.cmake")
microkernels_sources = update_sources(xnnpack_path, "XNNPACK/cmake/gen/microkernels.cmake")
for key in microkernels_sources:
sources[key] = microkernels_sources[key]

@@ -186,6 +199,8 @@ def gen_wrappers(xnnpack_path):


def main(argv):
print("Generating wrappers...")

if argv is None or len(argv) == 0:
gen_wrappers(".")
else:
28 changes: 1 addition & 27 deletions backends/xnnpack/third-party/xnnpack.buck.bzl
@@ -1,7 +1,6 @@
load("//third-party:glob_defs.bzl", "subdir_glob")
load(
":xnnpack_src_defs.bzl",
"JIT_SRCS",
"LOGGING_SRCS",
"OPERATOR_SRCS",
"SUBGRAPH_SRCS",
@@ -69,27 +68,6 @@ def define_xnnpack():
],
)

# @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode.
native.cxx_library(
name = "jit_memory",
srcs = JIT_SRCS,
headers = subdir_glob([
("XNNPACK/src", "**/*.h"),
]),
header_namespace = "",
compiler_flags = [
"-std=c++17",
],
preferred_linkage = "static",
preprocessor_flags = [
"-DXNN_LOG_LEVEL=0",
],
exported_deps = [
":clog",
":interface",
],
)

# @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode.
native.cxx_library(
name = "operators",
@@ -139,7 +117,6 @@ def define_xnnpack():
preferred_linkage = "static",
preprocessor_flags = [
"-DXNN_LOG_LEVEL=0",
"-DXNN_ENABLE_JIT=0",
"-DXNN_ENABLE_SPARSE=0",
"-DXNN_ENABLE_GEMM_M_SPECIALIZATION=0",
"-DXNN_ENABLE_MEMOPT",
@@ -1223,7 +1200,6 @@ def define_xnnpack():
]

ARM_XNNPACK_DEPS = [
":jit_memory",
":ukernels_armsimd32",
":ukernels_fp16arith",
":ukernels_asm",
@@ -1246,11 +1222,10 @@
"XNNPACK/src/configs/hardware-config.c",
"XNNPACK/src/microparams-init.c",
"XNNPACK/src/operator-run.c",
"XNNPACK/src/operators/post-operation.c",
"XNNPACK/src/microkernel-utils.c",
],
headers = subdir_glob([
("XNNPACK/src", "xnnpack/*.h"),
("XNNPACK/src", "**/*.h"),
("XNNPACK/include", "**/*.h"),
]),
exported_headers = {
@@ -1271,7 +1246,6 @@
"-DXNN_NO_X8_OPERATORS",
"-DXNN_ENABLE_MEMOPT",
"-DXNN_ENABLE_SPARSE=0",
"-DXNN_ENABLE_JIT=0",
"-DXNN_ENABLE_ASSEMBLY",
"-DXNN_ENABLE_GEMM_M_SPECIALIZATION",
"-DXNN_ENABLE_ARM_DOTPROD",