Commit 51de4a0

[NOT FOR LAND] Prototype to register llama.cpp kernels into ExecuTorch
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent 3ccfe0c commit 51de4a0

6 files changed: +382 additions, -35 deletions

examples/llama_cpp/CMakeLists.txt

Lines changed: 58 additions & 35 deletions
@@ -17,59 +17,82 @@
 #
 
 cmake_minimum_required(VERSION 3.19)
+project(LlamaCppExample)
+
 set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
 set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
 
+set(_common_compile_options -Wno-deprecated-declarations -fPIC)
+
+# Let files say "include <executorch/path/to/header.h>".
+set(_common_include_directories ${EXECUTORCH_ROOT}/..)
+
+find_package(Llama REQUIRED)
+find_package(ExecuTorch REQUIRED)
+find_package(
+  gflags REQUIRED PATHS ${CMAKE_CURRENT_BINARY_DIR}/../../third-party
+)
+
+target_include_directories(executorch INTERFACE ${_common_include_directories})
+
 #
-# select_build_lib: C++ library to register selected ops in custom kernel
-# library
+# The `_<target>_srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}.
 #
-set(_kernel_lib)
-if(EXECUTORCH_SELECT_OPS_YAML)
-  set(_custom_ops_yaml
-      ${EXECUTORCH_ROOT}/examples/portable/custom_ops/custom_ops.yaml)
-  gen_selected_ops("${_custom_ops_yaml}" "" "")
-  set(kernel_sources
-      ${EXECUTORCH_ROOT}/examples/portable/custom_ops/custom_ops_1_out.cpp
-      ${EXECUTORCH_ROOT}/examples/portable/custom_ops/custom_ops_2_out.cpp)
-  #
-  # custom_kernels: C++ kernel implementations of custom ops
+set(
+  EXECUTORCH_SRCS_FILE
+  "${CMAKE_CURRENT_BINARY_DIR}/../../executorch_srcs.cmake"
+)
+if(NOT EXISTS "${EXECUTORCH_SRCS_FILE}")
+  # A file wasn't generated. Run a script to extract the source lists from the
+  # buck2 build system and write them to a file we can include.
   #
-  add_library(custom_kernels ${kernel_sources})
-  target_link_libraries(custom_kernels PRIVATE executorch)
-  target_compile_options(custom_kernels PUBLIC ${_common_compile_options})
-
-  list(APPEND _kernel_lib custom_kernels)
-else()
-  list(APPEND _kernel_lib portable_kernels)
+  # NOTE: This will only happen once during cmake setup, so it will not re-run
+  # if the buck2 targets change.
+  message(STATUS "executorch: Generating source lists")
+  set(EXECUTORCH_SRCS_FILE "${CMAKE_CURRENT_BINARY_DIR}/executorch_srcs.cmake")
+  extract_sources(${EXECUTORCH_SRCS_FILE})
 endif()
 
-gen_selected_ops(
-  "${_custom_ops_yaml}"
-  "${EXECUTORCH_SELECT_OPS_LIST}"
-  "${EXECUTORCH_SELECT_ALL_OPS}")
+# This file defines the `_<target>__srcs` variables used below.
+message(STATUS "executorch: Using sources file ${EXECUTORCH_SRCS_FILE}")
+include(${EXECUTORCH_SRCS_FILE})
+
+set(_custom_ops_yaml ${EXECUTORCH_ROOT}/examples/llama_cpp/custom_ops.yaml)
+set(_ops_yaml ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml)
 
-generate_bindings_for_kernels(${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
-                              "${_custom_ops_yaml}")
-gen_operators_lib("select_build_lib" ${_kernel_lib} executorch)
+set(kernel_sources ${EXECUTORCH_ROOT}/examples/llama_cpp/op_mm.cpp)
+#
+# custom_kernels: C++ kernel implementations of custom ops
+#
+add_library(custom_kernels ${kernel_sources})
+target_link_libraries(custom_kernels PRIVATE executorch llama)
+target_compile_options(custom_kernels PUBLIC ${_common_compile_options})
+
+set(_kernel_lib custom_kernels portable_kernels)
+
+# Select all ops in functions.yaml as well as custom op.
+gen_selected_ops("${_ops_yaml}" "ggml::mul_mat.out" "")
+
+#
+# kernel_lib: contains both custom_kernels and portable_kernels
+#
+generate_bindings_for_kernels("${_ops_yaml}" "${_custom_ops_yaml}")
+gen_operators_lib("kernel_lib" ${_kernel_lib} executorch)
+target_link_libraries(kernel_lib PRIVATE executorch)
 
-set(_updated__srcs)
-foreach(_src ${_executor_runner__srcs})
-  list(APPEND _updated__srcs "${EXECUTORCH_ROOT}/${_src}")
-endforeach()
+list(TRANSFORM _executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
 
 #
-# selective_build_test: test binary to allow different operator libraries to
-# link to
+# llama_cpp_test: test binary to run llama.cpp kernel ggml_mul_mat
 #
-add_executable(selective_build_test ${_updated__srcs})
+add_executable(llama_cpp_test ${_executor_runner__srcs})
 if(CMAKE_BUILD_TYPE EQUAL "RELEASE")
   target_link_options(selective_build_test PRIVATE "LINKER:--gc-sections")
 endif()
-target_link_libraries(selective_build_test executorch gflags select_build_lib)
-target_compile_options(selective_build_test PUBLIC ${_common_compile_options})
+target_link_libraries(llama_cpp_test executorch gflags kernel_lib)
+target_compile_options(llama_cpp_test PUBLIC ${_common_compile_options})
 
 # Print all summary
 executorch_print_configuration_summary()

examples/llama_cpp/custom_ops.yaml

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
- func: ggml::mul_mat.out(Tensor in, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)
  kernels:
    - arg_meta: null
      kernel_name: llama_cpp::mm_out

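The schema above is the out-variant that the ExecuTorch runtime calls. Its intended eager-mode behavior mirrors the Python out_kernel registered in the fusion-pass file below: view mat2 as [1, 64], then matmul into the preallocated output. A minimal illustration; the shapes here are assumptions chosen to match the prototype's hardcoded 64-element view:

import torch

inp = torch.randn(64, 16)   # "in" argument
mat2 = torch.randn(8, 8)    # "mat2" argument: any 64-element tensor
out = torch.empty(1, 16)

# Reference computation that ggml::mul_mat.out is meant to reproduce,
# per the out_kernel defined in permute_mm_fusion_pass.py below.
torch.mm(mat2.reshape(1, 64), inp, out=out)
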
examples/llama_cpp/export.py

Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,36 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Example script for exporting simple models to flatbuffer

import argparse
import logging

from ..models import MODEL_NAME_TO_MODEL
from ..models.model_factory import EagerModelFactory
from ..portable.utils import export_to_edge, save_pte_program
from .permute_mm_fusion_pass import PermuteMMFusionPass
from torch._export import capture_pre_autograd_graph

FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)


if __name__ == "__main__":
    model, example_inputs = EagerModelFactory.create_model(
        *MODEL_NAME_TO_MODEL["llama2"]
    )
    m = model.eval()
    # pre-autograd export. eventually this will become torch.export
    m = capture_pre_autograd_graph(m, example_inputs)

    edge_ir = export_to_edge(m, example_inputs).transform(
        [PermuteMMFusionPass(_fix_node_meta_val=True)]
    )
    print(f"Exported graph:\n{edge_ir.exported_program().graph}")

    prog = edge_ir.to_executorch()

    save_pte_program(prog.buffer, "llama2_fused")

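Beyond printing the graph, one can check programmatically that the fusion actually happened by scanning the Edge program for the fused op. A small sketch, assuming edge_ir was produced as in export.py above (the helper name is illustrative, not part of the commit):

def count_fused_mul_mat(edge_ir) -> int:
    """Count call_function nodes whose target is the fused ggml mul_mat op."""
    graph = edge_ir.exported_program().graph
    return sum(
        1
        for node in graph.nodes
        if node.op == "call_function"
        and "ggml" in str(node.target)
        and "mul_mat" in str(node.target)
    )
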
examples/llama_cpp/op_mm.cpp

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */
#include "ggml.h"
#include <executorch/kernels/portable/cpu/util/matmul_ops_util.h>
#include <executorch/runtime/kernel/kernel_includes.h>

namespace llama_cpp {
namespace native {

using Tensor = exec_aten::Tensor;
using RuntimeContext = exec_aten::RuntimeContext;
using Error = torch::executor::Error;

// Helper function to create a ggml tensor with preallocated memory
static struct ggml_tensor * ggml_tensor_from(const Tensor & t, const int64_t * ne_override) {
  // HACK: since this is only used by mm, hardcode n_dims to 2
  // Should be t.dim() but that requires refactoring
  int n_dims = 2;
  // ET_CHECK_MSG(n_dims >= 1 && n_dims <= GGML_MAX_DIMS, "dimension %d is not within range (1, %d)", n_dims, GGML_MAX_DIMS);

  void * data = t.mutable_data_ptr();

  // TODO use memory from context to create tensor
  struct ggml_tensor * const result = (struct ggml_tensor *) malloc(sizeof(struct ggml_tensor));

  ET_CHECK_MSG(t.scalar_type() == exec_aten::ScalarType::Float, "only float type supported");
  // TODO support different types
  enum ggml_type type = ggml_type::GGML_TYPE_F32;
  *result = (struct ggml_tensor) {
      /*.type         =*/ type,
      /*.backend      =*/ GGML_BACKEND_CPU,
      /*.buffer       =*/ NULL,
      /*.n_dims       =*/ n_dims,
      /*.ne           =*/ { 1, 1, 1, 1 },
      /*.nb           =*/ { 0, 0, 0, 0 },
      /*.op           =*/ GGML_OP_NONE,
      /*.op_params    =*/ { 0 },
      /*.is_param     =*/ false,
      /*.grad         =*/ NULL,
      /*.src          =*/ { NULL },
      /*.perf_runs    =*/ 0,
      /*.perf_cycles  =*/ 0,
      /*.perf_time_us =*/ 0,
      /*.view_src     =*/ NULL,
      /*.view_offs    =*/ 0,
      /*.data         =*/ data,
      /*.name         =*/ { 0 },
      /*.extra        =*/ NULL,
      /*.padding      =*/ { 0 },
  };

  // TODO: this should not be needed as long as we don't rely on aligned SIMD loads
  // ggml_assert_aligned(result->data);

  if (ne_override != NULL) {
    for (int i = 0; i < n_dims; i++) {
      result->ne[i] = ne_override[i];
    }
  } else {
    for (int i = 0; i < n_dims; i++) {
      result->ne[i] = t.sizes()[i];
    }
  }

  result->nb[0] = ggml_type_size(type);
  result->nb[1] = result->nb[0] * (result->ne[0] / ggml_blck_size(type));
  for (int i = 2; i < GGML_MAX_DIMS; i++) {
    result->nb[i] = result->nb[i - 1] * result->ne[i - 1];
  }

  // ctx->n_objects++;

  return result;
}

// View(mat2, {1, 64}), transpose, then matmul.
Tensor&
mm_out(RuntimeContext& ctx, const Tensor& in, const Tensor& mat2, Tensor& out) {
  // prepare input tensors
  // HACK: view(mat2, {64, 1});
  const int64_t dims[4] = {64, 1, 1, 1};

  struct ggml_tensor * a = ggml_tensor_from(in, NULL);

  struct ggml_tensor * b = ggml_tensor_from(mat2, dims);

  // GGML_ASSERT(ggml_can_mul_mat(b, a));
  // GGML_ASSERT(!ggml_is_transposed(b));

  const int64_t ne[4] = { b->ne[1], a->ne[1], a->ne[2], a->ne[3] };
  struct ggml_tensor * result = ggml_tensor_from(out, ne);

  result->op = GGML_OP_MUL_MAT;
  result->grad = NULL;
  result->src[0] = b;
  result->src[1] = a;

  // run op
  struct ggml_cgraph gf = ggml_build_forward(result);
  struct ggml_cplan plan = ggml_graph_plan(&gf, /*int n_threads*/1);
  int res = ggml_graph_compute(&gf, &plan);

  return out;
}

} // namespace native
} // namespace llama_cpp
examples/llama_cpp/permute_mm_fusion_pass.py

Lines changed: 102 additions & 0 deletions
@@ -0,0 +1,102 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from typing import Callable, List, Tuple

import torch
from executorch.exir.dialects._ops import bind_pattern_to_op, ops as exir_ops
from executorch.exir.pass_base import ExportPass

from executorch.exir.passes.replace_aten_with_edge_pass import (
    aten_to_edge,
    should_lower_to_edge,
)
from torch import fx
from torch.fx import GraphModule, subgraph_rewriter
from torch.fx.passes.infra.pass_base import PassResult
from torch.utils import _pytree as pytree

from torch.library import impl, Library

custom_ops_lib = Library("ggml", "DEF")

custom_ops_lib.define(
    "mul_mat.out(Tensor input, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!)"
)

custom_ops_lib.define("mul_mat(Tensor input, Tensor mat2) -> Tensor")


def out_kernel(a, b, *, out):
    d = torch.ops.aten.view_copy.default(b, [1, 64])
    e = torch.ops.aten.mm.out(d, a, out=out)
    return out


custom_ops_lib.impl("mul_mat.out", out_kernel)


def _trace_and_lower_to_edge_ops(f: Callable) -> fx.GraphModule:
    gm = fx.symbolic_trace(f)
    for node in gm.graph.nodes:
        if node.op == "call_function" and should_lower_to_edge(node.target):
            node.target = aten_to_edge(node.target)
    gm.recompile()
    return gm


# Fuse the following pattern:
# - d = view_copy(b, [1, 64])
# - e = mm(d, a)


def get_patterns_and_replacements() -> List[Tuple[Callable, Callable, List[Callable]]]:
    @bind_pattern_to_op(custom_ops_lib, "mul_mat")
    def pattern(a, b):
        d = torch.ops.aten.view_copy.default(b, [1, 64])
        e = torch.ops.aten.mm.default(d, a)
        return e

    def replacement(a, b):
        return torch.ops.ggml.mul_mat.default(a, b)

    p_graph = _trace_and_lower_to_edge_ops(pattern)
    r_graph = _trace_and_lower_to_edge_ops(replacement)
    # print(p_graph.graph)
    # print(r_graph.graph)
    return [
        (
            p_graph,
            r_graph,
            [],
        )
    ]


class PermuteMMFusionPass(ExportPass):
    def __init__(self, _fix_node_meta_val=False):
        super().__init__()
        self._fix_node_meta_val = _fix_node_meta_val

    def call(self, graph_module: GraphModule) -> PassResult:
        for (
            pattern,
            replacement,
            match_filters,
        ) in get_patterns_and_replacements():
            subgraph_rewriter.replace_pattern_with_filters(
                graph_module, pattern, replacement, match_filters
            )

        if self._fix_node_meta_val:
            for n in graph_module.graph.nodes:
                if n.op == "call_function" and "val" not in n.meta:
                    args, kwargs = pytree.tree_map_only(
                        torch.fx.Node, lambda x: x.meta["val"], (n.args, n.kwargs)
                    )
                    n.meta["val"] = n.target(*args, **kwargs)
        graph_module.graph.lint()
        graph_module.graph.eliminate_dead_code()
        return PassResult(graph_module, True)

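A minimal end-to-end sketch of exercising this pass on a toy module, separate from the llama2 export in export.py. It assumes the same export utilities used above are importable as executorch.examples.portable.utils, that PermuteMMFusionPass from this file is in scope, and illustrative shapes matching the hardcoded 64-element view; it is not part of the commit:

import torch
from executorch.examples.portable.utils import export_to_edge
from torch._export import capture_pre_autograd_graph


class ToyMM(torch.nn.Module):
    def forward(self, a, b):
        # The exact pattern the pass looks for: view_copy(b, [1, 64]) followed by mm.
        d = torch.ops.aten.view_copy.default(b, [1, 64])
        return torch.ops.aten.mm.default(d, a)


example_inputs = (torch.randn(64, 16), torch.randn(8, 8))
m = capture_pre_autograd_graph(ToyMM().eval(), example_inputs)
edge = export_to_edge(m, example_inputs).transform(
    [PermuteMMFusionPass(_fix_node_meta_val=True)]
)
# Expect a single ggml.mul_mat call in place of the view_copy + mm pair.
print(edge.exported_program().graph)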