Hook up KernelRuntimeContext.fail()

dbort · facebook-github-bot · commit 49ccb7092c5a · 2023-08-15T14:51:23.000-07:00
Summary:
This gives kernels a way to fail non-fatally.

We still plan to add more features to `KernelRuntimeContext`, but they're lower priority right now.

Reviewed By: JacobSzwejbka

Differential Revision: D48198665

fbshipit-source-id: 59e22a568a658bab1358835e577840deda511465
diff --git a/runtime/executor/method.cpp b/runtime/executor/method.cpp
@@ -786,10 +786,24 @@ Error Method::execute_instruction() {
   switch (instruction->instr_args_type()) {
     case executorch_flatbuffer::InstructionArguments::KernelCall: {
       EXECUTORCH_SCOPE_PROF("OPERATOR_CALL");
-      KernelRuntimeContext context{};
+      // TODO(T147221312): Also expose the temp allocator and tensor resizer
+      // via the context.
+      KernelRuntimeContext context;
       chain.kernels_[step_state_.instr_idx](
           context, chain.argument_lists_[step_state_.instr_idx].data());
-      // TODO(T135464333): inspect runtime context for error state
+      Error err = context.failure_state();
+      if (err != Error::Ok) {
+        ET_LOG(
+            Error,
+            "KernelCall failed at instruction %zu:%zu: 0x%x",
+            step_state_.chain_idx,
+            step_state_.instr_idx,
+            (unsigned int)err);
+        // TODO(T153804650): Consider logging the EValues to help with
+        // debugging. This is a failure path, and it doesn't matter if it's a
+        // little slow. Do the same for DelegateCall errors.
+        return err;
+      }
     } break;
     case executorch_flatbuffer::InstructionArguments::DelegateCall: {
       EXECUTORCH_SCOPE_PROF("DELEGATE_CALL");
diff --git a/runtime/executor/test/kernel_integration_test.cpp b/runtime/executor/test/kernel_integration_test.cpp
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <memory>
+
+#include <executorch/extension/data_loader/file_data_loader.h>
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/core/result.h>
+#include <executorch/runtime/executor/method.h>
+#include <executorch/runtime/executor/program.h>
+#include <executorch/runtime/executor/test/managed_memory_manager.h>
+#include <executorch/runtime/kernel/kernel_runtime_context.h>
+#include <executorch/runtime/kernel/operator_registry.h>
+#include <executorch/runtime/platform/compiler.h>
+#include <executorch/runtime/platform/runtime.h>
+#include <executorch/util/util.h>
+
+#include <gtest/gtest.h>
+
+using namespace ::testing;
+using torch::executor::ArrayRef;
+using torch::executor::Error;
+using torch::executor::EValue;
+using torch::executor::FreeableBuffer;
+using torch::executor::Kernel;
+using torch::executor::KernelKey;
+using torch::executor::KernelRuntimeContext;
+using torch::executor::Method;
+using torch::executor::Program;
+using torch::executor::Result;
+using torch::executor::testing::ManagedMemoryManager;
+using torch::executor::util::FileDataLoader;
+
+constexpr size_t kDefaultNonConstMemBytes = 32 * 1024U;
+constexpr size_t kDefaultRuntimeMemBytes = 32 * 1024U;
+
+/**
+ * Used to control and observe the behavior of a kernel.
+ */
+struct KernelControl {
+ public:
+  // The number of times the kernel has been called.
+  int call_count = 0;
+
+  // If true, the kernel should call `context.fail(fail_value)`. If false,
+  // the kernel should not call `context.fail()`.
+  bool call_context_fail = false;
+
+  // The error value that the kernel should pass to `context.fail()` before
+  // returning.
+  Error fail_value = Error::Ok;
+
+  void reset() {
+    // Assign from a default-constructed instance so the reset state can never
+    // drift from the member initializers above.
+    *this = KernelControl();
+  }
+
+  /**
+   * Registers a kernel that uses the singleton instance to record and control
+   * its behavior. Only the first call registers; later calls are no-ops.
+   */
+  static void register_singleton() {
+    if (registered_) {
+      return;
+    }
+
+    // This test helper installs itself as aten::add.out:
+    //
+    // add.out(Tensor self, Tensor other, *, Scalar alpha=1, Tensor(a!) out) ->
+    //     Tensor(a!)
+    //
+    // The arguments are: `self, other, out, out` (we repeat the out argument in
+    // the program). And since we traced using randn(2, 2), all the args are
+    // Float with dim order (0, 1)
+
+    // Construct a kernel key with the following meta:
+    // exec_aten::DimOrderType contiguous[] = {0, 1};
+    // TensorMeta float_contiguous[] = {
+    //     TensorMeta(ScalarType::Float, contiguous), // self
+    //     TensorMeta(ScalarType::Float, contiguous), // other
+    //     TensorMeta(ScalarType::Float, contiguous), // out
+    //     TensorMeta(ScalarType::Float, contiguous)}; // out (repeated)
+    KernelKey key = torch::executor::KernelKey(
+        "v0/\x06;\x00\x01|\x06;\x00\x01|\x06;\x00\x01|\x06;\x00\x01\xff");
+    Kernel kernel = torch::executor::Kernel(
+        "aten::add.out", key, KernelControl::kernel_hook);
+    Error err = torch::executor::register_kernels({kernel});
+    EXPECT_EQ(err, Error::Ok);
+
+    registered_ = true;
+  }
+
+  static KernelControl* singleton() {
+    return &singleton_;
+  }
+
+ private:
+  /**
+   * An OpFunction-compatible function that uses the singleton KernelControl
+   * to record and determine its behavior.
+   */
+  static void kernel_hook(
+      KernelRuntimeContext& context,
+      __ET_UNUSED EValue** args) {
+    auto* control = KernelControl::singleton();
+    control->call_count++;
+    if (control->call_context_fail) {
+      context.fail(control->fail_value);
+    }
+  }
+
+  static bool registered_;
+  static KernelControl singleton_;
+};
+
+bool KernelControl::registered_ = false;
+KernelControl KernelControl::singleton_;
+
+// Fixture that loads ModuleAdd's `forward` method wired to KernelControl.
+class KernelIntegrationTest : public ::testing::Test {
+ protected:
+  void SetUp() override {
+    // Register the controllable kernel hook and expose it with cleared state.
+    KernelControl::register_singleton();
+    control_ = KernelControl::singleton();
+    control_->reset();
+
+    // Create a loader for the serialized ModuleAdd program.
+    const char* path = std::getenv("ET_MODULE_ADD_PATH");
+    ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_PATH is not set";
+    Result<FileDataLoader> loader = FileDataLoader::From(path);
+    ASSERT_EQ(loader.error(), Error::Ok);
+    loader_ = std::make_unique<FileDataLoader>(std::move(loader.get()));
+
+    // Use it to load the program.
+    Result<Program> program = Program::Load(
+        loader_.get(), Program::Verification::InternalConsistency);
+    ASSERT_EQ(program.error(), Error::Ok);
+    program_ = std::make_unique<Program>(std::move(program.get()));
+
+    // Load the forward method.
+    mmm_ = std::make_unique<ManagedMemoryManager>(
+        kDefaultNonConstMemBytes, kDefaultRuntimeMemBytes);
+    Result<Method> method = program_->load_method("forward", &mmm_->get());
+    ASSERT_EQ(method.error(), Error::Ok);
+    method_ = std::make_unique<Method>(std::move(method.get()));
+
+    // Set up its inputs.
+    inputs_ = torch::executor::util::PrepareInputTensors(*method_);
+  }
+
+  void TearDown() override {
+    torch::executor::util::FreeInputs(inputs_);
+    inputs_ = {};
+  }
+
+ private:
+  // Must outlive program_
+  std::unique_ptr<FileDataLoader> loader_;
+
+  // Must outlive method_
+  std::unique_ptr<Program> program_;
+  std::unique_ptr<ManagedMemoryManager> mmm_;
+  ArrayRef<void*> inputs_;
+
+ protected:
+  // An executable method that will call the kernel associated with control_.
+  // Its inputs will have been allocated and initialized.
+  std::unique_ptr<Method> method_;
+
+  // The KernelControl associated with method_.
+  KernelControl* control_ = nullptr;
+};
+
+TEST_F(KernelIntegrationTest, KernelHookIsCalled) {
+  // The fixture hands us a freshly-reset hook that has not run yet.
+  EXPECT_EQ(control_->call_count, 0);
+  const Error first_result = method_->execute();
+  EXPECT_EQ(first_result, Error::Ok);
+  EXPECT_EQ(control_->call_count, 1);
+
+  // Executing a second time bumps the count again.
+  const Error second_result = method_->execute();
+  EXPECT_EQ(second_result, Error::Ok);
+  EXPECT_EQ(control_->call_count, 2);
+}
+
+TEST_F(KernelIntegrationTest, FailurePropagates) {
+  // From here on, every kernel call reports `fail_value` via context.fail().
+  control_->call_context_fail = true;
+
+  // execute() should surface exactly the error that the kernel reported.
+  control_->fail_value = Error::InvalidArgument;
+  const Error invalid_arg_result = method_->execute();
+  EXPECT_EQ(invalid_arg_result, Error::InvalidArgument);
+  EXPECT_EQ(control_->call_count, 1);
+
+  // A different error code must come through too, ruling out coincidence.
+  control_->fail_value = Error::MemoryAllocationFailed;
+  const Error alloc_failed_result = method_->execute();
+  EXPECT_EQ(alloc_failed_result, Error::MemoryAllocationFailed);
+  EXPECT_EQ(control_->call_count, 2);
+
+  // Passing Error::Ok to context.fail() leaves the execution successful.
+  control_->fail_value = Error::Ok;
+  const Error ok_result = method_->execute();
+  EXPECT_EQ(ok_result, Error::Ok);
+  EXPECT_EQ(control_->call_count, 3);
+}
diff --git a/runtime/executor/test/targets.bzl b/runtime/executor/test/targets.bzl
@@ -168,6 +168,24 @@ def define_common_targets(is_fbcode = False):
             env = modules_env,
         )
 
+        runtime.cxx_test(
+            name = "kernel_integration_test",
+            srcs = [
+                "kernel_integration_test.cpp",
+            ],
+            deps = [
+                ":managed_memory_manager",
+                "//executorch/extension/data_loader:file_data_loader",
+                "//executorch/runtime/core:core",
+                "//executorch/runtime/executor:program",
+                "//executorch/runtime/kernel:kernel_runtime_context",
+                "//executorch/runtime/kernel:operator_registry",
+                "//executorch/runtime/platform:platform",
+                "//executorch/util:util",
+            ],
+            env = modules_env,
+        )
+
         runtime.cxx_test(
             name = "backend_integration_test",
             srcs = [
diff --git a/runtime/kernel/kernel_runtime_context.h b/runtime/kernel/kernel_runtime_context.h
@@ -8,24 +8,48 @@
 
 #pragma once
 
+#include <executorch/runtime/core/error.h>
+#include <executorch/runtime/platform/compiler.h>
+
 namespace torch {
 namespace executor {
 
 /**
- * Bucket type abstraction that contains many elements of runtime state that
- * a kernel author may want available, but would otherwise be unable to access.
- *
- * Forwarded along to all operators when running in lean mode. NOTE: Will not be
- * forwarded to operators if running in ATen mode as those operators do not
- * expect to receive a KernelRuntimeContext and would not use it.
+ * Runtime state and functionality for kernel implementations.
  *
- * This includes things like setting an error state, a scratch allocator for
- * operators that need more then constant space, and a TensorResizer for dynamic
- * shape tensors allowing programs to be more flexible with Tensor shape.
- *
- * TODO(T147221312): Define this interface
+ * NOTE: Will not be passed to operators if running in ATen mode as those
+ * operators do not expect to receive a KernelRuntimeContext argument.
  */
-class KernelRuntimeContext {};
+class KernelRuntimeContext {
+ public:
+  /**
+   * Records `error` as the outcome of the current kernel call so that the
+   * runtime can observe it after the kernel returns. Prefer this over
+   * ET_CHECK_*(), which fatally panics the process/system.
+   *
+   * A kernel call that never invokes this is treated as successful.
+   *
+   * Kernels report errors this way because their signatures, which generally
+   * match core PyTorch ATen kernel signatures, have no natural way to return
+   * an error directly. ATen uses exceptions for this, but ExecuTorch does not
+   * use exceptions.
+   */
+  void fail(Error error) {
+    this->failure_state_ = error;
+  }
+
+  /// Returns the most recent value passed to fail(), or Error::Ok if none.
+  __ET_NODISCARD Error failure_state() const {
+    return this->failure_state_;
+  }
+
+  // TODO(T147221312): Add a way to allocate temporary memory.
+
+  // TODO(T147221312): Add a way to resize a tensor.
+
+ private:
+  Error failure_state_{Error::Ok};
+};
 
 } // namespace executor
 } // namespace torch
diff --git a/runtime/kernel/targets.bzl b/runtime/kernel/targets.bzl
@@ -30,15 +30,17 @@ def define_common_targets():
                 "kernel_runtime_context.h",
             ],
             visibility = [
-                "//executorch/kernels/prim_ops/...",  # Contains kernels
-                "//executorch/runtime/kernel/...",
                 "//executorch/kernels/...",
                 "//executorch/runtime/executor/...",
+                "//executorch/runtime/kernel/...",
                 "@EXECUTORCH_CLIENTS",
             ],
             exported_deps = [
                 "//executorch/runtime/core:core",
-                "//executorch/runtime/core/exec_aten:lib" + aten_suffix,
+                "//executorch/runtime/platform:platform",
+                # TODO(T147221312): This will eventually depend on exec_aten
+                # once KernelRuntimeContext supports tensor resizing, which is
+                # why this target supports aten mode.
             ],
         )
 
diff --git a/runtime/kernel/test/kernel_runtime_context_test.cpp b/runtime/kernel/test/kernel_runtime_context_test.cpp
diff --git a/runtime/kernel/test/targets.bzl b/runtime/kernel/test/targets.bzl