
Commit b51e6dd

EikanWang authored and ZelboK committed
[2/N] Non-Tensor: Scalar Support: Add scalar to the cache for eager-through-torch.compile (pytorch#124070)
Add scalar information to the kernel configuration.

#### Additional Context
Currently, the input parameters are orchestrated by their positional order in the kernel configuration and are loaded/mapped to the kernel in that same order at runtime. For example, the cache order of the input parameters of `torch.add(a, b, alpha=2.0)` is `a` first, followed by `b` and then `alpha`, and the same order is used when loading from the cache. However, this orchestration mechanism does not support kwargs, because keyword arguments carry no reliable positional order. For example, the `out` of `aten::gelu.out(Tensor self, *, str approximate='none', Tensor(a!) out) -> Tensor(a!)` may be passed before `approximate`. We will support kwargs in subsequent PRs.

Pull Request resolved: pytorch#124070
Approved by: https://github.com/jansel, https://github.com/jgong5
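To make the positional-order caching concrete, here is a minimal sketch (illustrative values only, following the metadata fields written by `aoti_compile_with_persistent_cache` in this PR) of what the cached kernel configuration for `torch.add(a, b, alpha=2.0)` on CPU could look like: one `meta_info` entry per flattened input, in call order, with the scalar `alpha` carrying an explicit `scalar_value` and empty `sizes`/`strides`.

```python
# Illustrative sketch only -- the tensor shapes/strides are made up.
# Entries appear in the same positional order as the call: a, b, alpha.
kernel_config = {
    "meta_info": [
        {  # `a`: a CPU float tensor
            "is_dynamic": False, "device_type": "cpu", "device_index": -1,
            "dtype": "torch.float32", "sizes": [128], "strides": [1],
        },
        {  # `b`: same layout as `a`
            "is_dynamic": False, "device_type": "cpu", "device_index": -1,
            "dtype": "torch.float32", "sizes": [128], "strides": [1],
        },
        {  # `alpha`: a Python float, so the value itself is cached
            "is_dynamic": False, "device_type": "cpu", "device_index": -1,
            "dtype": "torch.float32", "sizes": [], "strides": [],
            "scalar_value": 2.0,
        },
    ]
}
```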
1 parent 8a7f719 commit b51e6dd

File tree

6 files changed: 278 additions, 23 deletions


test/inductor/test_torchinductor.py

Lines changed: 80 additions & 0 deletions
@@ -841,6 +841,86 @@ def fn(a):
 
         self.assertTrue(kernel_lib_path in kernel_libs_abs_path)
 
+    @skipCUDAIf(not SM80OrLater, "Requires sm80")
+    def test_eager_aoti_with_scalar(self):
+        namespace_name = "aten"
+        op_name = "add"
+        op_overload_name = "Tensor"
+        op_name_with_overload = f"{op_name}.{op_overload_name}"
+
+        dispatch_key = "CPU"
+        device = torch.device("cpu")
+        if self.device.lower() == "cuda":
+            dispatch_key = "CUDA"
+            device = torch.device("cuda")
+
+        # Test the difference between scalar tensor and scalar
+        a = torch.scalar_tensor(1.0, device=device)
+        b = torch.scalar_tensor(2.0, device=device)
+
+        kernel_lib_path = aoti_compile_with_persistent_cache(
+            namespace_name,
+            op_name_with_overload,
+            a.device.type,
+            False,
+            torch.ops.aten.add,
+            args=(a, b),
+            kwargs={"alpha": 3.0},
+        )
+        self.assertTrue(Path(kernel_lib_path).exists())
+        device_kernel_cache = aoti_eager_cache_dir(namespace_name, device.type)
+        kernel_conf = device_kernel_cache / f"{op_name_with_overload}.json"
+        self.assertTrue(kernel_conf.exists())
+        json_data = load_aoti_eager_cache(
+            namespace_name, op_name_with_overload, a.device.type
+        )
+        op_info = json_data[0]
+        self.assertTrue(isinstance(op_info, dict))
+        self.assertTrue("meta_info" in op_info)
+        self.assertTrue(len(op_info["meta_info"]) == 3)
+        self.assertTrue(op_info["meta_info"][0]["sizes"] == [])
+        self.assertTrue(op_info["meta_info"][0]["strides"] == [])
+        # Scalar Tensor
+        self.assertTrue("scalar_value" not in op_info["meta_info"][0])
+        self.assertTrue(op_info["meta_info"][1]["sizes"] == [])
+        self.assertTrue(op_info["meta_info"][1]["strides"] == [])
+        # Scalar Tensor
+        self.assertTrue("scalar_value" not in op_info["meta_info"][1])
+        self.assertTrue(op_info["meta_info"][2]["sizes"] == [])
+        self.assertTrue(op_info["meta_info"][2]["strides"] == [])
+        # Scalar
+        self.assertTrue("scalar_value" in op_info["meta_info"][2])
+
+        with _scoped_library("aten", "IMPL") as torch_compile_op_lib_impl:
+            a = torch.randn(128, device=device)
+            b = torch.randn(128, device=device)
+
+            scalar_values = [1.0, 2.0, 3.0]
+            ref_values = []
+            for scalar_value in scalar_values:
+                ref_values.append(torch.add(a, b, alpha=scalar_value))
+
+            qualified_op_name = f"{namespace_name}::{op_name}"
+            _, overload_names = torch._C._jit_get_operation(qualified_op_name)
+            for overload_name in overload_names:
+                try:
+                    reg_op_name = qualified_op_name
+                    schema = torch._C._get_schema(reg_op_name, overload_name)
+                    if schema.overload_name:
+                        reg_op_name = f"{reg_op_name}.{schema.overload_name}"
+                    torch_compile_op_lib_impl._impl_with_aoti_compile(  # noqa: F821
+                        reg_op_name, dispatch_key
+                    )
+                except Exception as e:
+                    continue
+
+            res_values = []
+            for scalar_value in scalar_values:
+                res_values.append(torch.add(a, b, alpha=scalar_value))
+
+            self.assertEqual(len(ref_values), len(res_values))
+            self.assertEqual(ref_values, res_values)
+
     @skipCUDAIf(not SM80OrLater, "Requires sm80")
     def test_torch_compile_override_registration(self):
         dynamic = False

torch/_inductor/utils.py

Lines changed: 33 additions & 14 deletions
@@ -1578,16 +1578,23 @@ def aoti_compile_with_persistent_cache(
     """
     Compile the given function with persistent cache for AOTI eager mode.
     """
-    flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs)
-    assert all(
-        isinstance(input, torch.Tensor) for input in flattened_inputs
-    ), "Only support tensor for now"
     assert not dynamic, "Only support static shape for now"
+    type_to_torch_dtype = {int: torch.int32, float: torch.float, bool: torch.bool}
+    supported_scalar_types = tuple(type_to_torch_dtype.keys())
+    flattened_inputs = pytree.arg_tree_leaves(*args, **kwargs)
+    if not all(
+        isinstance(input, (supported_scalar_types, torch.Tensor))
+        for input in flattened_inputs
+    ):
+        raise NotImplementedError("Only support tensor, int, float, bool for now")
 
     persistent_cache = aoti_eager_cache_dir(ns, device_type)
-    persistent_cache.mkdir(parents=True, exist_ok=True)
+    if not persistent_cache.exists():
+        persistent_cache.mkdir(parents=True)
+
     persistent_cache_lib = persistent_cache / "lib"
-    persistent_cache_lib.mkdir(parents=True, exist_ok=True)
+    if not persistent_cache_lib.exists():
+        persistent_cache_lib.mkdir()
 
     with mock.patch.dict(
         os.environ,
@@ -1609,18 +1616,30 @@ def aoti_compile_with_persistent_cache(
             )
 
             kernel_metadata_items = []
-            for input_tensor in flattened_inputs:
+            for input in flattened_inputs:
                 # TODO(Eikan): To add dynamic support
                 metadata: Dict[str, Any] = {}
                 metadata["is_dynamic"] = dynamic
-                metadata["device_type"] = f"{input_tensor.device.type}"
-                if is_cpu_device([input_tensor]):
-                    metadata["device_index"] = -1
+
+                if isinstance(input, torch.Tensor):
+                    metadata["device_type"] = f"{input.device.type}"
+                    if is_cpu_device([input]):
+                        metadata["device_index"] = -1
+                    else:
+                        metadata["device_index"] = input.device.index
+                    metadata["dtype"] = f"{input.dtype}"
+                    metadata["sizes"] = list(input.size())
+                    metadata["strides"] = list(input.stride())
                 else:
-                    metadata["device_index"] = input_tensor.device.index
-                metadata["dtype"] = f"{input_tensor.dtype}"
-                metadata["sizes"] = list(input_tensor.size())
-                metadata["strides"] = list(input_tensor.stride())
+                    assert isinstance(input, supported_scalar_types)
+                    # Scalar tensor
+                    metadata["device_type"] = device_type
+                    metadata["device_index"] = -1 if device_type == "cpu" else 0
+                    metadata["dtype"] = f"{type_to_torch_dtype[type(input)]}"
+                    metadata["sizes"] = []
+                    metadata["strides"] = []
+                    metadata["scalar_value"] = input
+
                 kernel_metadata_items.append(metadata)
 
             kernel_meta_info: Dict[str, Any] = {}
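As a standalone illustration of the classification above (a hedged sketch, not part of the diff; it only relies on the public `torch` and `torch.utils._pytree` APIs used in this file), the flattened inputs of `torch.add(a, b, alpha=3.0)` split into two tensor entries and one supported scalar entry:

```python
import torch
import torch.utils._pytree as pytree

# Same mapping as in the diff above: Python scalar type -> cached torch dtype.
type_to_torch_dtype = {int: torch.int32, float: torch.float, bool: torch.bool}
supported_scalar_types = tuple(type_to_torch_dtype.keys())

a, b = torch.randn(128), torch.randn(128)
flattened_inputs = pytree.arg_tree_leaves(a, b, alpha=3.0)

for inp in flattened_inputs:
    if isinstance(inp, torch.Tensor):
        print("tensor:", inp.dtype, list(inp.size()), list(inp.stride()))
    elif isinstance(inp, supported_scalar_types):
        # Scalars get empty sizes/strides and a dtype inferred from the Python type.
        print("scalar:", type_to_torch_dtype[type(inp)], inp)
    else:
        raise NotImplementedError("Only support tensor, int, float, bool for now")
```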

torch/csrc/inductor/aoti_eager/kernel_holder.cpp

Lines changed: 108 additions & 8 deletions
@@ -96,8 +96,13 @@ bool unpack_tensors(
     const std::vector<c10::Argument>& arguments,
     const torch::jit::Stack& stack,
     const c10::Device& device,
-    std::vector<at::Tensor>& inputs) {
+    std::vector<at::Tensor>& inputs,
+    bool with_scalar = false) {
   for (size_t idx = 0; idx < stack.size(); idx++) {
+    if (!with_scalar && stack[idx].isScalar()) {
+      continue;
+    }
+
     if (!unpack_ivalue(arguments[idx], stack[idx], device, inputs)) {
       return false;
     }
@@ -106,6 +111,40 @@ bool unpack_tensors(
   return true;
 }
 
+std::vector<size_t> get_tensor_parameter_index(
+    const std::vector<c10::Argument>& arguments,
+    const torch::jit::Stack& stack) {
+  std::vector<size_t> tensor_parameter_index;
+  for (size_t idx = 0; idx < stack.size(); idx++) {
+    if (stack[idx].isScalar() || stack[idx].isTensor()) {
+      // scalar and tensor
+      tensor_parameter_index.push_back(idx);
+    } else if (stack[idx].isTensorList()) {
+      // tensor list
+      std::fill_n(
+          std::back_inserter(tensor_parameter_index),
+          stack[idx].toListRef().size(),
+          idx);
+    } else if (stack[idx].isOptionalTensorList()) {
+      // optional tensor list: std::vector<std::optional<at::Tensor>>
+      for (const auto& item : stack[idx].toListRef()) {
+        if (item.toOptional<at::Tensor>().has_value()) {
+          tensor_parameter_index.push_back(idx);
+        }
+      }
+    } else if (
+        *arguments[idx].real_type() ==
+        *c10::getTypePtr<c10::optional<at::Tensor>>()) {
+      // optional tensor
+      if (stack[idx].toOptional<at::Tensor>().has_value()) {
+        tensor_parameter_index.push_back(idx);
+      }
+    }
+  }
+
+  return tensor_parameter_index;
+}
+
 } // namespace
 
 AOTIPythonKernelHolder::AOTIPythonKernelHolder(
@@ -149,14 +188,19 @@ bool AOTIPythonKernelHolder::cache_lookup(
       "Not implemented for operations that return a non-Tensor value.");
 
   std::vector<at::Tensor> inputs;
-  auto res = unpack_tensors(op.schema().arguments(), *stack, device_, inputs);
+  auto res =
+      unpack_tensors(op.schema().arguments(), *stack, device_, inputs, true);
   TORCH_CHECK_NOT_IMPLEMENTED(
       res && inputs.size() > 0,
       "Not implemented for operations that contain a parameter which is ",
       "not one of the following types: at::Tensor, at::TensorList, ",
       "std::optional<at::Tensor>, std::vector<std::optional<at::Tensor>>.");
 
-  auto inputs_metadata = get_inputs_metadata(inputs);
+  auto tensor_parameter_index =
+      get_tensor_parameter_index(op.schema().arguments(), *stack);
+  TORCH_INTERNAL_ASSERT(tensor_parameter_index.size() == inputs.size());
+  auto inputs_metadata = get_inputs_metadata(
+      inputs, op.schema().arguments(), tensor_parameter_index);
   auto aoti_kernel_state = aoti_kernel_cache_.find(inputs_metadata);
   if (aoti_kernel_state == aoti_kernel_cache_.end()) {
     return false;
@@ -197,18 +241,49 @@ void AOTIPythonKernelHolder::cache_hit(
 }
 
 AOTIKernelMetadata AOTIPythonKernelHolder::get_inputs_metadata(
-    const std::vector<at::Tensor>& inputs) {
+    const std::vector<at::Tensor>& inputs,
+    const std::vector<c10::Argument>& inputs_argument,
+    const std::vector<size_t>& inputs_argument_index) {
   AOTIKernelMetadata inputs_metadata;
-  for (const auto& input : inputs) {
+  for (size_t idx = 0; idx < inputs.size(); ++idx) {
+    auto input = inputs[idx];
+    auto input_info = inputs_argument[inputs_argument_index[idx]];
+
     auto device = input.device();
     if (device.is_cpu()) {
       // If the device is CPU, set the device index to -1.
       device = c10::Device(device.type(), -1);
     }
 
+    c10::Scalar scalar_value((double)1.0);
+    auto tensor_type = input.scalar_type();
+
+    bool is_scalar = input_info.type()->isSubtypeOf(*c10::NumberType::get());
+    if (is_scalar) {
+      if (c10::isFloatingType(input.scalar_type())) {
+        auto scalar_numeric_value = input.item().toDouble();
+        tensor_type = c10::ScalarType::Double;
+        scalar_value = c10::Scalar(scalar_numeric_value);
+      } else if (c10::isIntegralType(input.scalar_type(), false)) {
+        auto scalar_numeric_value = input.item().toUInt64();
+        tensor_type = c10::ScalarType::UInt64;
+        scalar_value = c10::Scalar(scalar_numeric_value);
+      } else if (input.scalar_type() == c10::ScalarType::Bool) {
+        auto scalar_numeric_value = input.item().toBool();
+        tensor_type = c10::ScalarType::Bool;
+        scalar_value = c10::Scalar(scalar_numeric_value);
+      } else {
+        TORCH_CHECK(
+            false,
+            "Unsupported scalar tensor type: ",
+            c10::toString(input.scalar_type()));
+      }
+    }
+
     inputs_metadata.emplace_back(
-        false, // is symbloic
-        input.scalar_type(),
+        false,
+        tensor_type,
+        c10::IValue(scalar_value),
         device,
         input.sizes().vec(),
         input.strides().vec());
@@ -269,6 +344,7 @@ void AOTIPythonKernelHolder::init_aoti_kernel_cache() {
           reinterpret_cast<THPDtype*>(data_type_obj.ptr())->scalar_type;
       auto sizes = metadata["sizes"].cast<std::vector<int64_t>>();
       auto strides = metadata["strides"].cast<std::vector<int64_t>>();
+      bool is_scalar = metadata.contains("scalar_value");
 
       std::vector<std::optional<c10::SymInt>> sym_optional_sizes;
       std::vector<std::optional<c10::SymInt>> sym_optional_strides;
@@ -279,10 +355,34 @@ void AOTIPythonKernelHolder::init_aoti_kernel_cache() {
         sym_optional_strides.push_back(std::optional<c10::SymInt>(stride));
       }
 
-      // Now you can use these variables in your code
+      // If an input parameter is a scalar, its detailed value is cached.
+      // This is done to ensure correctness during subsequent checks.
+      c10::Scalar scalar_value((double)1.0);
+      if (is_scalar) {
+        if (c10::isFloatingType(data_type)) {
+          auto scalar_numeric_value = metadata["scalar_value"].cast<double>();
+          data_type = c10::ScalarType::Double;
+          scalar_value = c10::Scalar(scalar_numeric_value);
+        } else if (c10::isIntegralType(data_type, false)) {
+          auto scalar_numeric_value = metadata["scalar_value"].cast<int64_t>();
+          data_type = c10::ScalarType::UInt64;
+          scalar_value = c10::Scalar(scalar_numeric_value);
+        } else if (data_type == c10::ScalarType::Bool) {
+          auto scalar_numeric_value = metadata["scalar_value"].cast<bool>();
+          data_type = c10::ScalarType::Bool;
+          scalar_value = c10::Scalar(scalar_numeric_value);
+        } else {
+          TORCH_CHECK(
+              false,
+              "Unsupported scalar tensor type: ",
+              c10::toString(data_type));
+        }
+      }
+
       tensor_metadata_list.emplace_back(
           is_dynamic,
           data_type,
+          c10::IValue(scalar_value),
           c10::Device(c10::Device(device_type).type(), device_index),
           sizes,
           strides);
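For intuition about the index bookkeeping above, the following is a hypothetical Python model (not code from this PR) of what `get_tensor_parameter_index` together with the scalar-aware `unpack_tensors` produce for a call like `aten::add.Tensor(self, other, alpha)`: every unpacked input keeps the stack index of the schema argument it came from, so `get_inputs_metadata` can check whether that argument is a `Scalar`.

```python
# Hypothetical model of the C++ index mapping; the stack entries are stand-ins.
stack = [
    ("tensor", "self"),   # stack idx 0
    ("tensor", "other"),  # stack idx 1
    ("scalar", "alpha"),  # stack idx 2
]

# With with_scalar=True, scalars are unpacked alongside tensors...
inputs = [entry for entry in stack if entry[0] in ("tensor", "scalar")]
# ...and each unpacked input remembers which schema argument produced it.
tensor_parameter_index = [
    idx for idx, entry in enumerate(stack) if entry[0] in ("tensor", "scalar")
]

assert len(inputs) == len(tensor_parameter_index)
print(tensor_parameter_index)  # [0, 1, 2] -> arguments[2] is the Scalar `alpha`
```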

torch/csrc/inductor/aoti_eager/kernel_holder.h

Lines changed: 5 additions & 1 deletion
@@ -3,6 +3,7 @@
 
 #include <ATen/ATen.h>
 #include <ATen/core/boxing/KernelFunction.h>
+#include <ATen/core/function_schema.h>
 
 #include <torch/csrc/dynamo/guards.h>
 #include <torch/csrc/inductor/aoti_eager/kernel_meta_info.h>
@@ -82,7 +83,10 @@ class AOTIPythonKernelHolder : public c10::OperatorKernel {
   void init_aoti_kernel_cache();
   // Abstract the meta information of each tensor for the given operation. The
   // meta infomation will be used for cache lookup as the key.
-  AOTIKernelMetadata get_inputs_metadata(const std::vector<at::Tensor>&);
+  AOTIKernelMetadata get_inputs_metadata(
+      const std::vector<at::Tensor>& inputs,
+      const std::vector<c10::Argument>& inputs_argument,
+      const std::vector<size_t>& inputs_argument_index);
   // Load the AOTIModelContainerRunner object from the given file path.
   std::shared_ptr<AOTIModelContainerRunner> load_aoti_model_runner(
       const std::string&);

0 commit comments
