Skip to content

Commit 4155632

Browse files
desertfire authored and pytorchmergebot committed
[cpp_wrapper] Change CppWrapperCodeCache to use faster python binding (pytorch#117693)
Summary: Using faster binding following pytorch#117500. torch.utils.cpp_extension.load_inline builds a lot of things and is very slow. With this change, later we can further reduce the included header files using the ABI-compatible mode and thus further speed up the compilation. Result: ``` python test/inductor/test_cuda_cpp_wrapper.py -k test_relu_cuda_cuda_wrapper Before: Ran 1 test in 32.843s After: Ran 1 test in 26.229s ``` Pull Request resolved: pytorch#117693 Approved by: https://github.com/jansel
1 parent 7f474da commit 4155632

File tree

2 files changed

+65
-98
lines changed

2 files changed

+65
-98
lines changed

torch/_inductor/codecache.py

Lines changed: 56 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@
3333
from ctypes import c_void_p, cdll, CDLL
3434
from dataclasses import field
3535
from functools import partial
36-
from importlib import abc
3736
from pathlib import Path
3837
from threading import Thread
3938
from time import sleep, time
@@ -46,7 +45,7 @@
4645
get_interface_for_device,
4746
get_registered_device_interfaces,
4847
)
49-
from torch._dynamo.utils import counters
48+
from torch._dynamo.utils import counters, dynamo_timed
5049
from torch._inductor import config, exc
5150
from torch._inductor.codegen.cuda import cuda_env
5251
from torch._inductor.utils import cache_dir, developer_warning, is_linux
@@ -1701,6 +1700,7 @@ def cpp_prefix() -> str:
17011700

17021701
# Given a path to an input cpp file and an output path,
17031702
# Attempts to compile the file, storing the output in "output_path"
1703+
@dynamo_timed
17041704
def compile_file(
17051705
input_path: Union[str, List[str]], output_path: str, cmd: List[str]
17061706
) -> None:
@@ -1783,7 +1783,8 @@ def _load_library(cls, path: str, key: str) -> Union[CDLL, ModuleType]:
17831783
raise
17841784

17851785
@classmethod
1786-
def load(cls, source_code: str) -> Union[CDLL, ModuleType]:
1786+
def load(cls, source_code: str, cuda: bool = False) -> Union[CDLL, ModuleType]:
1787+
cls.cpp_compile_command_flags.update({"cuda": cuda})
17871788
picked_vec_isa = pick_vec_isa()
17881789
cpp_command = repr(
17891790
cpp_compile_command(
@@ -1821,9 +1822,12 @@ class CppPythonBindingsCodeCache(CppCodeCache):
18211822
"include_pytorch": True,
18221823
"shared": True,
18231824
}
1825+
entry_function = "kernel"
1826+
call_entry_function = "kernel(%s);Py_RETURN_NONE;"
1827+
extra_parse_arg = ""
18241828
suffix_template = textwrap.dedent(
18251829
"""
1826-
// Python bindings to call kernel():
1830+
// Python bindings to call %s():
18271831
#define PY_SSIZE_T_CLEAN
18281832
#include <Python.h>
18291833
#include <sstream>
@@ -1844,14 +1848,15 @@ class CppPythonBindingsCodeCache(CppCodeCache):
18441848
return result;
18451849
}
18461850
1847-
static PyObject* kernel_py(PyObject* self, PyObject* args) {
1851+
%s
1852+
1853+
static PyObject* %s_py(PyObject* self, PyObject* args) {
18481854
try {
18491855
if(!PyTuple_CheckExact(args))
18501856
[[unlikely]] throw std::runtime_error("tuple args required");
18511857
if(PyTuple_GET_SIZE(args) != %s)
18521858
[[unlikely]] throw std::runtime_error("requires %s args");
1853-
kernel(%s);
1854-
Py_RETURN_NONE;
1859+
%s
18551860
} catch(std::exception const& e) {
18561861
PyErr_SetString(PyExc_RuntimeError, e.what());
18571862
return nullptr;
@@ -1862,13 +1867,13 @@ class CppPythonBindingsCodeCache(CppCodeCache):
18621867
}
18631868
18641869
static PyMethodDef py_methods[] = {
1865-
{"kernel", kernel_py, METH_VARARGS, ""},
1870+
{"%s", %s_py, METH_VARARGS, ""},
18661871
{NULL, NULL, 0, NULL}};
18671872
18681873
static struct PyModuleDef py_module =
1869-
{PyModuleDef_HEAD_INIT, "kernel", NULL, -1, py_methods};
1874+
{PyModuleDef_HEAD_INIT, "%s", NULL, -1, py_methods};
18701875
1871-
PyMODINIT_FUNC PyInit_kernel(void) {
1876+
PyMODINIT_FUNC PyInit_%s(void) {
18721877
const char* str_addr = std::getenv("_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR");
18731878
if(!str_addr) {
18741879
PyErr_SetString(PyExc_RuntimeError, "_TORCHINDUCTOR_PYOBJECT_TENSOR_DATA_PTR must be set");
@@ -1890,29 +1895,62 @@ def _load_library_inner(cls, path: str, key: str) -> ModuleType:
18901895
torch._C._dynamo.guards._torchinductor_pyobject_tensor_data_ptr # type: ignore[attr-defined]
18911896
)
18921897
return importlib.machinery.ExtensionFileLoader(
1893-
f"{key}.kernel", path
1898+
f"{key}.{cls.entry_function}", path
18941899
).load_module() # type: ignore[call-arg]
18951900

18961901
@classmethod
1897-
def load_pybinding(cls, argtypes: List[str], source_code: str) -> Any:
1902+
def load_pybinding(
1903+
cls, argtypes: List[str], source_code: str, cuda: bool = False
1904+
) -> Any:
18981905
"""
18991906
Wrap a C++ function in fast Python bindings.
19001907
19011908
Args:
1902-
argtypes: The types of args to kernel(), e.g. ["float*", "long"]
1903-
source_code: C++ source code containing a kernel() function
1909+
argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
1910+
source_code: C++ source code containing a ENTRY_FUNCTION() function
19041911
19051912
Returns:
1906-
A python version of kernel()
1913+
A python version of ENTRY_FUNCTION()
19071914
"""
19081915
parseargs = ", ".join(
19091916
f"parse_arg<{argtype.replace('const ', '')}>(args, {n})"
19101917
for n, argtype in enumerate(argtypes)
19111918
)
1912-
suffix = cls.suffix_template % (len(argtypes), len(argtypes), parseargs)
1913-
result = cls.load(source_code + suffix)
1919+
suffix = cls.suffix_template % (
1920+
cls.entry_function,
1921+
cls.extra_parse_arg,
1922+
cls.entry_function,
1923+
len(argtypes),
1924+
len(argtypes),
1925+
cls.call_entry_function % parseargs,
1926+
cls.entry_function,
1927+
cls.entry_function,
1928+
cls.entry_function,
1929+
cls.entry_function,
1930+
)
1931+
result = cls.load(source_code + suffix, cuda)
19141932
assert isinstance(result, ModuleType)
1915-
return result.kernel
1933+
return getattr(result, cls.entry_function)
1934+
1935+
1936+
class CppWrapperCodeCache(CppPythonBindingsCodeCache):
1937+
cache: Dict[str, Union[CDLL, ModuleType]] = {}
1938+
clear = staticmethod(cache.clear)
1939+
cpp_compile_command_flags = {
1940+
"include_pytorch": True,
1941+
"shared": True,
1942+
}
1943+
entry_function = "inductor_entry_cpp"
1944+
call_entry_function = "return THPVariable_WrapList(inductor_entry_cpp(%s));"
1945+
extra_parse_arg = textwrap.dedent(
1946+
"""
1947+
#include <torch/csrc/autograd/python_variable.h>
1948+
1949+
template <> inline std::vector<at::Tensor> parse_arg<std::vector<at::Tensor>>(PyObject* args, size_t n) {
1950+
return THPVariable_UnpackList(PyTuple_GET_ITEM(args, n));
1951+
}
1952+
"""
1953+
)
19161954

19171955

19181956
class PyCodeCache:
@@ -1998,81 +2036,6 @@ def parse_stack_trace(stack_trace: str) -> List[Dict[str, Any]]:
19982036
return parse_stack_trace(entry)
19992037

20002038

2001-
class CppWrapperCodeCache:
2002-
cache: Dict[str, CDLL] = dict()
2003-
clear = staticmethod(cache.clear)
2004-
2005-
@classmethod
2006-
def load(cls, source_code: str, func_name: str, key: str, cuda: bool) -> CDLL:
2007-
name = f"inline_extension_{key}"
2008-
cpp_wrapper_dir = cpp_wrapper_cache_dir(name)
2009-
os.makedirs(cpp_wrapper_dir, exist_ok=True)
2010-
2011-
ext = "so"
2012-
filepath = os.path.join(cpp_wrapper_dir, f"{name}.{ext}")
2013-
log.debug("Cpp wrapper code path %s", filepath)
2014-
2015-
if key not in cls.cache:
2016-
log.debug("Cpp wrapper cache miss for %s", filepath)
2017-
from filelock import FileLock
2018-
2019-
lock_dir = get_lock_dir()
2020-
lock = FileLock(os.path.join(lock_dir, key + ".lock"), timeout=LOCK_TIMEOUT)
2021-
with lock:
2022-
if not os.path.exists(filepath):
2023-
log.debug("Cpp wrapper building %s", filepath)
2024-
2025-
_cpp_flags = cpp_flags()
2026-
_opt_flags = optimization_flags()
2027-
_shared = get_shared()
2028-
_warning_all_flag = get_warning_all_flag()
2029-
(
2030-
_ipaths,
2031-
_lpaths,
2032-
_libs,
2033-
_macros,
2034-
_build_arch_flags,
2035-
) = get_include_and_linking_paths(
2036-
vec_isa=pick_vec_isa(),
2037-
cuda=cuda,
2038-
)
2039-
_use_custom_generated_macros = use_custom_generated_macros()
2040-
_cpp_wrapper_flags = cpp_wrapper_flags()
2041-
2042-
extra_cflags = f"{_cpp_flags} {_opt_flags} {_warning_all_flag} {_build_arch_flags} {_macros} \
2043-
{_cpp_wrapper_flags} {_use_custom_generated_macros}"
2044-
# For CPP wrapper, add -ffast-math during linking to make CPU flush denormals.
2045-
# CPP wrapper leverages cpp_extension which will do the compilation and linking in two stages.
2046-
# We need to explicitly add -ffast-math as a linking flag.
2047-
# For the default python wrapper, the compilation and linking are done in one command thus -ffast-math
2048-
# will take effect in both compilation and linking.
2049-
extra_ldflags = f"{_shared} {_lpaths} {_libs} -ffast-math"
2050-
2051-
mod = torch.utils.cpp_extension.load_inline(
2052-
name=name,
2053-
build_directory=cpp_wrapper_dir,
2054-
cpp_sources=[source_code],
2055-
functions=[func_name],
2056-
extra_cflags=[extra_cflags],
2057-
extra_ldflags=[extra_ldflags],
2058-
extra_include_paths=_ipaths,
2059-
use_pch=True,
2060-
)
2061-
log.debug("Cpp wrapper done building %s", filepath)
2062-
else:
2063-
log.debug("Found target .so, cpp wrapper loading %s", filepath)
2064-
spec = importlib.util.spec_from_file_location(name, filepath) # type: ignore[attr-defined]
2065-
assert spec is not None
2066-
mod = importlib.util.module_from_spec(spec) # type: ignore[attr-defined]
2067-
assert isinstance(spec.loader, abc.Loader)
2068-
spec.loader.exec_module(mod)
2069-
log.debug("Cpp wrapper done loading %s", filepath)
2070-
2071-
cls.cache[key] = mod
2072-
2073-
return cls.cache[key]
2074-
2075-
20762039
class TritonCodeCache:
20772040
@classmethod
20782041
def load(cls, kernel_name: str, source_code: str) -> ModuleType:

torch/_inductor/codegen/wrapper.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1449,6 +1449,12 @@ def write_header(self):
14491449
if config.aot_inductor.abi_compatible:
14501450
self.header.splice("#include <torch/csrc/inductor/aoti_torch/c/shim.h>")
14511451
else:
1452+
if not V.graph.aot_mode:
1453+
self.header.splice(
1454+
"""
1455+
#include <pybind11/pybind11.h>
1456+
"""
1457+
)
14521458
self.header.splice(
14531459
"""
14541460
#include <ATen/ATen.h>
@@ -1622,7 +1628,7 @@ def write_wrapper_decl(self):
16221628
else:
16231629
self.prefix.splice(
16241630
"""
1625-
py::gil_scoped_release release;
1631+
pybind11::gil_scoped_release release;
16261632
"""
16271633
)
16281634

@@ -1978,11 +1984,9 @@ def generate_end(self, result):
19781984
return
19791985

19801986
result.writeline("'''\n)")
1981-
# get the hash of the wrapper code to name the extension
1982-
wrapper_call_hash = codecache.code_hash(result.getvalue())
19831987
result.splice(
19841988
f"""
1985-
module = CppWrapperCodeCache.load(cpp_wrapper_src, '{self.call_func_name}', '{wrapper_call_hash}', {self.cuda})
1989+
inductor_entry = CppWrapperCodeCache.load_pybinding(["std::vector<at::Tensor>"], cpp_wrapper_src, {self.cuda})
19861990
"""
19871991
)
19881992

@@ -2024,7 +2028,7 @@ def g(args):
20242028
{args_str}
20252029
{return_str}
20262030
return g
2027-
call = _wrap_func(module.{self.call_func_name})
2031+
call = _wrap_func(inductor_entry)
20282032
"""
20292033
)
20302034

0 commit comments

Comments (0)