Qualcomm AI Engine Direct - Refine max spill fill buffer setting (#6041)

pytorchbot · Sheng Feng Wu · web-flow · commit b73fb1e43730 · 2024-10-09T10:10:35.000-07:00
Qualcomm AI Engine Direct - Refine max spill fill buffer setting (#5989) Summary: - Get the required spillFillBufferSize from the context binary and set it in compiler_spec. - Quantize the embedding op in QNN. - If multi-contexts is enabled, maxSpillFillBuffer cannot be set to zero. Pull Request resolved: #5989 Reviewed By: kirklandsign Differential Revision: D64056107 Pulled By: cccclai fbshipit-source-id: 9f9846e6ac7b4a27d734d2812ac3bbad32fb194f (cherry picked from commit 01fcdf4) Co-authored-by: Sheng Feng Wu <shewu@qti.qualcomm.com>
diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.cpp
@@ -35,7 +35,8 @@ PYBIND11_MODULE(PyQnnManagerAdaptor, m) {
       .def("IsTensorDump", &PyQnnManager::IsTensorDump)
       .def("AllocateTensor", &PyQnnManager::AllocateTensor)
       .def("GetGraphInputs", &PyQnnManager::GetGraphInputs)
-      .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs);
+      .def("GetGraphOutputs", &PyQnnManager::GetGraphOutputs)
+      .def("GetSpillFillBufferSize", &PyQnnManager::GetSpillFillBufferSize);
 }
 } // namespace qnn
 } // namespace executor
diff --git a/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h b/backends/qualcomm/aot/python/PyQnnManagerAdaptor.h
@@ -177,6 +177,10 @@ class PyQnnManager {
     return ret;
   }
 
+  uint64_t GetSpillFillBufferSize() {  // bound to Python in the pybind module
+    return qnn_manager_->GetSpillFillBufferSize();  // forwards to qnn_manager_
+  }
+
  private:
   // Store the bytes object instead of a raw pointer so that this module will
   // keep the bytes alive.
diff --git a/backends/qualcomm/runtime/QnnManager.cpp b/backends/qualcomm/runtime/QnnManager.cpp
@@ -283,6 +283,10 @@ Error QnnManager::Init() {
         qnn_loaded_backend_, logger_.get(), qnn_context_blob_, options_);
     ET_CHECK_OR_RETURN_ERROR(
         backend_params_ptr_ != nullptr, Internal, "Failed to load Qnn backend.")
+    ET_CHECK_OR_RETURN_ERROR(
+        backend_params_ptr_->qnn_backend_cache_ptr_->Configure() == Error::Ok,
+        Internal,
+        "Fail to configure Qnn backend cache");
     ET_CHECK_OR_RETURN_ERROR(
         backend_params_ptr_->qnn_backend_ptr_->Configure() == Error::Ok,
         Internal,
diff --git a/backends/qualcomm/runtime/QnnManager.h b/backends/qualcomm/runtime/QnnManager.h
@@ -70,6 +70,12 @@ class QnnManager {
   // Pre-register custom memory handle from the SharedBuffer before execution
   Error PreRegisterMem();
 
+  uint64_t GetSpillFillBufferSize() {  // 0 when no backend cache exists yet
+    auto* cache = !backend_params_ptr_ ? nullptr : static_cast<HtpBackendCache*>(
+        backend_params_ptr_->qnn_backend_cache_ptr_.get());
+    return cache != nullptr ? cache->GetSpillFillBufferSize() : 0;
+  }
+
   std::vector<std::shared_ptr<TensorWrapper>> GetGraphInputs() {
     return input_tensors_;
   }
diff --git a/backends/qualcomm/runtime/backends/CMakeLists.txt b/backends/qualcomm/runtime/backends/CMakeLists.txt
@@ -77,7 +77,9 @@ target_sources(
 target_sources(
   qnn_backend_cache
   PUBLIC ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCache.h
+         ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpBackendCache.h
   PRIVATE ${CMAKE_CURRENT_LIST_DIR}/QnnBackendCache.cpp
+          ${CMAKE_CURRENT_LIST_DIR}/htpbackend/HtpBackendCache.cpp
 )
 
 # qnn_graph
@@ -130,6 +132,7 @@ set(qnn_header_basenames
     HTP/QnnHtpPerfInfrastructure.h
     HTP/QnnHtpProfile.h
     HTP/QnnHtpProperty.h
+    HTP/QnnHtpSystemContext.h
     QnnInterface.h
     QnnLog.h
     QnnMem.h
diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.cpp b/backends/qualcomm/runtime/backends/QnnBackendCache.cpp
@@ -28,13 +28,20 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() {
 
   if (error != QNN_SUCCESS) {
     QNN_EXECUTORCH_LOG_WARN(
-        "Failed to interpret QNN Context "
+        "Failed to interpret QNN context "
         "binary. Error code %d. "
         "Try verifying binary with online-prepare format.",
         QNN_GET_ERROR_CODE(error));
     return Error::Internal;
   }
 
+  Error status = RetrieveBackendBinaryInfo(binaryinfo);
+  if (status == Error::Internal) {
+    QNN_EXECUTORCH_LOG_ERROR(
+        "Failed to retrieve backend binary info from QNN context binary.");
+    return Error::Internal;
+  }
+
   if (binaryinfo->version == QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1) {
     num_graphs = binaryinfo->contextBinaryInfoV1.numGraphs;
     graph = binaryinfo->contextBinaryInfoV1.graphs;
@@ -81,20 +88,18 @@ Error QnnBackendCache::GetQnnGraphInfoFromBinary() {
   return Error::Ok;
 }
 
-QnnBackendCache::QnnBackendCache(
-    const QnnExecuTorchContextBinary& qnn_context_blob)
-    : qnn_context_blob_(qnn_context_blob) {
+Error QnnBackendCache::Configure() {
   if (qnn_context_blob_.buffer == nullptr) {
     state_ = SERIALIZE;
     QNN_EXECUTORCH_LOG_INFO("Caching: Caching is in SAVE MODE.");
-    return;
+    return Error::Ok;
   }
 
   if (qnn_sys_impl_.Load() != Error::Ok) {
     QNN_EXECUTORCH_LOG_ERROR(
         "Failed to Load QnnSystem "
         "APIs. Caching mechanism is being disabled.");
-    return;
+    return Error::Internal;
   }
 
   Qnn_ErrorHandle_t error = QNN_SUCCESS;
@@ -109,7 +114,7 @@ QnnBackendCache::QnnBackendCache(
         "Failed to create Qnn "
         "SystemContext. Caching mechanism will be disabled. Error code %d",
         QNN_GET_ERROR_CODE(error));
-    return;
+    return Error::Internal;
   }
 
   // DO DESERIALIZE
@@ -125,16 +130,16 @@ QnnBackendCache::QnnBackendCache(
 
     if (qcir::VerifyGraphBuffer(verifier)) {
       state_ = ONLINE_PREPARE;
-      return;
+      return Error::Ok;
     }
 
     QNN_EXECUTORCH_LOG_ERROR(
         "Failed to parse QNN Graph Info. The cache "
         "might be broken. Please consider to re-generate the "
         "cache.");
     InvalidateCache();
-    return;
   }
+  return Error::Ok;
 }
 
 QnnBackendCache::~QnnBackendCache() {
diff --git a/backends/qualcomm/runtime/backends/QnnBackendCache.h b/backends/qualcomm/runtime/backends/QnnBackendCache.h
@@ -23,9 +23,9 @@ class QnnBackendCache {
     DESERIALIZE = 2,
     ONLINE_PREPARE = 3,
   };
-  explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob);
-
-  ~QnnBackendCache();
+  explicit QnnBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob)
+      : qnn_context_blob_(qnn_context_blob) {}
+  virtual ~QnnBackendCache();
   QnnBackendCache(const QnnBackendCache&) = delete;
   QnnBackendCache(QnnBackendCache&&) = delete;
   QnnBackendCache& operator=(const QnnBackendCache&) = delete;
@@ -51,6 +51,14 @@ class QnnBackendCache {
     return graph_name_;
   }
 
+  Error Configure();
+
+ protected:
+  virtual Error RetrieveBackendBinaryInfo(  // hook for backend-specific data
+      __ET_UNUSED const QnnSystemContext_BinaryInfo_t* binaryinfo) {
+    return Error::Ok;  // default: ignore binaryinfo and report success
+  }
+
  private:
   Error GetQnnGraphInfoFromBinary();
 
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp b/backends/qualcomm/runtime/backends/QnnBackendFactory.cpp
@@ -56,11 +56,14 @@ std::unique_ptr<BackendConfigParameters> QnnBackendFactory::Create(
       backend_params->qnn_device_ptr_ = std::make_unique<HtpDevice>(
           implementation, logger, options->soc_info(), htp_options);
 
+      backend_params->qnn_backend_cache_ptr_ =
+          std::make_unique<HtpBackendCache>(qnn_context_blob);
+
       backend_params->qnn_context_ptr_ = std::make_unique<HtpContext>(
           implementation,
           backend_params->qnn_backend_ptr_.get(),
           backend_params->qnn_device_ptr_.get(),
-          qnn_context_blob,
+          backend_params->qnn_backend_cache_ptr_.get(),
           htp_options);
 
       backend_params->qnn_graph_ptr_ = std::make_unique<HtpGraph>(
diff --git a/backends/qualcomm/runtime/backends/QnnBackendFactory.h b/backends/qualcomm/runtime/backends/QnnBackendFactory.h
@@ -8,6 +8,7 @@
 #pragma once
 
 #include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
+#include <executorch/backends/qualcomm/runtime/backends/QnnBackendCache.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnBackendCommon.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnContextCommon.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnDeviceCommon.h>
@@ -16,6 +17,7 @@
 #include <executorch/backends/qualcomm/runtime/backends/QnnLogger.h>
 #include <executorch/backends/qualcomm/runtime/backends/QnnMemManager.h>
 #include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpBackend.h>
+#include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h>
 #include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h>
 #include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpDevice.h>
 #include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpGraph.h>
@@ -35,6 +37,7 @@ typedef struct BackendConfigParameters {
   std::unique_ptr<QnnDevice> qnn_device_ptr_;
   std::unique_ptr<QnnGraph> qnn_graph_ptr_;
   std::unique_ptr<QnnMemManager> qnn_mem_manager_ptr_;
+  std::unique_ptr<QnnBackendCache> qnn_backend_cache_ptr_;
 
   // Default ctor
   BackendConfigParameters()
@@ -43,10 +46,12 @@ typedef struct BackendConfigParameters {
         qnn_context_ptr_(nullptr),
         qnn_device_ptr_(nullptr),
         qnn_graph_ptr_(nullptr),
-        qnn_mem_manager_ptr_(nullptr) {}
+        qnn_mem_manager_ptr_(nullptr),
+        qnn_backend_cache_ptr_(nullptr) {}
   // Default dtor
   ~BackendConfigParameters() {
     qnn_graph_ptr_.reset();
+    qnn_backend_cache_ptr_.reset();
     qnn_mem_manager_ptr_.reset();
     qnn_context_ptr_.reset();
     qnn_device_ptr_.reset();
diff --git a/backends/qualcomm/runtime/backends/QnnContextCommon.h b/backends/qualcomm/runtime/backends/QnnContextCommon.h
@@ -22,13 +22,12 @@ class QnnContext {
       const QnnImplementation& implementation,
       QnnBackend* backend,
       QnnDevice* device,
-      const QnnExecuTorchContextBinary& qnn_context_blob)
+      QnnBackendCache* cache)
       : handle_(nullptr),
         implementation_(implementation),
         backend_(backend),
-        device_(device) {
-    cache_ = std::make_unique<QnnBackendCache>(qnn_context_blob);
-  }
+        device_(device),
+        cache_(cache) {}
 
   virtual ~QnnContext();
   Error Configure();
@@ -67,7 +66,7 @@ class QnnContext {
   const QnnImplementation& implementation_;
   QnnBackend* backend_;
   QnnDevice* device_;
-  std::unique_ptr<QnnBackendCache> cache_;
+  QnnBackendCache* cache_;
   std::vector<char> binary_buffer_;
 };
 } // namespace qnn
diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.cpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#include <executorch/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h>
+#include "HTP/QnnHtpSystemContext.h"
+
+namespace torch {
+namespace executor {
+namespace qnn {
+// Extracts HTP-specific data from the parsed context binary info: caches the
+// spill-fill buffer size reported by the hardware info blob, if one exists.
+Error HtpBackendCache::RetrieveBackendBinaryInfo(
+    const QnnSystemContext_BinaryInfo_t* binaryinfo) {
+  QnnHtpSystemContext_HwBlobInfo_t* hw_blob = nullptr;
+  switch (binaryinfo->version) {
+    case QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_1:
+      hw_blob = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>(
+          binaryinfo->contextBinaryInfoV1.hwInfoBlob);
+      break;
+    case QNN_SYSTEM_CONTEXT_BINARY_INFO_VERSION_2:
+      hw_blob = static_cast<QnnHtpSystemContext_HwBlobInfo_t*>(
+          binaryinfo->contextBinaryInfoV2.hwInfoBlob);
+      break;
+    default:
+      QNN_EXECUTORCH_LOG_WARN(
+          "Unknown QNN BinaryInfo version %d.", binaryinfo->version);
+      return Error::Internal;
+  }
+
+  // A binary without the HTP blob is tolerated; the cached size is untouched.
+  if (hw_blob == nullptr) {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Htp hardware blob information is not found in binary information.");
+    return Error::Ok;
+  }
+  if (hw_blob->version != QNN_SYSTEM_CONTEXT_HTP_HW_INFO_BLOB_VERSION_V1) {
+    QNN_EXECUTORCH_LOG_WARN(
+        "Unknown QNN Htp hw blob info version %d.", hw_blob->version);
+    return Error::Internal;
+  }
+  spill_fill_buf_ = hw_blob->contextBinaryHwInfoBlobV1_t.spillFillBufferSize;
+  return Error::Ok;
+}
+
+} // namespace qnn
+} // namespace executor
+} // namespace torch
diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h b/backends/qualcomm/runtime/backends/htpbackend/HtpBackendCache.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) Qualcomm Innovation Center, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+#pragma once
+#include <executorch/backends/qualcomm/runtime/backends/QnnBackendCache.h>
+
+namespace torch {
+namespace executor {
+namespace qnn {
+// QnnBackendCache for HTP: also records the spill-fill buffer size found in
+// the context binary's HTP hardware info blob (0 until one is parsed).
+class HtpBackendCache : public QnnBackendCache {
+ public:
+  explicit HtpBackendCache(const QnnExecuTorchContextBinary& qnn_context_blob)
+      : QnnBackendCache(qnn_context_blob) {}
+  ~HtpBackendCache() override = default;
+
+  uint64_t GetSpillFillBufferSize() const { return spill_fill_buf_; }
+
+ protected:
+  Error RetrieveBackendBinaryInfo(
+      const QnnSystemContext_BinaryInfo_t* binaryinfo) override;
+
+ private:
+  uint64_t spill_fill_buf_{0};
+};
+} // namespace qnn
+} // namespace executor
+} // namespace torch
diff --git a/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h b/backends/qualcomm/runtime/backends/htpbackend/HtpContext.h
@@ -22,9 +22,9 @@ class HtpContext : public QnnContext {
       const QnnImplementation& implementation,
       QnnBackend* backend,
       QnnDevice* device,
-      const QnnExecuTorchContextBinary& qnn_context_blob,
+      QnnBackendCache* cache,
       const QnnExecuTorchHtpBackendOptions* htp_options)
-      : QnnContext(implementation, backend, device, qnn_context_blob) {
+      : QnnContext(implementation, backend, device, cache) {
     htp_context_custom_config_ =
         std::make_unique<HtpContextCustomConfig>(this, htp_options);
   }
diff --git a/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp b/backends/qualcomm/runtime/backends/htpbackend/aarch64/HtpContextCustomConfig.cpp
@@ -19,7 +19,8 @@ HtpContextCustomConfig::CreateContextCustomConfig() {
   QnnHtpContext_CustomConfig_t* p_custom_config = nullptr;
   const HtpContext* htp_ctx = static_cast<const HtpContext*>(context_);
 
-  if (htp_options_->use_multi_contexts()) {
+  if (htp_options_->use_multi_contexts() &&
+      htp_options_->max_sf_buf_size() != 0) {
     p_custom_config = AllocContextCustomConfig();
     p_custom_config->option =
         QNN_HTP_CONTEXT_CONFIG_OPTION_REGISTER_MULTI_CONTEXTS;
diff --git a/backends/qualcomm/utils/utils.py b/backends/qualcomm/utils/utils.py
@@ -208,16 +208,32 @@ def process_exported_program(prog):
                     == QnnExecuTorchBackendType.kHtpBackend
                     and options.backend_options.htp_options.use_multi_contexts
                 ):
-                    max_sf_buf_size = max(max_sf_buf_size, len(m.processed_bytes))
+                    qnn_mgr = PyQnnManagerAdaptor.QnnManager(
+                        m.compile_specs[0].value, m.processed_bytes
+                    )
+                    # Call Init() outside the assert: `python -O` strips asserts.
+                    init_status = qnn_mgr.Init().value
+                    assert init_status == 0, "failed to load context binary"
+                    max_sf_buf_size = max(
+                        max_sf_buf_size, qnn_mgr.GetSpillFillBufferSize()
+                    )
                     module_map[m] = options
+                    qnn_mgr.Destroy()
             return max_sf_buf_size, module_map
 
         def process_lowered_module(module):
+            qnn_mgr = PyQnnManagerAdaptor.QnnManager(
+                module.compile_specs[0].value, module.processed_bytes
+            )
+            # Call Init() outside the assert: `python -O` strips asserts.
+            init_status = qnn_mgr.Init().value
+            assert init_status == 0, "failed to load context binary"
             spill_fill_size = (
-                len(module.processed_bytes)
+                qnn_mgr.GetSpillFillBufferSize()
                 if custom_buffer_size is None
                 else custom_buffer_size
             )
+            qnn_mgr.Destroy()
             return spill_fill_size, {
                 module: convert_to_option(module.compile_specs[0].value)
             }
diff --git a/examples/models/llama2/export_llama_lib.py b/examples/models/llama2/export_llama_lib.py
diff --git a/extension/llm/export/partitioner_lib.py b/extension/llm/export/partitioner_lib.py
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py