Skip to content

Commit 23709f2

Browse files
committed
support wasm vm recover
Link: https://code.alibaba-inc.com/Ingress/proxy-wasm-cpp-host/codereview/14096772 * support recover crash vm * fix * add log * optimize * add compdb repo * optimize * fix * add updateWasmHandle in pluginHandle * make doRecover be virtual * remove useless code * use shared_ptr * fix * fix bg * optimize recover logic * fix * fix bug * optimize * optimize code * optimize * add UT * optimize code * set pluginhandle virtual * optimize the recover logic
1 parent 38c94a1 commit 23709f2

File tree

8 files changed

+337
-19
lines changed

8 files changed

+337
-19
lines changed

.bazelrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
build:compdb --build_tag_filters=-nocompdb
12
# Pass CC, CXX and PATH from the environment.
23
build --action_env=CC
34
build --action_env=CXX

bazel/repositories.bzl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
1919
def proxy_wasm_cpp_host_repositories():
2020
# Bazel extensions.
2121

22+
maybe(
23+
http_archive,
24+
name = "bazel_compdb",
25+
strip_prefix = "bazel-compilation-database-0.5.2",
26+
url = "https://github.com/grailbio/bazel-compilation-database/archive/0.5.2.tar.gz",
27+
)
28+
2229
maybe(
2330
http_archive,
2431
name = "bazel_skylib",

include/proxy-wasm/context.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ class ContextBase : public RootInterface,
151151
virtual ~ContextBase();
152152

153153
WasmBase *wasm() const { return wasm_; }
154+
// Drops this context's back-pointer to its WasmBase by setting wasm_ to nullptr.
void clearWasm() { wasm_ = nullptr; }
154155
uint32_t id() const { return id_; }
155156
// The VM Context used for calling "malloc" has an id_ == 0.
156157
bool isVmContext() const { return id_ == 0; }

include/proxy-wasm/wasm.h

Lines changed: 60 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,11 @@ class WasmBase : public std::enable_shared_from_this<WasmBase> {
7373
return it->second;
7474
return nullptr;
7575
}
76+
void clearWasmInContext() {
77+
for (auto &item : contexts_) {
78+
item.second->clearWasm();
79+
}
80+
}
7681
uint32_t allocContextId();
7782
bool isFailed() { return failed_ != FailState::Ok; }
7883
FailState fail_state() { return failed_; }
@@ -323,7 +328,7 @@ using WasmHandleCloneFactory =
323328
class WasmHandleBase : public std::enable_shared_from_this<WasmHandleBase> {
324329
public:
325330
explicit WasmHandleBase(std::shared_ptr<WasmBase> wasm_base) : wasm_base_(wasm_base) {}
326-
~WasmHandleBase() {
331+
virtual ~WasmHandleBase() {
327332
if (wasm_base_) {
328333
wasm_base_->startShutdown();
329334
}
@@ -336,8 +341,31 @@ class WasmHandleBase : public std::enable_shared_from_this<WasmHandleBase> {
336341

337342
std::shared_ptr<WasmBase> &wasm() { return wasm_base_; }
338343

344+
// Exchanges the WasmBase owned by this handle with the one owned by
// new_handle. Both handle objects stay alive; only their wasm_base_ pointers
// trade places. new_handle must not be empty.
virtual void swap(std::shared_ptr<WasmHandleBase> &new_handle) {
  new_handle->wasm_base_.swap(wasm_base_);
}
347+
348+
// Installs the callable that doRecover() invokes to obtain a replacement
// handle. The argument is moved into recover_vm_callback_, replacing any
// callable stored earlier.
void setRecoverVmCallback(std::function<std::shared_ptr<WasmHandleBase>()> &&f) {
349+
recover_vm_callback_ = std::move(f);
350+
}
351+
// Marks this handle as requiring recovery; doRecover() only acts while the flag is set.
void setNeedRecover() { need_recover_ = true; }
352+
// Returns true while a recovery has been requested and doRecover() has not yet completed it.
bool needRecover() { return need_recover_; }
353+
bool doRecover(std::shared_ptr<WasmHandleBase> &new_handle) {
354+
if (!need_recover_ || recover_vm_callback_ == nullptr) {
355+
return true;
356+
}
357+
new_handle = recover_vm_callback_();
358+
if (!new_handle) {
359+
return false;
360+
}
361+
need_recover_ = false;
362+
return true;
363+
}
364+
339365
protected:
366+
bool need_recover_ = false;
340367
std::shared_ptr<WasmBase> wasm_base_;
368+
std::function<std::shared_ptr<WasmHandleBase>()> recover_vm_callback_;
341369
};
342370

343371
std::string makeVmKey(std::string_view vm_id, std::string_view configuration,
@@ -357,18 +385,48 @@ class PluginHandleBase : public std::enable_shared_from_this<PluginHandleBase> {
357385
explicit PluginHandleBase(std::shared_ptr<WasmHandleBase> wasm_handle,
358386
std::shared_ptr<PluginBase> plugin)
359387
: plugin_(plugin), wasm_handle_(wasm_handle) {}
360-
~PluginHandleBase() {
388+
virtual ~PluginHandleBase() {
361389
if (wasm_handle_) {
362390
wasm_handle_->wasm()->startShutdown(plugin_->key());
363391
}
364392
}
365393

366394
std::shared_ptr<PluginBase> &plugin() { return plugin_; }
367395
std::shared_ptr<WasmBase> &wasm() { return wasm_handle_->wasm(); }
396+
// Returns a mutable reference to the WasmHandleBase pointer held by this plugin handle.
std::shared_ptr<WasmHandleBase> &wasmHandle() { return wasm_handle_; }
397+
398+
// Installs the callable that doRecover() invokes, passing it the wasm handle
// to use, in order to obtain the recovered plugin handle. The argument is
// moved into recover_plugin_callback_, replacing any callable stored earlier.
void setRecoverPluginCallback(
399+
std::function<std::shared_ptr<PluginHandleBase>(std::shared_ptr<WasmHandleBase> &)> &&f) {
400+
recover_plugin_callback_ = std::move(f);
401+
}
402+
// Marks this plugin handle as requiring recovery; doRecover() only acts while the flag is set.
void setNeedRecover() { need_recover_ = true; }
403+
// Returns true while a recovery has been requested and doRecover() has not yet completed it.
bool needRecover() { return need_recover_; }
404+
bool doRecover(std::shared_ptr<PluginHandleBase> &new_handle) {
405+
if (!need_recover_ || recover_plugin_callback_ == nullptr) {
406+
return true;
407+
}
408+
std::shared_ptr<WasmHandleBase> new_wasm_handle;
409+
if (!wasm_handle_->doRecover(new_wasm_handle)) {
410+
return false;
411+
}
412+
new_handle = recover_plugin_callback_(new_wasm_handle);
413+
if (!new_handle) {
414+
return false;
415+
}
416+
need_recover_ = false;
417+
return true;
418+
}
419+
420+
// Replaces the wasm handle held by this plugin handle with new_handle.
// No null check is performed: an empty new_handle leaves wasm_handle_ empty.
virtual void updateWasm(std::shared_ptr<WasmHandleBase> &new_handle) {
421+
wasm_handle_ = new_handle;
422+
}
368423

369424
protected:
425+
bool need_recover_ = false;
370426
std::shared_ptr<PluginBase> plugin_;
371427
std::shared_ptr<WasmHandleBase> wasm_handle_;
428+
std::function<std::shared_ptr<PluginHandleBase>(std::shared_ptr<WasmHandleBase> &)>
429+
recover_plugin_callback_;
372430
};
373431

374432
using PluginHandleFactory = std::function<std::shared_ptr<PluginHandleBase>(

include/proxy-wasm/wasm_vm.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ enum class FailState : int {
174174
StartFailed = 5,
175175
ConfigureFailed = 6,
176176
RuntimeError = 7,
177+
RecoverError = 8,
177178
};
178179

179180
// Wasm VM instance. Provides the low level WASM interface.

src/context.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,7 @@ FilterMetadataStatus ContextBase::convertVmCallResultToFilterMetadataStatus(uint
529529

530530
ContextBase::~ContextBase() {
531531
// Do not remove vm context which has the same lifetime as wasm_.
532-
if (id_ != 0U) {
532+
if (id_ != 0U && wasm_ != nullptr) {
533533
wasm_->contexts_.erase(id_);
534534
}
535535
}

src/wasm.cc

Lines changed: 133 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,71 @@ std::shared_ptr<WasmHandleBase> getThreadLocalWasm(std::string_view vm_key) {
554554
return wasm;
555555
}
556556

557+
void setWasmFailCallback(const std::string &vm_key,
558+
const std::shared_ptr<WasmHandleBase> &wasm_handle) {
559+
std::weak_ptr<WasmHandleBase> wasm_handle_for_copy = wasm_handle;
560+
wasm_handle->wasm()->wasm_vm()->addFailCallback(
561+
[vm_key, wasm_handle_for_copy](proxy_wasm::FailState fail_state) {
562+
if (fail_state == proxy_wasm::FailState::RuntimeError) {
563+
// If VM failed, erase the entry so that:
564+
// 1) we can recreate the new thread local VM from the same base_wasm.
565+
// 2) we wouldn't reuse the failed VM for new plugins accidentally.
566+
local_wasms.erase(vm_key);
567+
auto wasm_handle = wasm_handle_for_copy.lock();
568+
if (!wasm_handle) {
569+
return;
570+
}
571+
wasm_handle->setNeedRecover();
572+
}
573+
});
574+
}
575+
576+
void setWasmRecoverCallback(const std::string &vm_key,
577+
const std::shared_ptr<WasmHandleBase> &wasm_handle,
578+
const std::shared_ptr<WasmHandleBase> &base_handle,
579+
const WasmHandleCloneFactory &clone_factory) {
580+
std::weak_ptr<WasmHandleBase> wasm_handle_for_copy = wasm_handle;
581+
wasm_handle->setRecoverVmCallback([vm_key, wasm_handle_for_copy, base_handle,
582+
clone_factory]() -> std::shared_ptr<WasmHandleBase> {
583+
const auto &integration = base_handle->wasm()->wasm_vm()->integration();
584+
integration->trace("Start recover wasm_handle");
585+
auto it = local_wasms.find(vm_key);
586+
if (it != local_wasms.end()) {
587+
auto wasm_handle = it->second.lock();
588+
if (wasm_handle) {
589+
integration->trace("Wasm handle already exists");
590+
return wasm_handle;
591+
}
592+
local_wasms.erase(vm_key);
593+
}
594+
// try to recover wasm vm
595+
auto wasm_handle = wasm_handle_for_copy.lock();
596+
if (!wasm_handle) {
597+
base_handle->wasm()->fail(FailState::RecoverError, "Wasm handle lock failed");
598+
return nullptr;
599+
}
600+
auto new_handle = clone_factory(base_handle);
601+
if (!new_handle) {
602+
base_handle->wasm()->fail(FailState::RecoverError,
603+
"Failed to clone Base Wasm during recover");
604+
return nullptr;
605+
}
606+
607+
if (!new_handle->wasm()->initialize()) {
608+
base_handle->wasm()->fail(FailState::RecoverError,
609+
"Failed to initialize Wasm code during recover");
610+
return nullptr;
611+
}
612+
// avoid the context use the stale wasm ptr
613+
wasm_handle->wasm()->clearWasmInContext();
614+
wasm_handle->swap(new_handle);
615+
local_wasms[vm_key] = wasm_handle;
616+
integration->trace("Wasm handle has been recovered");
617+
setWasmFailCallback(vm_key, wasm_handle);
618+
return wasm_handle;
619+
});
620+
}
621+
557622
static std::shared_ptr<WasmHandleBase>
558623
getOrCreateThreadLocalWasm(const std::shared_ptr<WasmHandleBase> &base_handle,
559624
const WasmHandleCloneFactory &clone_factory) {
@@ -580,17 +645,75 @@ getOrCreateThreadLocalWasm(const std::shared_ptr<WasmHandleBase> &base_handle,
580645
return nullptr;
581646
}
582647
local_wasms[vm_key] = wasm_handle;
583-
wasm_handle->wasm()->wasm_vm()->addFailCallback([vm_key](proxy_wasm::FailState fail_state) {
584-
if (fail_state == proxy_wasm::FailState::RuntimeError) {
585-
// If VM failed, erase the entry so that:
586-
// 1) we can recreate the new thread local VM from the same base_wasm.
587-
// 2) we wouldn't reuse the failed VM for new plugins accidentally.
588-
local_wasms.erase(vm_key);
589-
};
590-
});
648+
setWasmFailCallback(vm_key, wasm_handle);
649+
setWasmRecoverCallback(vm_key, wasm_handle, base_handle, clone_factory);
591650
return wasm_handle;
592651
}
593652

653+
void setPluginFailCallback(const std::string &key,
654+
const std::shared_ptr<WasmHandleBase> &wasm_handle,
655+
const std::shared_ptr<PluginHandleBase> &plugin_handle) {
656+
std::weak_ptr<PluginHandleBase> plugin_handle_for_copy = plugin_handle;
657+
wasm_handle->wasm()->wasm_vm()->addFailCallback(
658+
[key, plugin_handle_for_copy](proxy_wasm::FailState fail_state) {
659+
if (fail_state == proxy_wasm::FailState::RuntimeError) {
660+
// If VM failed, erase the entry so that:
661+
// 1) we can recreate the new thread local plugin from the same base_wasm.
662+
// 2) we wouldn't reuse the failed VM for new plugin configs accidentally.
663+
local_plugins.erase(key);
664+
auto plugin_handle = plugin_handle_for_copy.lock();
665+
if (!plugin_handle) {
666+
return;
667+
}
668+
plugin_handle->setNeedRecover();
669+
}
670+
});
671+
}
672+
673+
void setPluginRecoverCallback(const std::string &key,
674+
const std::shared_ptr<PluginHandleBase> &plugin_handle,
675+
const std::shared_ptr<WasmHandleBase> &base_handle,
676+
const std::shared_ptr<PluginBase> &plugin) {
677+
std::weak_ptr<PluginHandleBase> plugin_handle_for_copy = plugin_handle;
678+
plugin_handle->setRecoverPluginCallback(
679+
[key, plugin_handle_for_copy, base_handle,
680+
plugin](std::shared_ptr<WasmHandleBase> &wasm_handle) -> std::shared_ptr<PluginHandleBase> {
681+
const auto &integration = base_handle->wasm()->wasm_vm()->integration();
682+
integration->trace("Start recover plugin_handle");
683+
auto it = local_plugins.find(key);
684+
if (it != local_plugins.end()) {
685+
auto plugin_handle = it->second.lock();
686+
if (plugin_handle) {
687+
integration->trace("Plugin handle already exists");
688+
return plugin_handle;
689+
}
690+
local_plugins.erase(key);
691+
}
692+
auto plugin_handle = plugin_handle_for_copy.lock();
693+
if (!plugin_handle) {
694+
base_handle->wasm()->fail(FailState::RecoverError, "Plugin handle lock failed");
695+
return nullptr;
696+
}
697+
plugin_handle->updateWasm(wasm_handle);
698+
// Create and initialize new thread-local Plugin.
699+
auto *plugin_context = wasm_handle->wasm()->start(plugin);
700+
if (plugin_context == nullptr) {
701+
base_handle->wasm()->fail(FailState::RecoverError,
702+
"Failed to start thread-local Wasm during recover");
703+
return nullptr;
704+
}
705+
if (!wasm_handle->wasm()->configure(plugin_context, plugin)) {
706+
base_handle->wasm()->fail(FailState::RecoverError,
707+
"Failed to configure thread-local Wasm plugin during recover");
708+
return nullptr;
709+
}
710+
local_plugins[key] = plugin_handle;
711+
integration->trace("Plugin_handle has been recovered");
712+
setPluginFailCallback(key, wasm_handle, plugin_handle);
713+
return plugin_handle;
714+
});
715+
}
716+
594717
std::shared_ptr<PluginHandleBase> getOrCreateThreadLocalPlugin(
595718
const std::shared_ptr<WasmHandleBase> &base_handle, const std::shared_ptr<PluginBase> &plugin,
596719
const WasmHandleCloneFactory &clone_factory, const PluginHandleFactory &plugin_factory) {
@@ -623,14 +746,8 @@ std::shared_ptr<PluginHandleBase> getOrCreateThreadLocalPlugin(
623746
}
624747
auto plugin_handle = plugin_factory(wasm_handle, plugin);
625748
local_plugins[key] = plugin_handle;
626-
wasm_handle->wasm()->wasm_vm()->addFailCallback([key](proxy_wasm::FailState fail_state) {
627-
if (fail_state == proxy_wasm::FailState::RuntimeError) {
628-
// If VM failed, erase the entry so that:
629-
// 1) we can recreate the new thread local plugin from the same base_wasm.
630-
// 2) we wouldn't reuse the failed VM for new plugin configs accidentally.
631-
local_plugins.erase(key);
632-
};
633-
});
749+
setPluginFailCallback(key, wasm_handle, plugin_handle);
750+
setPluginRecoverCallback(key, plugin_handle, base_handle, plugin);
634751
return plugin_handle;
635752
}
636753

0 commit comments

Comments
 (0)