From 19c91398447c3f08b9faa1dcab3441637f49909c Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Wed, 8 Apr 2026 09:17:56 -0700 Subject: [PATCH 01/11] Merge WorkQueue into AppRuntime WorkQueue was a separate class with its own thread, dispatcher, and cancel source. AppRuntime captured 'this' in lambdas passed to WorkQueue, creating split-lifetime issues where the worker thread could access partially-destroyed AppRuntime members during shutdown. This change inlines WorkQueue's functionality directly into AppRuntime: - Thread, dispatcher, cancel source, and env are now AppRuntime members - Destruction order is deterministic (no cross-object lifetime issues) - WorkQueue.h and WorkQueue.cpp are removed - AppRuntime_JSI.cpp updated to use AppRuntime::Dispatch instead of WorkQueue::Append, and fixed task.release() to std::move(task) for shared_ptr construction Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Core/AppRuntime/CMakeLists.txt | 4 +- Core/AppRuntime/Include/Babylon/AppRuntime.h | 31 ++++++++++- Core/AppRuntime/Source/AppRuntime.cpp | 43 +++++++++++++-- Core/AppRuntime/Source/AppRuntime_JSI.cpp | 13 ++--- Core/AppRuntime/Source/WorkQueue.cpp | 58 -------------------- Core/AppRuntime/Source/WorkQueue.h | 58 -------------------- 6 files changed, 72 insertions(+), 135 deletions(-) delete mode 100644 Core/AppRuntime/Source/WorkQueue.cpp delete mode 100644 Core/AppRuntime/Source/WorkQueue.h diff --git a/Core/AppRuntime/CMakeLists.txt b/Core/AppRuntime/CMakeLists.txt index ad8f670d..ba671233 100644 --- a/Core/AppRuntime/CMakeLists.txt +++ b/Core/AppRuntime/CMakeLists.txt @@ -9,9 +9,7 @@ set(SOURCES "Include/Babylon/AppRuntime.h" "Source/AppRuntime.cpp" "Source/AppRuntime_${NAPI_JAVASCRIPT_ENGINE}.cpp" - "Source/AppRuntime_${JSRUNTIMEHOST_PLATFORM}.${IMPL_EXT}" - "Source/WorkQueue.cpp" - "Source/WorkQueue.h") + "Source/AppRuntime_${JSRUNTIMEHOST_PLATFORM}.${IMPL_EXT}") add_library(AppRuntime ${SOURCES}) warnings_as_errors(AppRuntime) diff --git a/Core/AppRuntime/Include/Babylon/AppRuntime.h b/Core/AppRuntime/Include/Babylon/AppRuntime.h index ba0ffa71..6e595c81 100644 --- a/Core/AppRuntime/Include/Babylon/AppRuntime.h +++ b/Core/AppRuntime/Include/Babylon/AppRuntime.h @@ -6,14 +6,18 @@ #include +#include + #include #include #include +#include +#include +#include +#include namespace Babylon { - class WorkQueue; - class AppRuntime final { public: @@ -43,6 +47,23 @@ namespace Babylon static void BABYLON_API DefaultUnhandledExceptionHandler(const Napi::Error& error); private: + template + void Append(CallableT callable) + { + if constexpr (std::is_copy_constructible::value) + { + m_dispatcher.queue([this, callable = std::move(callable)]() { + callable(m_env.value()); + }); + } + else + { + m_dispatcher.queue([this, callablePtr = std::make_shared(std::move(callable))]() { + (*callablePtr)(m_env.value()); + }); + } + } + // These three methods are the mechanism by which platform- and JavaScript-specific // code can be "injected" into the execution of the JavaScript thread. These three // functions are implemented in separate files, thus allowing implementations to be @@ -62,6 +83,10 @@ namespace Babylon void Execute(Dispatchable callback); Options m_options; - std::unique_ptr m_workQueue; + std::optional m_env{}; + std::optional> m_suspensionLock{}; + arcana::cancellation_source m_cancelSource{}; + arcana::manual_dispatcher<128> m_dispatcher{}; + std::thread m_thread; }; } diff --git a/Core/AppRuntime/Source/AppRuntime.cpp b/Core/AppRuntime/Source/AppRuntime.cpp index b3a90b1b..ae1ee4f9 100644 --- a/Core/AppRuntime/Source/AppRuntime.cpp +++ b/Core/AppRuntime/Source/AppRuntime.cpp @@ -1,5 +1,4 @@ #include "AppRuntime.h" -#include "WorkQueue.h" #include namespace Babylon @@ -11,7 +10,7 @@ namespace Babylon AppRuntime::AppRuntime(Options options) : m_options{std::move(options)} - , m_workQueue{std::make_unique([this] { RunPlatformTier(); })} + , m_thread{[this] { RunPlatformTier(); }} { Dispatch([this](Napi::Env env) { JsRuntime::CreateForJavaScript(env, [this](auto func) { Dispatch(std::move(func)); }); @@ -20,26 +19,58 @@ namespace Babylon AppRuntime::~AppRuntime() { + if (m_suspensionLock.has_value()) + { + m_suspensionLock.reset(); + } + + // Cancel immediately so pending work is dropped promptly, then append + // a no-op work item to wake the worker thread from blocking_tick. The + // no-op goes through push() which acquires the queue mutex, avoiding + // the race where a bare notify_all() can be missed by wait(). + // + // NOTE: This preserves the existing shutdown behavior where pending + // callbacks are dropped on cancellation. A more complete solution + // would add cooperative shutdown (e.g. NotifyDisposing/Rundown) so + // consumers can finish cleanup work before the runtime is destroyed. + m_cancelSource.cancel(); + Append([](Napi::Env) {}); + + m_thread.join(); } void AppRuntime::Run(Napi::Env env) { - m_workQueue->Run(env); + m_env = std::make_optional(env); + + m_dispatcher.set_affinity(std::this_thread::get_id()); + + while (!m_cancelSource.cancelled()) + { + m_dispatcher.blocking_tick(m_cancelSource); + } + + // The dispatcher can be non-empty if something is dispatched after cancellation. + m_dispatcher.clear(); } void AppRuntime::Suspend() { - m_workQueue->Suspend(); + auto suspensionMutex = std::make_shared(); + m_suspensionLock.emplace(*suspensionMutex); + Append([suspensionMutex{std::move(suspensionMutex)}](Napi::Env) { + std::scoped_lock lock{*suspensionMutex}; + }); } void AppRuntime::Resume() { - m_workQueue->Resume(); + m_suspensionLock.reset(); } void AppRuntime::Dispatch(Dispatchable func) { - m_workQueue->Append([this, func{std::move(func)}](Napi::Env env) mutable { + Append([this, func{std::move(func)}](Napi::Env env) mutable { Execute([this, env, func{std::move(func)}]() mutable { try { diff --git a/Core/AppRuntime/Source/AppRuntime_JSI.cpp b/Core/AppRuntime/Source/AppRuntime_JSI.cpp index 020bf83f..3ddd19da 100644 --- a/Core/AppRuntime/Source/AppRuntime_JSI.cpp +++ b/Core/AppRuntime/Source/AppRuntime_JSI.cpp @@ -1,5 +1,4 @@ #include "AppRuntime.h" -#include "WorkQueue.h" #include #include @@ -10,15 +9,15 @@ namespace class TaskRunnerAdapter : public v8runtime::JSITaskRunner { public: - TaskRunnerAdapter(Babylon::WorkQueue& workQueue) - : m_workQueue(workQueue) + TaskRunnerAdapter(Babylon::AppRuntime& runtime) + : m_runtime(runtime) { } void postTask(std::unique_ptr task) override { - std::shared_ptr shared_task(task.release()); - m_workQueue.Append([shared_task2 = std::move(shared_task)](Napi::Env) { + std::shared_ptr shared_task(std::move(task)); + m_runtime.Dispatch([shared_task2 = std::move(shared_task)](Napi::Env) { shared_task2->run(); }); } @@ -27,7 +26,7 @@ namespace TaskRunnerAdapter(const TaskRunnerAdapter&) = delete; TaskRunnerAdapter& operator=(const TaskRunnerAdapter&) = delete; - Babylon::WorkQueue& m_workQueue; + Babylon::AppRuntime& m_runtime; }; } @@ -37,7 +36,7 @@ namespace Babylon { v8runtime::V8RuntimeArgs args{}; args.inspectorPort = 5643; - args.foreground_task_runner = std::make_shared(*m_workQueue); + args.foreground_task_runner = std::make_shared(*this); const auto runtime = v8runtime::makeV8Runtime(std::move(args)); const auto env = Napi::Attach(*runtime); diff --git a/Core/AppRuntime/Source/WorkQueue.cpp b/Core/AppRuntime/Source/WorkQueue.cpp deleted file mode 100644 index 8ff5efe1..00000000 --- a/Core/AppRuntime/Source/WorkQueue.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include "WorkQueue.h" - -namespace Babylon -{ - WorkQueue::WorkQueue(std::function threadProcedure) - : m_thread{std::move(threadProcedure)} - { - } - - WorkQueue::~WorkQueue() - { - if (m_suspensionLock.has_value()) - { - Resume(); - } - - // Cancel immediately so pending work is dropped promptly, then append - // a no-op work item to wake the worker thread from blocking_tick. The - // no-op goes through push() which acquires the queue mutex, avoiding - // the race where a bare notify_all() can be missed by wait(). - // - // NOTE: This preserves the existing shutdown behavior where pending - // callbacks are dropped on cancellation. A more complete solution - // would add cooperative shutdown (e.g. NotifyDisposing/Rundown) so - // consumers can finish cleanup work before the runtime is destroyed. - m_cancelSource.cancel(); - Append([](Napi::Env) {}); - - m_thread.join(); - } - - void WorkQueue::Suspend() - { - auto suspensionMutex = std::make_shared(); - m_suspensionLock.emplace(*suspensionMutex); - Append([suspensionMutex{std::move(suspensionMutex)}](Napi::Env) { - std::scoped_lock lock{*suspensionMutex}; - }); - } - - void WorkQueue::Resume() - { - m_suspensionLock.reset(); - } - - void WorkQueue::Run(Napi::Env env) - { - m_env = std::make_optional(env); - m_dispatcher.set_affinity(std::this_thread::get_id()); - - while (!m_cancelSource.cancelled()) - { - m_dispatcher.blocking_tick(m_cancelSource); - } - - m_dispatcher.clear(); - } -} diff --git a/Core/AppRuntime/Source/WorkQueue.h b/Core/AppRuntime/Source/WorkQueue.h deleted file mode 100644 index 95a504d7..00000000 --- a/Core/AppRuntime/Source/WorkQueue.h +++ /dev/null @@ -1,58 +0,0 @@ -#pragma once - -#include -#include -#include - -#include -#include -#include - -namespace Babylon -{ - class WorkQueue - { - public: - WorkQueue(std::function threadProcedure); - ~WorkQueue(); - - template - void Append(CallableT callable) - { - // Manual dispatcher queueing requires a copyable CallableT, we use a shared pointer trick to make a - // copyable callable if necessary. - if constexpr (std::is_copy_constructible::value) - { - m_dispatcher.queue([this, callable = std::move(callable)]() { - Invoke(callable); - }); - } - else - { - m_dispatcher.queue([this, callablePtr = std::make_shared(std::move(callable))]() { - Invoke(*callablePtr); - }); - } - } - - void Suspend(); - void Resume(); - void Run(Napi::Env); - - private: - template - void Invoke(CallableT& callable) - { - callable(m_env.value()); - } - - std::optional m_env{}; - - std::optional> m_suspensionLock{}; - - arcana::cancellation_source m_cancelSource{}; - arcana::manual_dispatcher<128> m_dispatcher{}; - - std::thread m_thread; - }; -} From c88b5e60e3b8f50393c5b11650814efd0dd12b22 Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Wed, 8 Apr 2026 09:44:31 -0700 Subject: [PATCH 02/11] Add explicit cancellation.h include for cancellation_source Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Core/AppRuntime/Include/Babylon/AppRuntime.h | 1 + 1 file changed, 1 insertion(+) diff --git a/Core/AppRuntime/Include/Babylon/AppRuntime.h b/Core/AppRuntime/Include/Babylon/AppRuntime.h index 6e595c81..0de98b8d 100644 --- a/Core/AppRuntime/Include/Babylon/AppRuntime.h +++ b/Core/AppRuntime/Include/Babylon/AppRuntime.h @@ -6,6 +6,7 @@ #include +#include #include #include From 7a8590c8cf36774fa2f082bb46fece6f3abf56b4 Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Wed, 8 Apr 2026 10:39:30 -0700 Subject: [PATCH 03/11] Retrigger CI (macOS_Xcode164 transient failure) Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> From 31621898bdd514a8125af708573a6330f7f01152 Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Wed, 8 Apr 2026 11:51:15 -0700 Subject: [PATCH 04/11] Add test flow diagram as code comment Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Tests/UnitTests/Shared/Shared.cpp | 40 +++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/Tests/UnitTests/Shared/Shared.cpp b/Tests/UnitTests/Shared/Shared.cpp index e09da4fd..925ed29d 100644 --- a/Tests/UnitTests/Shared/Shared.cpp +++ b/Tests/UnitTests/Shared/Shared.cpp @@ -138,9 +138,45 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) // in the hook before triggering destruction. // // Old (broken) code: cancel() + notify_all() fire without the mutex, - // so the notification is lost while the worker sleeps → deadlock. + // so the notification is lost while the worker sleeps -> deadlock. // Fixed code: cancel() + Append(no-op), where push() NEEDS the mutex, - // so it blocks until the worker enters wait() → notification delivered. + // so it blocks until the worker enters wait() -> notification delivered. + // + // Test flow: + // + // Test Thread Worker Thread + // ----------- ------------- + // 1. Install hook (disabled) + // 2. Create AppRuntime Worker starts, enters blocking_tick + // 3. Dispatch(ready), wait Worker runs ready callback + // ready.wait() <------------- ready.set_value() + // Worker returns to blocking_tick + // Hook fires but disabled -> no-op + // Worker enters wait(lock) + // 4. Enable hook + // Dispatch(no-op) Worker wakes, runs no-op, + // returns to blocking_tick + // Hook fires, enabled: + // signal workerInHook + // sleep 200ms (holding mutex!) + // 5. workerInHook.wait() + // Worker is sleeping in hook + // 6. Destroy on separate thread + // ~WorkQueue(): + // cancel() + // Append(no-op): + // push() blocks ------> (worker holds mutex) + // 200ms sleep ends + // wait(lock) releases mutex + // push() acquires mutex + // pushes, notifies ---> wakes up! + // join() waits drains no-op, cancelled -> exit + // join() returns <----- thread exits + // 7. destroy completes -> PASS + // + // With old code, notify_all() fires WITHOUT the mutex during + // step 6, gets lost during the 200ms sleep, worker enters + // wait() with no signal -> join() hangs -> FAIL (timeout) // Shared state for hook synchronization std::atomic hookEnabled{false}; From acfb2fccb4765a9e65bbb7fc7ff96e57dbe6041c Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Thu, 9 Apr 2026 08:57:58 -0700 Subject: [PATCH 05/11] Simplify destructor comment - fix details already in #147 This PR is a pure refactor, the shutdown logic was merged in #147. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Core/AppRuntime/Source/AppRuntime.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/Core/AppRuntime/Source/AppRuntime.cpp b/Core/AppRuntime/Source/AppRuntime.cpp index ae1ee4f9..e68403fb 100644 --- a/Core/AppRuntime/Source/AppRuntime.cpp +++ b/Core/AppRuntime/Source/AppRuntime.cpp @@ -24,15 +24,7 @@ namespace Babylon m_suspensionLock.reset(); } - // Cancel immediately so pending work is dropped promptly, then append - // a no-op work item to wake the worker thread from blocking_tick. The - // no-op goes through push() which acquires the queue mutex, avoiding - // the race where a bare notify_all() can be missed by wait(). - // - // NOTE: This preserves the existing shutdown behavior where pending - // callbacks are dropped on cancellation. A more complete solution - // would add cooperative shutdown (e.g. NotifyDisposing/Rundown) so - // consumers can finish cleanup work before the runtime is destroyed. + // See #147 for details on this shutdown sequence. m_cancelSource.cancel(); Append([](Napi::Env) {}); From ae0080a91b715e7cf6bbd12fe502c51aa25bf402 Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Thu, 9 Apr 2026 09:04:29 -0700 Subject: [PATCH 06/11] Remove stale old-vs-fixed framing from test comment The old behavior no longer exists in this PR. The comment should just describe what the test verifies, not re-explain the bug. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Tests/UnitTests/Shared/Shared.cpp | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/Tests/UnitTests/Shared/Shared.cpp b/Tests/UnitTests/Shared/Shared.cpp index 925ed29d..3ec9505f 100644 --- a/Tests/UnitTests/Shared/Shared.cpp +++ b/Tests/UnitTests/Shared/Shared.cpp @@ -131,16 +131,10 @@ TEST(Console, Log) TEST(AppRuntime, DestroyDoesNotDeadlock) { - // Deterministic test for the race condition in the AppRuntime destructor. - // - // A global hook sleeps WHILE HOLDING the queue mutex, right before - // condition_variable::wait(). We synchronize so the worker is definitely - // in the hook before triggering destruction. - // - // Old (broken) code: cancel() + notify_all() fire without the mutex, - // so the notification is lost while the worker sleeps -> deadlock. - // Fixed code: cancel() + Append(no-op), where push() NEEDS the mutex, - // so it blocks until the worker enters wait() -> notification delivered. + // Regression test verifying AppRuntime destruction doesn't deadlock. + // Uses a global arcana hook to sleep while holding the queue mutex + // before wait(), ensuring the worker is in the vulnerable window + // when the destructor fires. See #147 for details on the bug and fix. // // Test flow: // @@ -162,7 +156,7 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) // 5. workerInHook.wait() // Worker is sleeping in hook // 6. Destroy on separate thread - // ~WorkQueue(): + // ~AppRuntime(): // cancel() // Append(no-op): // push() blocks ------> (worker holds mutex) @@ -173,10 +167,6 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) // join() waits drains no-op, cancelled -> exit // join() returns <----- thread exits // 7. destroy completes -> PASS - // - // With old code, notify_all() fires WITHOUT the mutex during - // step 6, gets lost during the 200ms sleep, worker enters - // wait() with no signal -> join() hangs -> FAIL (timeout) // Shared state for hook synchronization std::atomic hookEnabled{false}; From 756fa8fafb21dbe7d579defbd2300168cf47d324 Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Thu, 9 Apr 2026 09:05:00 -0700 Subject: [PATCH 07/11] Restore destructor comment explaining shutdown sequence Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Core/AppRuntime/Source/AppRuntime.cpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Core/AppRuntime/Source/AppRuntime.cpp b/Core/AppRuntime/Source/AppRuntime.cpp index e68403fb..ae1ee4f9 100644 --- a/Core/AppRuntime/Source/AppRuntime.cpp +++ b/Core/AppRuntime/Source/AppRuntime.cpp @@ -24,7 +24,15 @@ namespace Babylon m_suspensionLock.reset(); } - // See #147 for details on this shutdown sequence. + // Cancel immediately so pending work is dropped promptly, then append + // a no-op work item to wake the worker thread from blocking_tick. The + // no-op goes through push() which acquires the queue mutex, avoiding + // the race where a bare notify_all() can be missed by wait(). + // + // NOTE: This preserves the existing shutdown behavior where pending + // callbacks are dropped on cancellation. A more complete solution + // would add cooperative shutdown (e.g. NotifyDisposing/Rundown) so + // consumers can finish cleanup work before the runtime is destroyed. m_cancelSource.cancel(); Append([](Napi::Env) {}); From 7b5323e9d427e259c21c19b1e7691c1fa2170d55 Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Thu, 9 Apr 2026 09:26:53 -0700 Subject: [PATCH 08/11] Clean up test: single hookEnabled check, plain bool, unique_ptr move, reset hook early - Merge duplicate hookEnabled checks into one block - hookSignaled only accessed by worker thread, plain bool suffices - Move unique_ptr into destroy thread lambda instead of shared_ptr wrapper - Reset hook before destruction to prevent stale reference access Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Tests/UnitTests/Shared/Shared.cpp | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/Tests/UnitTests/Shared/Shared.cpp b/Tests/UnitTests/Shared/Shared.cpp index 3ec9505f..195e7df5 100644 --- a/Tests/UnitTests/Shared/Shared.cpp +++ b/Tests/UnitTests/Shared/Shared.cpp @@ -170,18 +170,20 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) // Shared state for hook synchronization std::atomic hookEnabled{false}; - std::atomic hookSignaled{false}; + bool hookSignaled{false}; std::promise workerInHook; // Set the callback. It checks hookEnabled so we control - // when it actually sleeps. + // when it actually sleeps. hookSignaled is only accessed by + // the worker thread so it doesn't need to be atomic. arcana::set_before_wait_callback([&]() { - if (hookEnabled.load() && !hookSignaled.exchange(true)) - { - workerInHook.set_value(); - } if (hookEnabled.load()) { + if (!hookSignaled) + { + hookSignaled = true; + workerInHook.set_value(); + } std::this_thread::sleep_for(std::chrono::milliseconds(200)); } }); @@ -202,20 +204,22 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) // Wait for the worker to be in the hook (holding mutex, sleeping) auto hookStatus = workerInHook.get_future().wait_for(std::chrono::seconds(5)); + + // Reset the hook before any destruction so it won't fire with + // stale references if the detached thread outlives this scope. + arcana::set_before_wait_callback([]() {}); + if (hookStatus == std::future_status::timeout) { - // Hook didn't fire — no deadlock risk, clean up normally - arcana::set_before_wait_callback([]() {}); FAIL() << "Worker thread did not enter before-wait hook"; } // Worker is in the hook (holding mutex, sleeping). Destroy on a // detachable thread so the test doesn't hang if the destructor deadlocks. - auto runtimePtr = std::make_shared>(std::move(runtime)); std::promise destroyDone; auto destroyFuture = destroyDone.get_future(); - std::thread destroyThread([runtimePtr, &destroyDone]() { - runtimePtr->reset(); + std::thread destroyThread([runtime = std::move(runtime), &destroyDone]() mutable { + runtime.reset(); destroyDone.set_value(); }); @@ -229,8 +233,6 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) destroyThread.join(); } - arcana::set_before_wait_callback([]() {}); - ASSERT_NE(status, std::future_status::timeout) << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds"; } From 1a0990f82902f82b5369d8defd63170b567261c5 Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Thu, 9 Apr 2026 09:59:58 -0700 Subject: [PATCH 09/11] Clarify comment on initialization wait in test Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Tests/UnitTests/Shared/Shared.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Tests/UnitTests/Shared/Shared.cpp b/Tests/UnitTests/Shared/Shared.cpp index 195e7df5..f70c9498 100644 --- a/Tests/UnitTests/Shared/Shared.cpp +++ b/Tests/UnitTests/Shared/Shared.cpp @@ -190,7 +190,9 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) auto runtime = std::make_unique(); - // Dispatch work and wait for completion + // Wait for the runtime to fully initialize. The constructor dispatches + // CreateForJavaScript which must complete before we enable the hook, + // otherwise the hook would sleep during initialization. std::promise ready; runtime->Dispatch([&ready](Napi::Env) { ready.set_value(); From 468654abc90f9f7dbffe9df03c21b5def84785ab Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Thu, 9 Apr 2026 10:05:05 -0700 Subject: [PATCH 10/11] Run entire test on separate thread, gtest thread is watchdog AppRuntime is created and destroyed on the same thread naturally. The gtest thread just waits with a timeout and fails if it detects a deadlock, without hanging the process. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Tests/UnitTests/Shared/Shared.cpp | 74 +++++++++++++++---------------- 1 file changed, 35 insertions(+), 39 deletions(-) diff --git a/Tests/UnitTests/Shared/Shared.cpp b/Tests/UnitTests/Shared/Shared.cpp index f70c9498..26269b0a 100644 --- a/Tests/UnitTests/Shared/Shared.cpp +++ b/Tests/UnitTests/Shared/Shared.cpp @@ -136,6 +136,10 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) // before wait(), ensuring the worker is in the vulnerable window // when the destructor fires. See #147 for details on the bug and fix. // + // The entire test runs on a separate thread so the gtest thread can + // act as a watchdog. If destruction deadlocks, the watchdog detects + // the timeout and fails the test without hanging the process. + // // Test flow: // // Test Thread Worker Thread @@ -155,8 +159,7 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) // sleep 200ms (holding mutex!) // 5. workerInHook.wait() // Worker is sleeping in hook - // 6. Destroy on separate thread - // ~AppRuntime(): + // 6. ~AppRuntime() at scope exit // cancel() // Append(no-op): // push() blocks ------> (worker holds mutex) @@ -188,55 +191,48 @@ TEST(AppRuntime, DestroyDoesNotDeadlock) } }); - auto runtime = std::make_unique(); - - // Wait for the runtime to fully initialize. The constructor dispatches - // CreateForJavaScript which must complete before we enable the hook, - // otherwise the hook would sleep during initialization. - std::promise ready; - runtime->Dispatch([&ready](Napi::Env) { - ready.set_value(); - }); - ready.get_future().wait(); - - // Enable the hook and dispatch a no-op to wake the worker, - // ensuring it cycles through the hook on its way back to idle - hookEnabled.store(true); - runtime->Dispatch([](Napi::Env) {}); + // Run the full lifecycle on a separate thread so the gtest thread + // can act as a watchdog with a timeout. + std::promise testDone; + std::thread testThread([&]() { + auto runtime = std::make_unique(); + + // Wait for the runtime to fully initialize. The constructor dispatches + // CreateForJavaScript which must complete before we enable the hook, + // otherwise the hook would sleep during initialization. + std::promise ready; + runtime->Dispatch([&ready](Napi::Env) { + ready.set_value(); + }); + ready.get_future().wait(); - // Wait for the worker to be in the hook (holding mutex, sleeping) - auto hookStatus = workerInHook.get_future().wait_for(std::chrono::seconds(5)); + // Enable the hook and dispatch a no-op to wake the worker, + // ensuring it cycles through the hook on its way back to idle + hookEnabled.store(true); + runtime->Dispatch([](Napi::Env) {}); - // Reset the hook before any destruction so it won't fire with - // stale references if the detached thread outlives this scope. - arcana::set_before_wait_callback([]() {}); + // Wait for the worker to be in the hook (holding mutex, sleeping) + workerInHook.get_future().wait(); - if (hookStatus == std::future_status::timeout) - { - FAIL() << "Worker thread did not enter before-wait hook"; - } - - // Worker is in the hook (holding mutex, sleeping). Destroy on a - // detachable thread so the test doesn't hang if the destructor deadlocks. - std::promise destroyDone; - auto destroyFuture = destroyDone.get_future(); - std::thread destroyThread([runtime = std::move(runtime), &destroyDone]() mutable { + // Destroy naturally at scope exit — if the fix works, the destructor + // completes. If broken, it deadlocks and the watchdog catches it. runtime.reset(); - destroyDone.set_value(); + testDone.set_value(); }); - auto status = destroyFuture.wait_for(std::chrono::seconds(5)); + auto status = testDone.get_future().wait_for(std::chrono::seconds(5)); + + arcana::set_before_wait_callback([]() {}); + if (status == std::future_status::timeout) { - destroyThread.detach(); + testThread.detach(); + FAIL() << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds"; } else { - destroyThread.join(); + testThread.join(); } - - ASSERT_NE(status, std::future_status::timeout) - << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds"; } int RunTests() From b2ea67a48fd4bf1334be479da2d1d6f8c4a13363 Mon Sep 17 00:00:00 2001 From: Gary Hsu Date: Thu, 9 Apr 2026 10:08:23 -0700 Subject: [PATCH 11/11] Remove test changes - this PR is a pure refactor Test improvements should be in a separate PR. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- Tests/UnitTests/Shared/Shared.cpp | 126 ++++++++++++------------------ 1 file changed, 50 insertions(+), 76 deletions(-) diff --git a/Tests/UnitTests/Shared/Shared.cpp b/Tests/UnitTests/Shared/Shared.cpp index 26269b0a..e09da4fd 100644 --- a/Tests/UnitTests/Shared/Shared.cpp +++ b/Tests/UnitTests/Shared/Shared.cpp @@ -131,108 +131,82 @@ TEST(Console, Log) TEST(AppRuntime, DestroyDoesNotDeadlock) { - // Regression test verifying AppRuntime destruction doesn't deadlock. - // Uses a global arcana hook to sleep while holding the queue mutex - // before wait(), ensuring the worker is in the vulnerable window - // when the destructor fires. See #147 for details on the bug and fix. + // Deterministic test for the race condition in the AppRuntime destructor. // - // The entire test runs on a separate thread so the gtest thread can - // act as a watchdog. If destruction deadlocks, the watchdog detects - // the timeout and fails the test without hanging the process. + // A global hook sleeps WHILE HOLDING the queue mutex, right before + // condition_variable::wait(). We synchronize so the worker is definitely + // in the hook before triggering destruction. // - // Test flow: - // - // Test Thread Worker Thread - // ----------- ------------- - // 1. Install hook (disabled) - // 2. Create AppRuntime Worker starts, enters blocking_tick - // 3. Dispatch(ready), wait Worker runs ready callback - // ready.wait() <------------- ready.set_value() - // Worker returns to blocking_tick - // Hook fires but disabled -> no-op - // Worker enters wait(lock) - // 4. Enable hook - // Dispatch(no-op) Worker wakes, runs no-op, - // returns to blocking_tick - // Hook fires, enabled: - // signal workerInHook - // sleep 200ms (holding mutex!) - // 5. workerInHook.wait() - // Worker is sleeping in hook - // 6. ~AppRuntime() at scope exit - // cancel() - // Append(no-op): - // push() blocks ------> (worker holds mutex) - // 200ms sleep ends - // wait(lock) releases mutex - // push() acquires mutex - // pushes, notifies ---> wakes up! - // join() waits drains no-op, cancelled -> exit - // join() returns <----- thread exits - // 7. destroy completes -> PASS + // Old (broken) code: cancel() + notify_all() fire without the mutex, + // so the notification is lost while the worker sleeps → deadlock. + // Fixed code: cancel() + Append(no-op), where push() NEEDS the mutex, + // so it blocks until the worker enters wait() → notification delivered. // Shared state for hook synchronization std::atomic hookEnabled{false}; - bool hookSignaled{false}; + std::atomic hookSignaled{false}; std::promise workerInHook; // Set the callback. It checks hookEnabled so we control - // when it actually sleeps. hookSignaled is only accessed by - // the worker thread so it doesn't need to be atomic. + // when it actually sleeps. arcana::set_before_wait_callback([&]() { + if (hookEnabled.load() && !hookSignaled.exchange(true)) + { + workerInHook.set_value(); + } if (hookEnabled.load()) { - if (!hookSignaled) - { - hookSignaled = true; - workerInHook.set_value(); - } std::this_thread::sleep_for(std::chrono::milliseconds(200)); } }); - // Run the full lifecycle on a separate thread so the gtest thread - // can act as a watchdog with a timeout. - std::promise testDone; - std::thread testThread([&]() { - auto runtime = std::make_unique(); - - // Wait for the runtime to fully initialize. The constructor dispatches - // CreateForJavaScript which must complete before we enable the hook, - // otherwise the hook would sleep during initialization. - std::promise ready; - runtime->Dispatch([&ready](Napi::Env) { - ready.set_value(); - }); - ready.get_future().wait(); - - // Enable the hook and dispatch a no-op to wake the worker, - // ensuring it cycles through the hook on its way back to idle - hookEnabled.store(true); - runtime->Dispatch([](Napi::Env) {}); - - // Wait for the worker to be in the hook (holding mutex, sleeping) - workerInHook.get_future().wait(); + auto runtime = std::make_unique(); - // Destroy naturally at scope exit — if the fix works, the destructor - // completes. If broken, it deadlocks and the watchdog catches it. - runtime.reset(); - testDone.set_value(); + // Dispatch work and wait for completion + std::promise ready; + runtime->Dispatch([&ready](Napi::Env) { + ready.set_value(); }); + ready.get_future().wait(); - auto status = testDone.get_future().wait_for(std::chrono::seconds(5)); + // Enable the hook and dispatch a no-op to wake the worker, + // ensuring it cycles through the hook on its way back to idle + hookEnabled.store(true); + runtime->Dispatch([](Napi::Env) {}); - arcana::set_before_wait_callback([]() {}); + // Wait for the worker to be in the hook (holding mutex, sleeping) + auto hookStatus = workerInHook.get_future().wait_for(std::chrono::seconds(5)); + if (hookStatus == std::future_status::timeout) + { + // Hook didn't fire — no deadlock risk, clean up normally + arcana::set_before_wait_callback([]() {}); + FAIL() << "Worker thread did not enter before-wait hook"; + } + + // Worker is in the hook (holding mutex, sleeping). Destroy on a + // detachable thread so the test doesn't hang if the destructor deadlocks. + auto runtimePtr = std::make_shared>(std::move(runtime)); + std::promise destroyDone; + auto destroyFuture = destroyDone.get_future(); + std::thread destroyThread([runtimePtr, &destroyDone]() { + runtimePtr->reset(); + destroyDone.set_value(); + }); + auto status = destroyFuture.wait_for(std::chrono::seconds(5)); if (status == std::future_status::timeout) { - testThread.detach(); - FAIL() << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds"; + destroyThread.detach(); } else { - testThread.join(); + destroyThread.join(); } + + arcana::set_before_wait_callback([]() {}); + + ASSERT_NE(status, std::future_status::timeout) + << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds"; } int RunTests()