From 15bb2f18e53eabb1432e09f1380f37a087f106c3 Mon Sep 17 00:00:00 2001
From: Gary Hsu <bghgary@users.noreply.github.com>
Date: Thu, 9 Apr 2026 10:46:57 -0700
Subject: [PATCH] Revert WorkQueue destructor to old broken code + updated test

Reverts the deadlock fix to demonstrate the test detects it. Test updated
with cleanup from #151.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 Core/AppRuntime/Source/WorkQueue.cpp |  11 +--
 Tests/UnitTests/Shared/Shared.cpp    | 130 ++++++++++++++--------
 2 files changed, 70 insertions(+), 71 deletions(-)

diff --git a/Core/AppRuntime/Source/WorkQueue.cpp b/Core/AppRuntime/Source/WorkQueue.cpp
index 8ff5efe1..4c1c5f21 100644
--- a/Core/AppRuntime/Source/WorkQueue.cpp
+++ b/Core/AppRuntime/Source/WorkQueue.cpp
@@ -14,17 +14,8 @@ namespace Babylon
         {
             Resume();
         }
 
-        // Cancel immediately so pending work is dropped promptly, then append
-        // a no-op work item to wake the worker thread from blocking_tick. The
-        // no-op goes through push() which acquires the queue mutex, avoiding
-        // the race where a bare notify_all() can be missed by wait().
-        //
-        // NOTE: This preserves the existing shutdown behavior where pending
-        // callbacks are dropped on cancellation. A more complete solution
-        // would add cooperative shutdown (e.g. NotifyDisposing/Rundown) so
-        // consumers can finish cleanup work before the runtime is destroyed.
         m_cancelSource.cancel();
-        Append([](Napi::Env) {});
+        m_dispatcher.cancelled();
         m_thread.join();
     }
diff --git a/Tests/UnitTests/Shared/Shared.cpp b/Tests/UnitTests/Shared/Shared.cpp
index e09da4fd..f3b9ddda 100644
--- a/Tests/UnitTests/Shared/Shared.cpp
+++ b/Tests/UnitTests/Shared/Shared.cpp
@@ -131,82 +131,90 @@ TEST(Console, Log)
 
 TEST(AppRuntime, DestroyDoesNotDeadlock)
 {
-    // Deterministic test for the race condition in the AppRuntime destructor.
+    // Regression test verifying AppRuntime destruction doesn't deadlock.
+    // Uses a global arcana hook to sleep while holding the queue mutex
+    // before wait(), ensuring the worker is in the vulnerable window
+    // when the destructor fires. See #147 for details on the bug and fix.
     //
-    // A global hook sleeps WHILE HOLDING the queue mutex, right before
-    // condition_variable::wait(). We synchronize so the worker is definitely
-    // in the hook before triggering destruction.
+    // The entire test runs on a separate thread so the gtest thread can
+    // detect a deadlock via timeout without hanging the process.
     //
-    // Old (broken) code: cancel() + notify_all() fire without the mutex,
-    // so the notification is lost while the worker sleeps → deadlock.
-    // Fixed code: cancel() + Append(no-op), where push() NEEDS the mutex,
-    // so it blocks until the worker enters wait() → notification delivered.
-
-    // Shared state for hook synchronization
-    std::atomic<bool> hookEnabled{false};
-    std::atomic<bool> hookSignaled{false};
+    // Test flow:
+    //
+    //   Test Thread                      Worker Thread
+    //   -----------                      -------------
+    //   1. Create AppRuntime             Worker starts, enters blocking_tick
+    //      Wait for init to complete
+    //   2. Install hook
+    //      Dispatch(no-op)               Worker wakes, runs no-op,
+    //                                    returns to blocking_tick
+    //                                    Hook fires:
+    //                                      signal workerInHook
+    //                                      sleep 200ms (holding mutex!)
+    //   3. workerInHook.wait()
+    //                                    Worker is sleeping in hook
+    //   4. ~AppRuntime():
+    //        cancel()
+    //        Append(no-op):
+    //          push() blocks ------>     (worker holds mutex)
+    //                                    200ms sleep ends
+    //                                    wait(lock) releases mutex
+    //          push() acquires mutex
+    //          pushes, notifies --->     wakes up!
+    //        join() waits                drains no-op, cancelled -> exit
+    //        join() returns <-----       thread exits
+    //   5. destroy completes -> PASS
+
+    bool hookSignaled{false};
     std::promise<void> workerInHook;
 
-    // Set the callback. It checks hookEnabled so we control
-    // when it actually sleeps.
-    arcana::set_before_wait_callback([&]() {
-        if (hookEnabled.load() && !hookSignaled.exchange(true))
-        {
-            workerInHook.set_value();
-        }
-        if (hookEnabled.load())
-        {
+    // Run the full lifecycle on a separate thread so the gtest thread
+    // can detect a deadlock via timeout.
+    std::promise<void> testDone;
+    std::thread testThread([&]() {
+        auto runtime = std::make_unique<Babylon::AppRuntime>();
+
+        // Wait for the runtime to fully initialize. The constructor dispatches
+        // CreateForJavaScript which must complete before we install the hook
+        // so the worker is idle and ready to enter the hook on the next wait.
+        std::promise<void> ready;
+        runtime->Dispatch([&ready](Napi::Env) {
+            ready.set_value();
+        });
+        ready.get_future().wait();
+
+        // Install the hook and dispatch a no-op to wake the worker,
+        // ensuring it cycles through the hook on its way back to idle.
+        arcana::set_before_wait_callback([&]() {
+            if (!hookSignaled)
+            {
+                hookSignaled = true;
+                workerInHook.set_value();
+            }
             std::this_thread::sleep_for(std::chrono::milliseconds(200));
-        }
-    });
+        });
+        runtime->Dispatch([](Napi::Env) {});
 
-    auto runtime = std::make_unique<Babylon::AppRuntime>();
+        // Wait for the worker to be in the hook (holding mutex, sleeping)
+        workerInHook.get_future().wait();
 
-    // Dispatch work and wait for completion
-    std::promise<void> ready;
-    runtime->Dispatch([&ready](Napi::Env) {
-        ready.set_value();
+        // Destroy — if the fix works, the destructor completes.
+        // If broken, it deadlocks and the timeout detects it.
+        runtime.reset();
+        testDone.set_value();
     });
-    ready.get_future().wait();
 
-    // Enable the hook and dispatch a no-op to wake the worker,
-    // ensuring it cycles through the hook on its way back to idle
-    hookEnabled.store(true);
-    runtime->Dispatch([](Napi::Env) {});
+    auto status = testDone.get_future().wait_for(std::chrono::seconds(5));
 
-    // Wait for the worker to be in the hook (holding mutex, sleeping)
-    auto hookStatus = workerInHook.get_future().wait_for(std::chrono::seconds(5));
-    if (hookStatus == std::future_status::timeout)
-    {
-        // Hook didn't fire — no deadlock risk, clean up normally
-        arcana::set_before_wait_callback([]() {});
-        FAIL() << "Worker thread did not enter before-wait hook";
-    }
-
-    // Worker is in the hook (holding mutex, sleeping). Destroy on a
-    // detachable thread so the test doesn't hang if the destructor deadlocks.
-    auto runtimePtr = std::make_shared<std::unique_ptr<Babylon::AppRuntime>>(std::move(runtime));
-    std::promise<void> destroyDone;
-    auto destroyFuture = destroyDone.get_future();
-    std::thread destroyThread([runtimePtr, &destroyDone]() {
-        runtimePtr->reset();
-        destroyDone.set_value();
-    });
+    arcana::set_before_wait_callback([]() {});
 
-    auto status = destroyFuture.wait_for(std::chrono::seconds(5));
     if (status == std::future_status::timeout)
     {
-        destroyThread.detach();
-    }
-    else
-    {
-        destroyThread.join();
+        testThread.detach();
+        FAIL() << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds";
     }
 
-    arcana::set_before_wait_callback([]() {});
-
-    ASSERT_NE(status, std::future_status::timeout)
-        << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds";
+    testThread.join();
 }
 
 int RunTests()