Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 1 addition & 10 deletions Core/AppRuntime/Source/WorkQueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,8 @@ namespace Babylon
Resume();
}

// Cancel immediately so pending work is dropped promptly, then append
// a no-op work item to wake the worker thread from blocking_tick. The
// no-op goes through push() which acquires the queue mutex, avoiding
// the race where a bare notify_all() can be missed by wait().
//
// NOTE: This preserves the existing shutdown behavior where pending
// callbacks are dropped on cancellation. A more complete solution
// would add cooperative shutdown (e.g. NotifyDisposing/Rundown) so
// consumers can finish cleanup work before the runtime is destroyed.
m_cancelSource.cancel();
Append([](Napi::Env) {});
m_dispatcher.cancelled();

m_thread.join();
}
Expand Down
130 changes: 69 additions & 61 deletions Tests/UnitTests/Shared/Shared.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -131,82 +131,90 @@ TEST(Console, Log)

TEST(AppRuntime, DestroyDoesNotDeadlock)
{
// Deterministic test for the race condition in the AppRuntime destructor.
// Regression test verifying AppRuntime destruction doesn't deadlock.
// Uses a global arcana hook to sleep while holding the queue mutex
// before wait(), ensuring the worker is in the vulnerable window
// when the destructor fires. See #147 for details on the bug and fix.
//
// A global hook sleeps WHILE HOLDING the queue mutex, right before
// condition_variable::wait(). We synchronize so the worker is definitely
// in the hook before triggering destruction.
// The entire test runs on a separate thread so the gtest thread can
// detect a deadlock via timeout without hanging the process.
//
// Old (broken) code: cancel() + notify_all() fire without the mutex,
// so the notification is lost while the worker sleeps → deadlock.
// Fixed code: cancel() + Append(no-op), where push() NEEDS the mutex,
// so it blocks until the worker enters wait() → notification delivered.

// Shared state for hook synchronization
std::atomic<bool> hookEnabled{false};
std::atomic<bool> hookSignaled{false};
// Test flow:
//
// Test Thread Worker Thread
// ----------- -------------
// 1. Create AppRuntime Worker starts, enters blocking_tick
// Wait for init to complete
// 2. Install hook
// Dispatch(no-op) Worker wakes, runs no-op,
// returns to blocking_tick
// Hook fires:
// signal workerInHook
// sleep 200ms (holding mutex!)
// 3. workerInHook.wait()
// Worker is sleeping in hook
// 4. ~AppRuntime():
// cancel()
// Append(no-op):
// push() blocks ------> (worker holds mutex)
// 200ms sleep ends
// wait(lock) releases mutex
// push() acquires mutex
// pushes, notifies ---> wakes up!
// join() waits drains no-op, cancelled -> exit
// join() returns <----- thread exits
// 5. destroy completes -> PASS

bool hookSignaled{false};
std::promise<void> workerInHook;

// Set the callback. It checks hookEnabled so we control
// when it actually sleeps.
arcana::set_before_wait_callback([&]() {
if (hookEnabled.load() && !hookSignaled.exchange(true))
{
workerInHook.set_value();
}
if (hookEnabled.load())
{
// Run the full lifecycle on a separate thread so the gtest thread
// can detect a deadlock via timeout.
std::promise<void> testDone;
std::thread testThread([&]() {
auto runtime = std::make_unique<Babylon::AppRuntime>();

// Wait for the runtime to fully initialize. The constructor dispatches
// CreateForJavaScript which must complete before we install the hook
// so the worker is idle and ready to enter the hook on the next wait.
std::promise<void> ready;
runtime->Dispatch([&ready](Napi::Env) {
ready.set_value();
});
ready.get_future().wait();

// Install the hook and dispatch a no-op to wake the worker,
// ensuring it cycles through the hook on its way back to idle.
arcana::set_before_wait_callback([&]() {
if (!hookSignaled)
{
hookSignaled = true;
workerInHook.set_value();
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
}
});
});
runtime->Dispatch([](Napi::Env) {});

auto runtime = std::make_unique<Babylon::AppRuntime>();
// Wait for the worker to be in the hook (holding mutex, sleeping)
workerInHook.get_future().wait();

// Dispatch work and wait for completion
std::promise<void> ready;
runtime->Dispatch([&ready](Napi::Env) {
ready.set_value();
// Destroy — if the fix works, the destructor completes.
// If broken, it deadlocks and the timeout detects it.
runtime.reset();
testDone.set_value();
});
ready.get_future().wait();

// Enable the hook and dispatch a no-op to wake the worker,
// ensuring it cycles through the hook on its way back to idle
hookEnabled.store(true);
runtime->Dispatch([](Napi::Env) {});
auto status = testDone.get_future().wait_for(std::chrono::seconds(5));

// Wait for the worker to be in the hook (holding mutex, sleeping)
auto hookStatus = workerInHook.get_future().wait_for(std::chrono::seconds(5));
if (hookStatus == std::future_status::timeout)
{
// Hook didn't fire — no deadlock risk, clean up normally
arcana::set_before_wait_callback([]() {});
FAIL() << "Worker thread did not enter before-wait hook";
}

// Worker is in the hook (holding mutex, sleeping). Destroy on a
// detachable thread so the test doesn't hang if the destructor deadlocks.
auto runtimePtr = std::make_shared<std::unique_ptr<Babylon::AppRuntime>>(std::move(runtime));
std::promise<void> destroyDone;
auto destroyFuture = destroyDone.get_future();
std::thread destroyThread([runtimePtr, &destroyDone]() {
runtimePtr->reset();
destroyDone.set_value();
});
arcana::set_before_wait_callback([]() {});

auto status = destroyFuture.wait_for(std::chrono::seconds(5));
if (status == std::future_status::timeout)
{
destroyThread.detach();
}
else
{
destroyThread.join();
testThread.detach();
FAIL() << "Deadlock detected: AppRuntime destructor did not complete within 5 seconds";
}

arcana::set_before_wait_callback([]() {});

ASSERT_NE(status, std::future_status::timeout)
<< "Deadlock detected: AppRuntime destructor did not complete within 5 seconds";
testThread.join();
}

int RunTests()
Expand Down
Loading