From 4d52029f67480af42cc40262404d810cfcfcef35 Mon Sep 17 00:00:00 2001 From: Pasin Suriyentrakorn Date: Fri, 17 Apr 2026 21:17:57 -0700 Subject: [PATCH] CBL-8161 : Timeout when closing the database with an active MultipeerReplicator running (#470) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Backported from master branch (3fb6a7b2346f6b03cc6d14a0903d9207e2e6ff7c) — Problems: Closing the database with an active MultipeerReplicator running failed with a timeout error. Three related issues contributed to this bug: 1. Incorrect CountDownLatch reset in AbstractDatabase.shutdown(): When close() returned a BUSY error, the latch was reset to 2 under the assumption that a new process would be found in verifyActiveProcesses(). This assumption is not always valid (e.g. a background thread is still accessing the database during close). As a result, the latch never reached zero and close() was never retried correctly. 2. MultipeerReplicator.onSyncStatusChanged reports inactive early (fixed in EE repo): MultipeerReplicator.onSyncStatusChanged marked the process offline immediately but deferred unregisterProcess to an async callback. As a result, AbstractDatabase could treat the process as stopped, attempt to close the database too early, and receive BUSY from LiteCore. (fixed in EE repo) 3. Deadlock when close() is called from the Android main thread (fixed in EE repo): Waiting for all active processes to unregister blocks the main thread, while shutdownConflictResolverService is also scheduled on the main thread (default executor), causing a deadlock. Fixes: Simplified the shutdown logic by splitting it into two independent phases, which also eliminates the problematic CountDownLatch reset: Phase 1 — Drain active processes: Shut down all active processes, then wait for them to finish or time out (10 secs). Phase 2 — Close the database: Attempt close(); if BUSY is returned, wait briefly (2 secs) and retry (max 5 retries). 
Previously, the two phases were interleaved in a single loop: shut down processes → wait → close → on BUSY, reset latch to 2 and repeat from the top (max 5 retries). This coupling was the root cause of the latch never reaching zero. --- .../com/couchbase/lite/AbstractDatabase.java | 53 ++++++++++--------- 1 file changed, 27 insertions(+), 26 deletions(-) diff --git a/common/main/java/com/couchbase/lite/AbstractDatabase.java b/common/main/java/com/couchbase/lite/AbstractDatabase.java index b85cb3340..eba360a90 100644 --- a/common/main/java/com/couchbase/lite/AbstractDatabase.java +++ b/common/main/java/com/couchbase/lite/AbstractDatabase.java @@ -91,8 +91,13 @@ abstract class AbstractDatabase extends BaseDatabase private static final LogDomain DOMAIN = LogDomain.DATABASE; - private static final int DB_CLOSE_WAIT_SECS = 6; // > Core replicator timeout - private static final int DB_CLOSE_MAX_RETRIES = 5; // random choice: wait for 5 replicators + // Max time to wait for active processes to finish. + private static final int DB_CLOSE_PROCESS_TIMEOUT_SECS = 10; + // Backoff between BUSY retries on close. + private static final int DB_CLOSE_RETRY_DELAY_SECS = 2; + // Max number of BUSY retries before giving up. + private static final int DB_CLOSE_MAX_RETRIES = 5; + // Max time to wait for executors to drain. 
private static final int EXECUTOR_CLOSE_MAX_WAIT_SECS = 5; static class ActiveProcess { @@ -1224,7 +1229,6 @@ private void verifyActiveProcesses() { for (ActiveProcess process: processes) { Log.d(DOMAIN, " processes: %s", process); } } - @SuppressWarnings("PMD.NPathComplexity") private void shutdown(boolean failIfClosed, Fn.ConsumerThrows onShut) throws CouchbaseLiteException { final C4Database c4Db; @@ -1251,33 +1255,30 @@ private void shutdown(boolean failIfClosed, Fn.ConsumerThrows= DB_CLOSE_MAX_RETRIES) && (closeLatch.getCount() > 0)) { - throw new CouchbaseLiteException("Shutdown failed", CBLError.Domain.CBLITE, CBLError.Code.BUSY); + for (int i = 0; ; i++) { + try { + synchronized (getDbLock()) { onShut.accept(c4Db); } + break; } - - if (closeLatch.await(DB_CLOSE_WAIT_SECS, TimeUnit.SECONDS)) { - try { - synchronized (getDbLock()) { onShut.accept(c4Db); } - break; - } - catch (LiteCoreException e) { - if ((e.getDomain() != C4Constants.ErrorDomain.LITE_CORE) - || (e.getCode() != C4Constants.LiteCoreError.BUSY)) { - throw CouchbaseLiteException.convertException(e); - } + catch (LiteCoreException e) { + if (i >= DB_CLOSE_MAX_RETRIES + || e.getDomain() != C4Constants.ErrorDomain.LITE_CORE + || e.getCode() != C4Constants.LiteCoreError.BUSY) { + throw CouchbaseLiteException.convertException(e); } + Log.i( + DOMAIN, + "Database close returned BUSY, retrying in %d secs (retry %d/%d)", + DB_CLOSE_RETRY_DELAY_SECS, + i + 1, + DB_CLOSE_MAX_RETRIES); + Thread.sleep(TimeUnit.SECONDS.toMillis(DB_CLOSE_RETRY_DELAY_SECS)); } - - // If we get here then, despite the fact that it appears to us that all - // active processes have been stopped, LiteCore has other ideas. - // We have no way of finding out what LiteCore thinks, other than waiting - // a bit and trying the again. 
Since verifyActiveProcess will count down - // the latch, we need a new one with a count of at least 2 in order to force - // a wait - closeLatch = new CountDownLatch(2); } } catch (InterruptedException ignore) { }