From b47f224d83c1500cb24a0bd307d803c1f2cb62eb Mon Sep 17 00:00:00 2001 From: Anoop Narang Date: Mon, 23 Mar 2026 17:14:37 +0530 Subject: [PATCH 1/3] fix(sqlite-provider): checkpoint WAL after open_or_build Data written during build may only exist in the WAL file. Without an explicit checkpoint, the data can be lost if the process exits before SQLite performs a passive checkpoint. This caused empty query results when reloading the .db file after an engine restart. --- src/sqlite_provider.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/sqlite_provider.rs b/src/sqlite_provider.rs index 6fa723f..6b767ab 100644 --- a/src/sqlite_provider.rs +++ b/src/sqlite_provider.rs @@ -252,6 +252,12 @@ impl SqliteLookupProvider { )?; } + // Checkpoint WAL so the data is flushed to the main database file. + // Without this, data written during build may only exist in the WAL + // and can be lost if the process exits before a passive checkpoint. + conn.execute_batch("PRAGMA wal_checkpoint(TRUNCATE);") + .map_err(|e| DataFusionError::Execution(format!("WAL checkpoint failed: {e}")))?; + let mut conns = vec![conn]; for _ in 1..pool_size { conns.push(open_conn(db_path)?); From 469de3905b66a4f2c9469308e918c068e61ab463 Mon Sep 17 00:00:00 2001 From: Anoop Narang Date: Mon, 23 Mar 2026 17:14:52 +0530 Subject: [PATCH 2/3] feat(registry): add view_index for memory-mapped index loading view_index uses mmap instead of loading the full index into RAM, keeping resident memory proportional to the working set. Prefer this for the reload-from-disk path where the index file is already local. --- src/registry.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/registry.rs b/src/registry.rs index e7d494d..c7c7551 100644 --- a/src/registry.rs +++ b/src/registry.rs @@ -87,7 +87,7 @@ impl USearchIndexConfig { .map_err(|e| DataFusionError::Execution(format!("USearch Index::new failed: {e}"))) } - /// Load a previously saved index from `path`. 
+ /// Load a previously saved index from `path` into memory. /// /// Uses the same `IndexOptions` as `build_index()`. The options must /// match those used when the index was originally built — passing wrong @@ -101,6 +101,26 @@ impl USearchIndexConfig { Ok(index) } + /// Memory-map a previously saved index from `path`. + /// + /// Unlike [`load_index`], this does not copy the index into RAM. The OS + /// pages data in on demand, keeping resident memory proportional to the + /// working set rather than the full index size. Prefer this for the + /// reload-from-disk path where the index file is already local. + /// + /// The returned [`Index`] is fully functional for search; the backing + /// file must remain on disk for the lifetime of the index. + /// + /// [`load_index`]: Self::load_index + pub fn view_index(&self, path: &str) -> Result<Index> { + let index = Index::new(&self.to_index_options()) + .map_err(|e| DataFusionError::Execution(format!("USearch Index::new failed: {e}")))?; + index + .view(path) + .map_err(|e| DataFusionError::Execution(format!("USearch index view failed: {e}")))?; + Ok(index) + } + fn to_index_options(&self) -> IndexOptions { IndexOptions { dimensions: self.dimensions, From 84499cec45b2c4da62c76e5af384b35169d1a2bd Mon Sep 17 00:00:00 2001 From: Anoop Narang Date: Mon, 23 Mar 2026 17:15:31 +0530 Subject: [PATCH 3/3] test(rule): add tests for SELECT-only-distance query pattern Covers the case where the SELECT list contains only the distance UDF and no base table columns, with bare and qualified table references. 
--- tests/optimizer_rule.rs | 49 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/tests/optimizer_rule.rs b/tests/optimizer_rule.rs index ec98fe8..de0d5fe 100644 --- a/tests/optimizer_rule.rs +++ b/tests/optimizer_rule.rs @@ -419,3 +419,52 @@ async fn test_qualified_ref_where_clause_rewrites() { "qualified ref + WHERE → filter absorbed, rule must fire\nPlan: {plan:?}" ); } + +// ═══════════════════════════════════════════════════════════════════════════════ +// SELECT only distance — no base columns projected +// ═══════════════════════════════════════════════════════════════════════════════ +// +// When the SELECT list contains only the distance UDF (no base table columns), +// the Projection node has a single computed expression. The optimizer must still +// recognise the pattern and rewrite to USearchNode. + +/// Bare table, SELECT only distance alias, ORDER BY alias — rule must fire. +#[tokio::test] +async fn test_select_only_distance_bare_rewrites() { + let ctx = make_ctx(MetricKind::L2sq).await; + let sql = + format!("SELECT l2_distance(vector, {Q}) AS dist FROM items ORDER BY dist ASC LIMIT 5"); + let plan = optimized_plan(&ctx, &sql).await; + assert!( + contains_usearch_node(&plan), + "SELECT only distance (bare) → rule must fire\nPlan: {plan:?}" + ); +} + +/// Qualified table, SELECT only distance alias, ORDER BY alias — rule must fire. +#[tokio::test] +async fn test_select_only_distance_qualified_rewrites() { + let ctx = make_ctx_qualified(MetricKind::L2sq).await; + let sql = format!( + "SELECT l2_distance(vector, {Q}) AS dist FROM datafusion.public.items ORDER BY dist ASC LIMIT 5" + ); + let plan = optimized_plan(&ctx, &sql).await; + assert!( + contains_usearch_node(&plan), + "SELECT only distance (qualified) → rule must fire\nPlan: {plan:?}" + ); +} + +/// Bare table, SELECT only distance (no alias), ORDER BY the UDF directly. 
+#[tokio::test] +async fn test_select_only_distance_no_alias_rewrites() { + let ctx = make_ctx(MetricKind::L2sq).await; + let sql = format!( + "SELECT l2_distance(vector, {Q}) FROM items ORDER BY l2_distance(vector, {Q}) ASC LIMIT 5" + ); + let plan = optimized_plan(&ctx, &sql).await; + assert!( + contains_usearch_node(&plan), + "SELECT only distance (no alias, ORDER BY UDF) → rule must fire\nPlan: {plan:?}" + ); +}