From 4de155e9091431352b65721d6ff30d2fed531d8b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Apr 2026 20:09:32 +0000 Subject: [PATCH 1/4] Initial plan From 0676e375160964bad6eabb1d96cba73013f6458a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Apr 2026 20:14:04 +0000 Subject: [PATCH 2/4] Remove unused normalizing_util.rs module from diskann-providers The normalizing_util module exported three functions (normalize_data_file, normalize_data_internal, normalize_data_internal_no_cblas) that had no callers anywhere in the codebase. Removing the module and its re-exports from utils/mod.rs as part of the diskann-providers cruft cleanup (#899). Closes #901 Agent-Logs-Url: https://github.com/microsoft/DiskANN/sessions/96941172-bd7d-4bae-815d-af0d0fd8cf6b Co-authored-by: harsha-simhadri <5590673+harsha-simhadri@users.noreply.github.com> --- diskann-providers/src/utils/mod.rs | 5 - .../src/utils/normalizing_util.rs | 168 ------------------ 2 files changed, 173 deletions(-) delete mode 100644 diskann-providers/src/utils/normalizing_util.rs diff --git a/diskann-providers/src/utils/mod.rs b/diskann-providers/src/utils/mod.rs index 87ff9df0b..341853a2d 100644 --- a/diskann-providers/src/utils/mod.rs +++ b/diskann-providers/src/utils/mod.rs @@ -9,11 +9,6 @@ pub use file_util::{ open_file_to_write, }; -mod normalizing_util; -pub use normalizing_util::{ - normalize_data_file, normalize_data_internal, normalize_data_internal_no_cblas, -}; - #[allow(clippy::module_inception)] mod utils; pub use utils::DatasetDto; diff --git a/diskann-providers/src/utils/normalizing_util.rs b/diskann-providers/src/utils/normalizing_util.rs deleted file mode 100644 index 0ff98c9e6..000000000 --- a/diskann-providers/src/utils/normalizing_util.rs +++ /dev/null @@ -1,168 +0,0 @@ -/* - * Copyright (c) Microsoft Corporation. - * Licensed under the MIT license. - */ -use std::{ - io::{BufReader, BufWriter, Read, Write}, - sync::atomic::{AtomicBool, Ordering}, -}; - -use crate::storage::{StorageReadProvider, StorageWriteProvider}; -use diskann::{ANNError, ANNResult}; -use diskann_utils::io::Metadata; -use diskann_vector::{Norm, norm::FastL2Norm}; -use rayon::prelude::*; -use tracing::info; - -use super::{AsThreadPool, ParallelIteratorInPool, RayonThreadPool}; -use crate::forward_threadpool; - -/// The normalizing_utils derives from the DiskANN c++ utils. -pub fn normalize_data_file( - in_file_name: &str, - out_file_name: &str, - storage_provider: &StorageProvider, - pool: Pool, -) -> ANNResult<()> -where - StorageProvider: StorageReadProvider + StorageWriteProvider, - Pool: AsThreadPool, -{ - let mut reader = BufReader::new(storage_provider.open_reader(in_file_name)?); - let mut writer = BufWriter::new(storage_provider.create_for_write(out_file_name)?); - - let metadata = Metadata::read(&mut reader)?; - let (npts, ndims) = (metadata.npoints(), metadata.ndims()); - metadata.write(&mut writer)?; - - info!("Normalizing FLOAT vectors in file: {}", in_file_name); - info!("Dataset: #pts = {}, # dims = {}", npts, ndims); - - let blk_size = 131072; - let nblks = npts.div_ceil(blk_size); - info!("# blks: {}", nblks); - - forward_threadpool!(pool = pool); - for i in 0..nblks { - let cblk_size = std::cmp::min(npts - i * blk_size, blk_size); - block_convert(&mut writer, &mut reader, cblk_size, ndims, pool)?; - } - - info!("Wrote normalized points to file: {}", out_file_name); - Ok(()) -} - -// Read block size buffer, convert original vector to normalized vector and write to file. -fn block_convert( - //writer: &mut BufWriter, - writer: &mut W, - reader: &mut R, - cblk_size: usize, - ndims: usize, - pool: &RayonThreadPool, -) -> ANNResult<()> { - let mut buffer = vec![0; ndims * cblk_size * std::mem::size_of::()]; - - match reader.read_exact(&mut buffer) { - Ok(()) => { - let mut float_data = buffer - .chunks_exact(4) - .map(|chunk| f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]])) - .collect::>(); - normalize_data_internal(&mut float_data, ndims, pool)?; - - for &normalized_value in &float_data { - writer.write_all(&normalized_value.to_le_bytes())?; - } - } - Err(_err) => { - return ANNResult::Err(ANNError::log_file_handle_error( - "Error while reading vectors", - )); - } - } - Ok(()) -} - -pub fn normalize_data_internal_no_cblas( - data: &mut [f32], - ndims: usize, - pool: Pool, -) -> ANNResult<()> { - let zero_norm: AtomicBool = AtomicBool::new(false); - - forward_threadpool!(pool = pool); - data.par_chunks_mut(ndims).for_each_in_pool(pool, |chunk| { - let norm_pt = chunk.iter().map(|val| val * val).sum::().sqrt(); - if norm_pt != 0.0 { - chunk.iter_mut().for_each(|val| *val /= norm_pt); - } else { - zero_norm.store(true, Ordering::SeqCst); - } - }); - - if zero_norm.load(Ordering::SeqCst) { - return Err(ANNError::log_index_error( - "Zero norm encountered. Unable to compute with zero-norm vector.", - )); - } - - Ok(()) -} - -pub fn normalize_data_internal( - data: &mut [f32], - ndims: usize, - pool: Pool, -) -> ANNResult<()> { - let zero_norm = AtomicBool::new(false); - - forward_threadpool!(pool = pool); - data.par_chunks_mut(ndims).for_each_in_pool(pool, |chunk| { - let norm_pt: f32 = (FastL2Norm).evaluate(&*chunk); - if norm_pt != 0.0 { - chunk.iter_mut().for_each(|val| *val /= norm_pt); - } else { - zero_norm.store(true, Ordering::SeqCst); - } - }); - - if zero_norm.load(Ordering::SeqCst) { - return Err(ANNError::log_index_error( - "Zero norm encountered. Unable to compute with zero-norm vector.", - )); - } - - Ok(()) -} - -#[cfg(test)] -mod normalizing_utils_test { - use crate::storage::{StorageReadProvider, VirtualStorageProvider}; - use diskann_utils::{io::read_bin, test_data_root}; - - use super::*; - use crate::utils::create_thread_pool_for_test; - - #[test] - fn test_normalize_data_file() { - let in_file_name = "/sift/siftsmall_learn_256pts.fbin"; - let norm_file_name = "/sift/siftsmall_learn_256pts_normalized.fbin"; - let out_file_name = "/siftsmall_learn_256pts_normalized.fbin"; - - let storage_provider = VirtualStorageProvider::new_overlay(test_data_root()); - let pool = create_thread_pool_for_test(); - normalize_data_file(in_file_name, out_file_name, &storage_provider, &pool).unwrap(); - - let load_data = - read_bin::(&mut storage_provider.open_reader(out_file_name).unwrap()).unwrap(); - storage_provider - .delete(out_file_name) - .expect("Should be able to delete temp file"); - - let norm_data = - read_bin::(&mut storage_provider.open_reader(norm_file_name).unwrap()).unwrap(); - - assert_eq!(load_data, norm_data); - } -} From fad4e123172efb0ed1e9c6a4fbdc58741d73039c Mon Sep 17 00:00:00 2001 From: Harsha Simhadri Date: Mon, 6 Apr 2026 13:31:44 -0700 Subject: [PATCH 3/4] Update diskann-providers/src/utils/mod.rs Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- diskann-providers/src/utils/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskann-providers/src/utils/mod.rs b/diskann-providers/src/utils/mod.rs index 341853a2d..2ed3c624b 100644 --- a/diskann-providers/src/utils/mod.rs +++ b/diskann-providers/src/utils/mod.rs @@ -10,7 +10,7 @@ pub use file_util::{ }; #[allow(clippy::module_inception)] -mod utils; +pub mod utils; pub use utils::DatasetDto; mod bridge_error; From 0d9c3113798e8feffbe9926942df590047fd581c Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Mon, 6 Apr 2026 22:12:28 +0000 Subject: [PATCH 4/4] Revert utils module back to private visibility Agent-Logs-Url: https://github.com/microsoft/DiskANN/sessions/b46a05b5-8b1a-4082-a57a-cdb207203763 Co-authored-by: hildebrandmw <24898651+hildebrandmw@users.noreply.github.com> --- diskann-providers/src/utils/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/diskann-providers/src/utils/mod.rs b/diskann-providers/src/utils/mod.rs index 2ed3c624b..341853a2d 100644 --- a/diskann-providers/src/utils/mod.rs +++ b/diskann-providers/src/utils/mod.rs @@ -10,7 +10,7 @@ pub use file_util::{ }; #[allow(clippy::module_inception)] -pub mod utils; +mod utils; pub use utils::DatasetDto; mod bridge_error;