From 3357cf5c5a9e713b45cd7d2552d454b4663313e0 Mon Sep 17 00:00:00 2001 From: fastio Date: Mon, 2 Mar 2026 18:12:31 +0800 Subject: [PATCH 1/4] feat: add clickhouse-bench with auto-downloaded ClickHouse binary Introduce a new clickhouse-bench benchmark crate that runs ClickBench queries against Parquet data via clickhouse-local, providing a baseline for comparing Vortex performance against ClickHouse. Key design decisions: - build.rs auto-downloads the full ClickHouse binary (with Parquet support) into target/clickhouse-local/, similar to how vortex-duckdb downloads the DuckDB library. This eliminates manual install steps and avoids issues with slim/homebrew builds lacking Parquet support. - The binary path is baked in via CLICKHOUSE_BINARY env at compile time; the CLICKHOUSE_LOCAL env var allows overriding the binary path at build time (build.rs re-runs when it changes). - ClickHouse-dialect SQL queries are maintained in a separate clickbench_clickhouse_queries.sql file (43 queries). - CI workflows updated to include clickhouse:parquet target in ClickBench benchmarks and conditionally build clickhouse-bench. 
--- .github/scripts/run-sql-bench.sh | 20 +- .github/workflows/bench.yml | 2 +- .github/workflows/sql-benchmarks.yml | 5 +- Cargo.lock | 12 + Cargo.toml | 1 + benchmarks/clickhouse-bench/Cargo.toml | 25 ++ benchmarks/clickhouse-bench/build.rs | 117 ++++++++++ benchmarks/clickhouse-bench/src/lib.rs | 216 ++++++++++++++++++ benchmarks/clickhouse-bench/src/main.rs | 105 +++++++++ .../clickbench_clickhouse_queries.sql | 43 ++++ vortex-bench/src/clickbench/benchmark.rs | 30 ++- vortex-bench/src/lib.rs | 4 + 12 files changed, 570 insertions(+), 10 deletions(-) create mode 100644 benchmarks/clickhouse-bench/Cargo.toml create mode 100644 benchmarks/clickhouse-bench/build.rs create mode 100644 benchmarks/clickhouse-bench/src/lib.rs create mode 100644 benchmarks/clickhouse-bench/src/main.rs create mode 100644 vortex-bench/clickbench_clickhouse_queries.sql diff --git a/.github/scripts/run-sql-bench.sh b/.github/scripts/run-sql-bench.sh index 93e96cb89dd..73d2a26d962 100755 --- a/.github/scripts/run-sql-bench.sh +++ b/.github/scripts/run-sql-bench.sh @@ -2,8 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright the Vortex contributors # -# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets. -# This script is used by the sql-benchmarks.yml workflow. +# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench, clickhouse-bench) +# for the given targets. This script is used by the sql-benchmarks.yml workflow. 
# # Usage: # run-sql-bench.sh [options] @@ -11,12 +11,12 @@ # Arguments: # subcommand The benchmark subcommand (e.g., tpch, clickbench, tpcds) # targets Comma-separated list of engine:format pairs -# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet") +# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet,clickhouse:parquet") # # Options: # --scale-factor Scale factor for the benchmark (e.g., 1.0, 10.0) # --remote-storage Remote storage URL (e.g., s3://bucket/path/) -# If provided, runs in remote mode (no lance support). +# If provided, runs in remote mode (no lance/clickhouse support). # --benchmark-id Benchmark ID for error messages (e.g., tpch-s3) set -Eeu -o pipefail @@ -78,6 +78,7 @@ fi df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false") +has_clickhouse=$(echo "$targets" | grep -q '^clickhouse:' && echo "true" || echo "false") # Build options string. opts="" @@ -127,3 +128,14 @@ if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/l cat lance-results.json >> results.json fi + +# ClickHouse-bench only runs for local benchmarks (clickhouse-local reads local files). +if ! 
$is_remote && [[ "$has_clickhouse" == "true" ]] && [[ -f "target/release_debug/clickhouse-bench" ]]; then + # shellcheck disable=SC2086 + target/release_debug/clickhouse-bench \ + -d gh-json \ + $opts \ + -o ch-results.json + + cat ch-results.json >> results.json +fi diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index 68f97854045..5507d268e8a 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -110,7 +110,7 @@ jobs: "id": "clickbench-nvme", "subcommand": "clickbench", "name": "Clickbench on NVME", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", + "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb,clickhouse:parquet", "build_lance": true }, { diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 030953c100b..ecdcc069096 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -21,7 +21,7 @@ on: "id": "clickbench-nvme", "subcommand": "clickbench", "name": "Clickbench on NVME", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb" + "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb,clickhouse:parquet" }, { "id": "tpch-nvme", @@ -136,6 +136,9 @@ jobs: if [ "${{ matrix.build_lance }}" = "true" ]; then packages="$packages --bin lance-bench" fi + if echo "${{ matrix.targets }}" | grep -q 'clickhouse:'; then + packages="$packages --bin clickhouse-bench" + fi cargo build $packages --profile release_debug - name: Generate data diff --git a/Cargo.lock b/Cargo.lock index 091dff9418d..95fe9ecf59b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1582,6 +1582,18 @@ 
version = "0.7.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" +[[package]] +name = "clickhouse-bench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "reqwest", + "tokio", + "tracing", + "vortex-bench", +] + [[package]] name = "cmake" version = "0.1.57" diff --git a/Cargo.toml b/Cargo.toml index ba9a0268b87..8b1186c5f00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -51,6 +51,7 @@ members = [ "encodings/zstd", "encodings/bytebool", # Benchmarks + "benchmarks/clickhouse-bench", "benchmarks/lance-bench", "benchmarks/compress-bench", "benchmarks/datafusion-bench", diff --git a/benchmarks/clickhouse-bench/Cargo.toml b/benchmarks/clickhouse-bench/Cargo.toml new file mode 100644 index 00000000000..789cce5a69f --- /dev/null +++ b/benchmarks/clickhouse-bench/Cargo.toml @@ -0,0 +1,25 @@ +[package] +name = "clickhouse-bench" +description = "ClickHouse (clickhouse-local) benchmark runner for Vortex" +authors.workspace = true +edition.workspace = true +homepage.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true +publish = false + +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true, features = ["derive"] } +tokio = { workspace = true, features = ["full"] } +tracing = { workspace = true } +vortex-bench = { workspace = true } + +[build-dependencies] +reqwest = { workspace = true, features = ["blocking"] } + +[lints] +workspace = true diff --git a/benchmarks/clickhouse-bench/build.rs b/benchmarks/clickhouse-bench/build.rs new file mode 100644 index 00000000000..917d248f9d8 --- /dev/null +++ b/benchmarks/clickhouse-bench/build.rs @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Build script that downloads a full ClickHouse binary (with Parquet support) +//! 
into the target directory. The binary path is exported via +//! `cargo:rustc-env=CLICKHOUSE_BINARY=...` so that `lib.rs` can locate it at runtime +//! via `env!("CLICKHOUSE_BINARY")` without any user-installed dependency. +//! +//! The approach mirrors `vortex-duckdb/build.rs` which auto-downloads a DuckDB dylib. +//! +//! Resolution order: +//! 1. `CLICKHOUSE_LOCAL` env var — use as-is (skip download). +//! 2. Download from `builds.clickhouse.com` (official master builds) into +//! `target/clickhouse-local/clickhouse`. +//! +//! We use the official master builds because macOS binaries are only available +//! from `builds.clickhouse.com`, not from the tgz/stable package repos. + +#![allow(clippy::unwrap_used)] +#![allow(clippy::expect_used)] +#![allow(clippy::panic)] + +use std::env; +use std::fs; +use std::os::unix::fs::PermissionsExt; +use std::path::PathBuf; + +/// Returns the download URL for the clickhouse binary based on the compilation target. +fn download_url() -> Result<String, Box<dyn std::error::Error>> { + let target = env::var("TARGET")?; + let dir = match target.as_str() { + "x86_64-apple-darwin" => "macos", + "aarch64-apple-darwin" => "macos-aarch64", + "x86_64-unknown-linux-gnu" => "amd64", + "aarch64-unknown-linux-gnu" => "aarch64", + other => return Err(format!("Unsupported target for clickhouse download: {other}").into()), + }; + Ok(format!( + "https://builds.clickhouse.com/master/{dir}/clickhouse" + )) +} + +/// Get the base target directory for ClickHouse artifacts. +fn target_dir() -> PathBuf { + let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + manifest_dir.parent().unwrap().parent().unwrap().join("target") +} + +fn main() { + println!("cargo:rerun-if-env-changed=CLICKHOUSE_LOCAL"); + + // If the user explicitly provides a binary path, just export it. 
+ if let Ok(path) = env::var("CLICKHOUSE_LOCAL") { + println!("cargo:rustc-env=CLICKHOUSE_BINARY={path}"); + return; + } + + let ch_dir = target_dir().join("clickhouse-local"); + let binary_path = ch_dir.join("clickhouse"); + + // If the binary already exists (and is executable), skip download. + if binary_path.exists() { + println!("cargo:rustc-env=CLICKHOUSE_BINARY={}", binary_path.display()); + return; + } + + // Download the full ClickHouse binary. + let url = download_url().expect("Failed to determine clickhouse download URL"); + println!("cargo:warning=Downloading ClickHouse binary from {url} (this may take a minute)..."); + + fs::create_dir_all(&ch_dir).expect("Failed to create clickhouse-local directory"); + + let timeout_secs: u64 = env::var("CARGO_HTTP_TIMEOUT") + .or_else(|_| env::var("HTTP_TIMEOUT")) + .ok() + .and_then(|s| s.parse().ok()) + .unwrap_or(300); // 5 minute timeout for ~160MB download + + let client = reqwest::blocking::Client::builder() + .timeout(std::time::Duration::from_secs(timeout_secs)) + .build() + .expect("Failed to create HTTP client"); + + let response = client + .get(&url) + .send() + .expect("Failed to download ClickHouse binary"); + + assert!( + response.status().is_success(), + "Failed to download ClickHouse binary: HTTP {}", + response.status() + ); + + let bytes = response + .bytes() + .expect("Failed to read ClickHouse binary response body"); + + // Write to a temporary file first, then rename (atomic on same filesystem). + let tmp_path = ch_dir.join("clickhouse.tmp"); + fs::write(&tmp_path, &bytes).expect("Failed to write ClickHouse binary"); + + // Make it executable (0o755). 
+ let mut perms = fs::metadata(&tmp_path) + .expect("Failed to read tmp binary metadata") + .permissions(); + perms.set_mode(0o755); + fs::set_permissions(&tmp_path, perms).expect("Failed to set executable permissions"); + + fs::rename(&tmp_path, &binary_path).expect("Failed to rename ClickHouse binary into place"); + + println!("cargo:rustc-env=CLICKHOUSE_BINARY={}", binary_path.display()); + println!( + "cargo:warning=ClickHouse binary downloaded to {}", + binary_path.display() + ); +} diff --git a/benchmarks/clickhouse-bench/src/lib.rs b/benchmarks/clickhouse-bench/src/lib.rs new file mode 100644 index 00000000000..6f622b9c3a1 --- /dev/null +++ b/benchmarks/clickhouse-bench/src/lib.rs @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! ClickHouse Local context for benchmarks. +//! +//! Uses `clickhouse-local` via `std::process::Command` to execute SQL queries +//! against Parquet files on disk. +//! +//! The ClickHouse binary is **automatically downloaded** at build time by `build.rs` +//! (similar to how `vortex-duckdb/build.rs` downloads the DuckDB dynamic library). +//! No manual installation is required. +//! +//! ## Scan API Evaluation for ClickHouse Integration +//! +//! Per @gatesn's request in Discussion #6425, we evaluated whether the Vortex Scan API +//! (`vortex-scan/src/api.rs`) can support a good ClickHouse integration. +//! +//! ### Mapping +//! +//! The Scan API's four-layer abstraction maps naturally to ClickHouse: +//! +//! | Scan API | ClickHouse mapping | +//! |---|---| +//! | `DataSource` | Table metadata + connection config (`Send + Sync`, shareable) | +//! | `ScanRequest.projection` | `SELECT` column/expression pushdown (needs `Expression` → SQL converter) | +//! | `ScanRequest.filter` | `WHERE` clause pushdown (similar to `vortex-datafusion/convert/exprs.rs`) | +//! | `ScanRequest.limit` | `LIMIT N` pushdown (trivial) | +//! 
| `DataSourceScan` | Query planning + partition discovery (`system.parts` or file-level) | +//! | `Split` | Per-partition query execution unit | +//! | `Split::execute()` | Executes partition query, streams results as `SendableArrayStream` | +//! +//! ### Potential API Gaps +//! +//! 1. **No engine capability negotiation** — `DataSource` cannot declare which expression types +//! it supports for pushdown. Suggest adding `capabilities()` method. +//! 2. **`Split::execute()` is sync** — ClickHouse queries are inherently async (network I/O). +//! The pattern used by `LayoutReaderDataSource` (pre-compute `BoxFuture` in `scan()`) works +//! but should be documented as the recommended approach. +//! 3. **No column statistics API** — only `row_count_estimate()` exists. ClickHouse has rich +//! column stats (min/max/NDV) that could enable better query planning. +//! 4. **No transaction/snapshot semantics** — could lead to inconsistent reads across splits +//! on ClickHouse replicas. +//! +//! ### Conclusion +//! +//! The Scan API is a reasonable fit. None of the gaps are blockers. The recommended integration +//! order is: +//! 1. This PR: ClickBench baseline with `clickhouse-local` CLI (performance reference) +//! 2. `vortex-clickhouse` crate with type conversion (DType ↔ ClickHouse types) +//! 3. `ClickHouseDataSource` implementing `DataSource` trait (basic scan, no pushdown) +//! 4. Filter pushdown (`Expression` → ClickHouse WHERE clause) +//! 5. Projection pushdown and performance optimization + +use std::io::Write; +use std::path::PathBuf; +use std::process::Command; +use std::process::Stdio; +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use tracing::trace; +use vortex_bench::Benchmark; +use vortex_bench::Format; + +/// Path to the ClickHouse binary, set by build.rs at compile time. +const CLICKHOUSE_BINARY: &str = env!("CLICKHOUSE_BINARY"); + +/// A client that wraps `clickhouse-local` for running SQL benchmarks. 
+pub struct ClickHouseClient { + /// The path to the `clickhouse` binary. + binary: PathBuf, + /// SQL statements to run before each query (CREATE VIEW statements). + setup_sql: Vec<String>, +} + +impl ClickHouseClient { + /// Create a new client. Only Parquet format is supported. + pub fn new(benchmark: &dyn Benchmark, format: Format) -> Result<Self> { + if format != Format::Parquet { + anyhow::bail!("clickhouse-bench only supports Parquet format, got {format}"); + } + + let binary = PathBuf::from(CLICKHOUSE_BINARY); + anyhow::ensure!( + binary.exists(), + "ClickHouse binary not found at '{}'. \ + This should have been downloaded by build.rs. Try `cargo clean -p clickhouse-bench`.", + binary.display() + ); + + tracing::info!(binary = %binary.display(), "Using clickhouse-local"); + + let mut client = Self { + binary, + setup_sql: Vec::new(), + }; + client.register_tables(benchmark, format)?; + Ok(client) + } + + /// Generate `CREATE VIEW ... AS SELECT * FROM file(...)` statements. + /// + /// We use a VIEW over the `file()` table function rather than `CREATE TABLE ... ENGINE = File()` + /// because the `file()` function handles glob patterns (e.g., `*.parquet`) more reliably across + /// ClickHouse versions. + fn register_tables(&mut self, benchmark: &dyn Benchmark, format: Format) -> Result<()> { + let data_url = benchmark.data_url(); + let base_dir = if data_url.scheme() == "file" { + data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {data_url}"))? + } else { + anyhow::bail!("clickhouse-bench only supports local file:// data URLs"); + }; + + let format_dir = base_dir.join(format.name()); + if !format_dir.exists() { + anyhow::bail!( + "Data directory does not exist: {}. Run data generation first.", + format_dir.display() + ); + } + + for table_spec in benchmark.table_specs() { + let name = table_spec.name; + let pattern = benchmark + .pattern(name, format) + .map(|p| p.to_string()) + .unwrap_or_else(|| format!("*.{}", format.ext())); + + let data_path = format!("{}/{}", format_dir.display(), pattern); + + tracing::info!( + table = name, + path = %data_path, + "Registering ClickHouse table" + ); + + let create_sql = format!( + "CREATE VIEW IF NOT EXISTS {name} AS \ + SELECT * FROM file('{data_path}', Parquet);" + ); + self.setup_sql.push(create_sql); + } + + Ok(()) + } + + /// Execute a SQL query via `clickhouse-local`, returning `(row_count, timing)`. + /// + /// The approach: + /// 1. Prepend all CREATE VIEW statements + /// 2. Append the benchmark query + /// 3. Pipe the combined SQL into `clickhouse local` via stdin + /// 4. Parse stdout to count result rows + pub fn execute_query(&self, query: &str) -> Result<(usize, Option<Duration>)> { + trace!("execute clickhouse query: {query}"); + + // Build the full SQL: setup views + the actual query + let mut full_sql = String::new(); + for stmt in &self.setup_sql { + full_sql.push_str(stmt); + full_sql.push('\n'); + } + full_sql.push_str(query); + // Ensure we have a trailing semicolon + if !query.trim_end().ends_with(';') { + full_sql.push(';'); + } + + let time_instant = Instant::now(); + + // The downloaded binary is the multi-tool `clickhouse` binary, + // so we always invoke it as `clickhouse local`. 
+ let mut child = Command::new(&self.binary) + .args(["local", "--format", "TabSeparated"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn clickhouse-local")?; + + // Write SQL to stdin + { + let stdin = child + .stdin + .as_mut() + .context("Failed to open clickhouse-local stdin")?; + stdin + .write_all(full_sql.as_bytes()) + .context("Failed to write SQL to clickhouse-local stdin")?; + } + + let output = child + .wait_with_output() + .context("Failed to wait for clickhouse-local")?; + + let query_time = time_instant.elapsed(); + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!( + "clickhouse-local failed (exit {}): {stderr}", + output.status.code().unwrap_or(-1) + ); + } + + // Count non-empty lines in stdout as row count + let stdout = String::from_utf8_lossy(&output.stdout); + let row_count = stdout.lines().filter(|line| !line.is_empty()).count(); + + Ok((row_count, Some(query_time))) + } +} diff --git a/benchmarks/clickhouse-bench/src/main.rs b/benchmarks/clickhouse-bench/src/main.rs new file mode 100644 index 00000000000..03e8db27fd6 --- /dev/null +++ b/benchmarks/clickhouse-bench/src/main.rs @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::path::PathBuf; + +use clap::Parser; +use clickhouse_bench::ClickHouseClient; +use tokio::runtime::Runtime; +use vortex_bench::Benchmark; +use vortex_bench::Engine; +use vortex_bench::Format; +use vortex_bench::Opt; +use vortex_bench::Opts; +use vortex_bench::clickbench::ClickBenchBenchmark; +use vortex_bench::clickbench::Flavor; +use vortex_bench::create_output_writer; +use vortex_bench::display::DisplayFormat; +use vortex_bench::runner::SqlBenchmarkRunner; +use vortex_bench::runner::filter_queries; +use vortex_bench::setup_logging_and_tracing; + +/// ClickHouse (clickhouse-local) benchmark runner. 
+/// +/// Runs ClickBench queries against Parquet data using clickhouse-local as a performance baseline. +/// This allows comparing ClickHouse's native Parquet reading performance against other engines +/// (DuckDB, DataFusion) on the same hardware and dataset. +#[derive(Parser)] +struct Args { + #[arg(short, long, default_value_t = 5)] + iterations: usize, + + #[arg(short, long)] + verbose: bool, + + #[arg(long)] + tracing: bool, + + #[arg(short, long, default_value_t, value_enum)] + display_format: DisplayFormat, + + #[arg(short, long, value_delimiter = ',')] + queries: Option<Vec<String>>, + + #[arg(short, long, value_delimiter = ',')] + exclude_queries: Option<Vec<String>>, + + #[arg(short)] + output_path: Option<PathBuf>, + + #[arg(long, default_value_t = false)] + track_memory: bool, + + #[arg(long, default_value_t = false)] + hide_progress_bar: bool, + + #[arg(long = "opt", value_delimiter = ',', value_parser = clap::value_parser!(Opt))] + options: Vec<Opt>, +} + +fn main() -> anyhow::Result<()> { + let args = Args::parse(); + let opts = Opts::from(args.options); + + setup_logging_and_tracing(args.verbose, args.tracing)?; + + let flavor = opts.get_as::<Flavor>("flavor").unwrap_or_default(); + let remote_data_dir = opts.get_as::<String>("remote-data-dir"); + let benchmark = + ClickBenchBenchmark::new(flavor, None, remote_data_dir)?.with_engine(Engine::ClickHouse); + + let filtered_queries = filter_queries( + benchmark.queries()?, + args.queries.as_ref(), + args.exclude_queries.as_ref(), + ); + + // Generate base Parquet data if needed. 
+ if benchmark.data_url().scheme() == "file" { + let runtime = Runtime::new()?; + runtime.block_on(async { benchmark.generate_base_data().await })?; + } + + let formats = vec![Format::Parquet]; + + let mut runner = SqlBenchmarkRunner::new( + &benchmark, + Engine::ClickHouse, + formats, + args.track_memory, + args.hide_progress_bar, + )?; + + runner.run_all( + &filtered_queries, + args.iterations, + |format| ClickHouseClient::new(&benchmark, format), + |ctx, _query_idx, _format, query| ctx.execute_query(query), + )?; + + let benchmark_id = format!("clickhouse-{}", benchmark.dataset_name()); + let writer = create_output_writer(&args.display_format, args.output_path, &benchmark_id)?; + runner.export_to(&args.display_format, writer)?; + + Ok(()) +} diff --git a/vortex-bench/clickbench_clickhouse_queries.sql b/vortex-bench/clickbench_clickhouse_queries.sql new file mode 100644 index 00000000000..31f65fc898d --- /dev/null +++ b/vortex-bench/clickbench_clickhouse_queries.sql @@ -0,0 +1,43 @@ +SELECT COUNT(*) FROM hits; +SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; +SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; +SELECT AVG(UserID) FROM hits; +SELECT COUNT(DISTINCT UserID) FROM hits; +SELECT COUNT(DISTINCT SearchPhrase) FROM hits; +SELECT MIN(EventDate), MAX(EventDate) FROM hits; +SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; +SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; +SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY RegionID ORDER BY c DESC LIMIT 10; +SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; +SELECT 
SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; +SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; +SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; +SELECT UserID FROM hits WHERE UserID = 435090932899640449; +SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; +SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; +SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; +SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime, SearchPhrase LIMIT 10; +SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k 
HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; +SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), 
SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; +SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; +SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; +SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; +SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; +SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 
OFFSET 1000; +SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; +SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; +SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; +SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/vortex-bench/src/clickbench/benchmark.rs b/vortex-bench/src/clickbench/benchmark.rs index 5e14cbcf40e..4f4a143b92a 100644 --- a/vortex-bench/src/clickbench/benchmark.rs +++ b/vortex-bench/src/clickbench/benchmark.rs @@ -4,6 +4,7 @@ use std::env; use std::fs; use std::path::Path; +use std::path::PathBuf; use anyhow::Result; use reqwest::Client; @@ -12,6 +13,7 @@ use vortex::error::VortexExpect; use crate::Benchmark; use crate::BenchmarkDataset; +use crate::Engine; use crate::IdempotentPath; use crate::TableSpec; use crate::clickbench::*; @@ -21,6 +23,8 @@ pub struct ClickBenchBenchmark { pub flavor: Flavor, pub queries_file: Option, pub data_url: 
Url, + /// Override the engine to select engine-specific query files. + pub engine: Option, } impl ClickBenchBenchmark { @@ -34,9 +38,30 @@ impl ClickBenchBenchmark { flavor, queries_file, data_url: url, + engine: None, }) } + /// Set the engine to select engine-specific query files. + pub fn with_engine(mut self, engine: Engine) -> Self { + self.engine = Some(engine); + self + } + + /// Returns the path to the queries file for the given engine. + fn queries_file_path(&self) -> PathBuf { + if let Some(file) = &self.queries_file { + return file.into(); + } + let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); + match self.engine { + Some(Engine::ClickHouse) => { + manifest_dir.join("clickbench_clickhouse_queries.sql") + } + _ => manifest_dir.join("clickbench_queries.sql"), + } + } + fn create_data_url(remote_data_dir: &Option, flavor: Flavor) -> Result { match remote_data_dir { None => { @@ -69,10 +94,7 @@ impl ClickBenchBenchmark { #[async_trait::async_trait] impl Benchmark for ClickBenchBenchmark { fn queries(&self) -> Result> { - let queries_filepath = match &self.queries_file { - Some(file) => file.into(), - None => Path::new(env!("CARGO_MANIFEST_DIR")).join("clickbench_queries.sql"), - }; + let queries_filepath = self.queries_file_path(); Ok(fs::read_to_string(queries_filepath)? 
.split(';') diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 6dad0f0f6a1..8be4c6bcea8 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -206,6 +206,9 @@ pub enum Engine { #[clap(name = "duckdb")] #[serde(rename = "duckdb")] DuckDB, + #[clap(name = "clickhouse")] + #[serde(rename = "clickhouse")] + ClickHouse, } impl Display for Engine { @@ -213,6 +216,7 @@ impl Display for Engine { match self { Engine::DataFusion => write!(f, "datafusion"), Engine::DuckDB => write!(f, "duckdb"), + Engine::ClickHouse => write!(f, "clickhouse"), Engine::Vortex => write!(f, "vortex"), Engine::Arrow => write!(f, "arrow"), } From cdd7fb4e0edc2ad1f8754b484999f095f6fb75dc Mon Sep 17 00:00:00 2001 From: fastio Date: Tue, 3 Mar 2026 10:16:59 +0800 Subject: [PATCH 2/4] bench(clickbench): tighten ClickHouse query normalization and URL handling --- .../clickbench_clickhouse_queries.sql | 43 --------- vortex-bench/src/clickbench/benchmark.rs | 93 ++++++++++++++++--- 2 files changed, 78 insertions(+), 58 deletions(-) delete mode 100644 vortex-bench/clickbench_clickhouse_queries.sql diff --git a/vortex-bench/clickbench_clickhouse_queries.sql b/vortex-bench/clickbench_clickhouse_queries.sql deleted file mode 100644 index 31f65fc898d..00000000000 --- a/vortex-bench/clickbench_clickhouse_queries.sql +++ /dev/null @@ -1,43 +0,0 @@ -SELECT COUNT(*) FROM hits; -SELECT COUNT(*) FROM hits WHERE AdvEngineID <> 0; -SELECT SUM(AdvEngineID), COUNT(*), AVG(ResolutionWidth) FROM hits; -SELECT AVG(UserID) FROM hits; -SELECT COUNT(DISTINCT UserID) FROM hits; -SELECT COUNT(DISTINCT SearchPhrase) FROM hits; -SELECT MIN(EventDate), MAX(EventDate) FROM hits; -SELECT AdvEngineID, COUNT(*) FROM hits WHERE AdvEngineID <> 0 GROUP BY AdvEngineID ORDER BY COUNT(*) DESC; -SELECT RegionID, COUNT(DISTINCT UserID) AS u FROM hits GROUP BY RegionID ORDER BY u DESC LIMIT 10; -SELECT RegionID, SUM(AdvEngineID), COUNT(*) AS c, AVG(ResolutionWidth), COUNT(DISTINCT UserID) FROM hits GROUP BY 
RegionID ORDER BY c DESC LIMIT 10; -SELECT MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhoneModel ORDER BY u DESC LIMIT 10; -SELECT MobilePhone, MobilePhoneModel, COUNT(DISTINCT UserID) AS u FROM hits WHERE MobilePhoneModel <> '' GROUP BY MobilePhone, MobilePhoneModel ORDER BY u DESC LIMIT 10; -SELECT SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT SearchPhrase, COUNT(DISTINCT UserID) AS u FROM hits WHERE SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY u DESC LIMIT 10; -SELECT SearchEngineID, SearchPhrase, COUNT(*) AS c FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT UserID, COUNT(*) FROM hits GROUP BY UserID ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, SearchPhrase LIMIT 10; -SELECT UserID, extract(minute FROM EventTime) AS m, SearchPhrase, COUNT(*) FROM hits GROUP BY UserID, m, SearchPhrase ORDER BY COUNT(*) DESC LIMIT 10; -SELECT UserID FROM hits WHERE UserID = 435090932899640449; -SELECT COUNT(*) FROM hits WHERE URL LIKE '%google%'; -SELECT SearchPhrase, MIN(URL), COUNT(*) AS c FROM hits WHERE URL LIKE '%google%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT SearchPhrase, MIN(URL), MIN(Title), COUNT(*) AS c, COUNT(DISTINCT UserID) FROM hits WHERE Title LIKE '%Google%' AND URL NOT LIKE '%.google.%' AND SearchPhrase <> '' GROUP BY SearchPhrase ORDER BY c DESC LIMIT 10; -SELECT * FROM hits WHERE URL LIKE '%google%' ORDER BY EventTime LIMIT 10; -SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY EventTime LIMIT 10; -SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY SearchPhrase LIMIT 10; -SELECT SearchPhrase FROM hits WHERE SearchPhrase <> '' ORDER BY 
EventTime, SearchPhrase LIMIT 10; -SELECT CounterID, AVG(length(URL)) AS l, COUNT(*) AS c FROM hits WHERE URL <> '' GROUP BY CounterID HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT REGEXP_REPLACE(Referer, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k, AVG(length(Referer)) AS l, COUNT(*) AS c, MIN(Referer) FROM hits WHERE Referer <> '' GROUP BY k HAVING COUNT(*) > 100000 ORDER BY l DESC LIMIT 25; -SELECT SUM(ResolutionWidth), SUM(ResolutionWidth + 1), SUM(ResolutionWidth + 2), SUM(ResolutionWidth + 3), SUM(ResolutionWidth + 4), SUM(ResolutionWidth + 5), SUM(ResolutionWidth + 6), SUM(ResolutionWidth + 7), SUM(ResolutionWidth + 8), SUM(ResolutionWidth + 9), SUM(ResolutionWidth + 10), SUM(ResolutionWidth + 11), SUM(ResolutionWidth + 12), SUM(ResolutionWidth + 13), SUM(ResolutionWidth + 14), SUM(ResolutionWidth + 15), SUM(ResolutionWidth + 16), SUM(ResolutionWidth + 17), SUM(ResolutionWidth + 18), SUM(ResolutionWidth + 19), SUM(ResolutionWidth + 20), SUM(ResolutionWidth + 21), SUM(ResolutionWidth + 22), SUM(ResolutionWidth + 23), SUM(ResolutionWidth + 24), SUM(ResolutionWidth + 25), SUM(ResolutionWidth + 26), SUM(ResolutionWidth + 27), SUM(ResolutionWidth + 28), SUM(ResolutionWidth + 29), SUM(ResolutionWidth + 30), SUM(ResolutionWidth + 31), SUM(ResolutionWidth + 32), SUM(ResolutionWidth + 33), SUM(ResolutionWidth + 34), SUM(ResolutionWidth + 35), SUM(ResolutionWidth + 36), SUM(ResolutionWidth + 37), SUM(ResolutionWidth + 38), SUM(ResolutionWidth + 39), SUM(ResolutionWidth + 40), SUM(ResolutionWidth + 41), SUM(ResolutionWidth + 42), SUM(ResolutionWidth + 43), SUM(ResolutionWidth + 44), SUM(ResolutionWidth + 45), SUM(ResolutionWidth + 46), SUM(ResolutionWidth + 47), SUM(ResolutionWidth + 48), SUM(ResolutionWidth + 49), SUM(ResolutionWidth + 50), SUM(ResolutionWidth + 51), SUM(ResolutionWidth + 52), SUM(ResolutionWidth + 53), SUM(ResolutionWidth + 54), SUM(ResolutionWidth + 55), SUM(ResolutionWidth + 56), SUM(ResolutionWidth + 57), SUM(ResolutionWidth + 58), 
SUM(ResolutionWidth + 59), SUM(ResolutionWidth + 60), SUM(ResolutionWidth + 61), SUM(ResolutionWidth + 62), SUM(ResolutionWidth + 63), SUM(ResolutionWidth + 64), SUM(ResolutionWidth + 65), SUM(ResolutionWidth + 66), SUM(ResolutionWidth + 67), SUM(ResolutionWidth + 68), SUM(ResolutionWidth + 69), SUM(ResolutionWidth + 70), SUM(ResolutionWidth + 71), SUM(ResolutionWidth + 72), SUM(ResolutionWidth + 73), SUM(ResolutionWidth + 74), SUM(ResolutionWidth + 75), SUM(ResolutionWidth + 76), SUM(ResolutionWidth + 77), SUM(ResolutionWidth + 78), SUM(ResolutionWidth + 79), SUM(ResolutionWidth + 80), SUM(ResolutionWidth + 81), SUM(ResolutionWidth + 82), SUM(ResolutionWidth + 83), SUM(ResolutionWidth + 84), SUM(ResolutionWidth + 85), SUM(ResolutionWidth + 86), SUM(ResolutionWidth + 87), SUM(ResolutionWidth + 88), SUM(ResolutionWidth + 89) FROM hits; -SELECT SearchEngineID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY SearchEngineID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits WHERE SearchPhrase <> '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT WatchID, ClientIP, COUNT(*) AS c, SUM(IsRefresh), AVG(ResolutionWidth) FROM hits GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10; -SELECT URL, COUNT(*) AS c FROM hits GROUP BY URL ORDER BY c DESC LIMIT 10; -SELECT 1, URL, COUNT(*) AS c FROM hits GROUP BY 1, URL ORDER BY c DESC LIMIT 10; -SELECT ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3, COUNT(*) AS c FROM hits GROUP BY ClientIP, ClientIP - 1, ClientIP - 2, ClientIP - 3 ORDER BY c DESC LIMIT 10; -SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND URL <> '' GROUP BY URL ORDER BY PageViews DESC LIMIT 10; -SELECT Title, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND 
EventDate <= '2013-07-31' AND DontCountHits = 0 AND IsRefresh = 0 AND Title <> '' GROUP BY Title ORDER BY PageViews DESC LIMIT 10; -SELECT URL, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND IsLink <> 0 AND IsDownload = 0 GROUP BY URL ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT TraficSourceID, SearchEngineID, AdvEngineID, CASE WHEN (SearchEngineID = 0 AND AdvEngineID = 0) THEN Referer ELSE '' END AS Src, URL AS Dst, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 GROUP BY TraficSourceID, SearchEngineID, AdvEngineID, Src, Dst ORDER BY PageViews DESC LIMIT 10 OFFSET 1000; -SELECT URLHash, EventDate, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND TraficSourceID IN (-1, 6) AND RefererHash = 3594120000172545465 GROUP BY URLHash, EventDate ORDER BY PageViews DESC LIMIT 10 OFFSET 100; -SELECT WindowClientWidth, WindowClientHeight, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-01' AND EventDate <= '2013-07-31' AND IsRefresh = 0 AND DontCountHits = 0 AND URLHash = 2868770270353813622 GROUP BY WindowClientWidth, WindowClientHeight ORDER BY PageViews DESC LIMIT 10 OFFSET 10000; -SELECT DATE_TRUNC('minute', EventTime) AS M, COUNT(*) AS PageViews FROM hits WHERE CounterID = 62 AND EventDate >= '2013-07-14' AND EventDate <= '2013-07-15' AND IsRefresh = 0 AND DontCountHits = 0 GROUP BY DATE_TRUNC('minute', EventTime) ORDER BY DATE_TRUNC('minute', EventTime) LIMIT 10 OFFSET 1000; diff --git a/vortex-bench/src/clickbench/benchmark.rs b/vortex-bench/src/clickbench/benchmark.rs index 4f4a143b92a..cece28c6b6b 100644 --- a/vortex-bench/src/clickbench/benchmark.rs +++ b/vortex-bench/src/clickbench/benchmark.rs @@ -1,15 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // 
SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::env; use std::fs; -use std::path::Path; use std::path::PathBuf; use anyhow::Result; use reqwest::Client; use url::Url; -use vortex::error::VortexExpect; use crate::Benchmark; use crate::BenchmarkDataset; @@ -48,28 +45,36 @@ impl ClickBenchBenchmark { self } - /// Returns the path to the queries file for the given engine. + /// Returns the path to the queries file. fn queries_file_path(&self) -> PathBuf { if let Some(file) = &self.queries_file { return file.into(); } - let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); - match self.engine { - Some(Engine::ClickHouse) => { - manifest_dir.join("clickbench_clickhouse_queries.sql") - } - _ => manifest_dir.join("clickbench_queries.sql"), + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest_dir.join("clickbench_queries.sql") + } + + /// Returns true if the engine requires unquoted column names. + fn uses_unquoted_identifiers(&self) -> bool { + matches!(self.engine, Some(Engine::ClickHouse)) + } + + /// Strips double quotes only from simple SQL identifiers for engines like + /// ClickHouse that don't require quoted column names. + fn normalize_query(&self, query: &str) -> String { + if !self.uses_unquoted_identifiers() { + return query.to_string(); } + + strip_simple_identifier_quotes(query) } fn create_data_url(remote_data_dir: &Option, flavor: Flavor) -> Result { match remote_data_dir { None => { let basepath = format!("clickbench_{flavor}").to_data_path(); - Ok(Url::parse(&format!( - "file:{}/", - basepath.to_str().vortex_expect("path should be utf8") - ))?) 
+ Url::from_directory_path(basepath) + .map_err(|_| anyhow::anyhow!("Failed to convert ClickBench data path to URL")) } Some(remote_data_dir) => { if !remote_data_dir.ends_with("/") { @@ -91,6 +96,64 @@ impl ClickBenchBenchmark { } } +fn strip_simple_identifier_quotes(query: &str) -> String { + let bytes = query.as_bytes(); + let mut out = String::with_capacity(query.len()); + let mut i = 0; + + while i < query.len() { + let rel = match query[i..].find('"') { + Some(pos) => pos, + None => { + out.push_str(&query[i..]); + break; + } + }; + + let start = i + rel; + out.push_str(&query[i..start]); + + let mut end = start + 1; + while end < bytes.len() { + if bytes[end] == b'"' { + if end + 1 < bytes.len() && bytes[end + 1] == b'"' { + end += 2; + } else { + break; + } + } else { + end += 1; + } + } + + if end >= bytes.len() { + out.push_str(&query[start..]); + break; + } + + let inner = &query[start + 1..end]; + if is_simple_identifier(inner) { + out.push_str(inner); + } else { + out.push_str(&query[start..=end]); + } + + i = end + 1; + } + + out +} + +fn is_simple_identifier(s: &str) -> bool { + let mut chars = s.chars(); + let Some(first) = chars.next() else { + return false; + }; + + (first.is_ascii_alphabetic() || first == '_') + && chars.all(|c| c.is_ascii_alphanumeric() || c == '_') +} + #[async_trait::async_trait] impl Benchmark for ClickBenchBenchmark { fn queries(&self) -> Result> { @@ -100,7 +163,7 @@ impl Benchmark for ClickBenchBenchmark { .split(';') .map(|s| s.trim()) .filter(|s| !s.is_empty()) - .map(|s| s.to_string()) + .map(|s| self.normalize_query(s)) .enumerate() .collect()) } From f8883786b2739d1b664baf42751a4e9e3653cd66 Mon Sep 17 00:00:00 2001 From: fastio Date: Tue, 3 Mar 2026 21:10:34 +0800 Subject: [PATCH 3/4] bench(clickhouse): remove build-time binary download, resolve clickhouse from PATH - Remove reqwest-based binary download from build.rs - Resolve clickhouse binary via CLICKHOUSE_BINARY env var or $PATH at runtime - Add CI step to 
install clickhouse before building when needed - Fail with clear error message if binary is not found locally --- .github/workflows/sql-benchmarks.yml | 7 ++ Cargo.lock | 11 +-- benchmarks/clickhouse-bench/Cargo.toml | 3 - benchmarks/clickhouse-bench/build.rs | 115 ++----------------------- benchmarks/clickhouse-bench/src/lib.rs | 106 ++++++++++++----------- 5 files changed, 72 insertions(+), 170 deletions(-) diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 453b64b673f..b79f6d2d19a 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -127,6 +127,13 @@ jobs: - uses: ./.github/actions/system-info + - name: Install ClickHouse + if: contains(matrix.targets, 'clickhouse:') + run: | + curl https://clickhouse.com/ | sh + sudo ./clickhouse install + echo "CLICKHOUSE_BINARY=$(which clickhouse)" >> $GITHUB_ENV + - name: Build binaries shell: bash env: diff --git a/Cargo.lock b/Cargo.lock index acd33115ef0..96ddd27fc9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1191,9 +1191,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.7.7" +version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3e64b0cc0439b12df2fa678eae89a1c56a529fd067a9115f7827f1fffd22b32" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" [[package]] name = "clickhouse-bench" @@ -1201,18 +1201,11 @@ version = "0.1.0" dependencies = [ "anyhow", "clap", - "reqwest", "tokio", "tracing", "vortex-bench", ] -[[package]] -name = "cmake" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" - [[package]] name = "codespan-reporting" version = "0.13.1" diff --git a/benchmarks/clickhouse-bench/Cargo.toml b/benchmarks/clickhouse-bench/Cargo.toml index 789cce5a69f..7b26ae12053 100644 --- a/benchmarks/clickhouse-bench/Cargo.toml +++ 
b/benchmarks/clickhouse-bench/Cargo.toml @@ -18,8 +18,5 @@ tokio = { workspace = true, features = ["full"] } tracing = { workspace = true } vortex-bench = { workspace = true } -[build-dependencies] -reqwest = { workspace = true, features = ["blocking"] } - [lints] workspace = true diff --git a/benchmarks/clickhouse-bench/build.rs b/benchmarks/clickhouse-bench/build.rs index 917d248f9d8..7ef98c8e48d 100644 --- a/benchmarks/clickhouse-bench/build.rs +++ b/benchmarks/clickhouse-bench/build.rs @@ -1,117 +1,18 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Build script that downloads a full ClickHouse binary (with Parquet support) -//! into the target directory. The binary path is exported via -//! `cargo:rustc-env=CLICKHOUSE_BINARY=...` so that `lib.rs` can locate it at runtime -//! via `env!("CLICKHOUSE_BINARY")` without any user-installed dependency. -//! -//! The approach mirrors `vortex-duckdb/build.rs` which auto-downloads a DuckDB dylib. +//! Build script that exports the ClickHouse binary path. //! //! Resolution order: -//! 1. `CLICKHOUSE_LOCAL` env var — use as-is (skip download). -//! 2. Download from `builds.clickhouse.com` (official master builds) into -//! `target/clickhouse-local/clickhouse`. +//! 1. `CLICKHOUSE_BINARY` env var — use as-is. +//! 2. Falls back to `"clickhouse"` (i.e., resolve from `$PATH` at runtime). //! -//! We use the official master builds because macOS binaries are only available -//! from `builds.clickhouse.com`, not from the tgz/stable package repos. - -#![allow(clippy::unwrap_used)] -#![allow(clippy::expect_used)] -#![allow(clippy::panic)] - -use std::env; -use std::fs; -use std::os::unix::fs::PermissionsExt; -use std::path::PathBuf; - -/// Returns the download URL for the clickhouse binary based on the compilation target. 
-fn download_url() -> Result> { - let target = env::var("TARGET")?; - let dir = match target.as_str() { - "x86_64-apple-darwin" => "macos", - "aarch64-apple-darwin" => "macos-aarch64", - "x86_64-unknown-linux-gnu" => "amd64", - "aarch64-unknown-linux-gnu" => "aarch64", - other => return Err(format!("Unsupported target for clickhouse download: {other}").into()), - }; - Ok(format!( - "https://builds.clickhouse.com/master/{dir}/clickhouse" - )) -} - -/// Get the base target directory for ClickHouse artifacts. -fn target_dir() -> PathBuf { - let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); - manifest_dir.parent().unwrap().parent().unwrap().join("target") -} +//! Users must install ClickHouse themselves for local runs. +//! In CI, it is installed via the workflow before the benchmark step. fn main() { - println!("cargo:rerun-if-env-changed=CLICKHOUSE_LOCAL"); - - // If the user explicitly provides a binary path, just export it. - if let Ok(path) = env::var("CLICKHOUSE_LOCAL") { - println!("cargo:rustc-env=CLICKHOUSE_BINARY={path}"); - return; - } - - let ch_dir = target_dir().join("clickhouse-local"); - let binary_path = ch_dir.join("clickhouse"); - - // If the binary already exists (and is executable), skip download. - if binary_path.exists() { - println!("cargo:rustc-env=CLICKHOUSE_BINARY={}", binary_path.display()); - return; - } - - // Download the full ClickHouse binary. 
- let url = download_url().expect("Failed to determine clickhouse download URL"); - println!("cargo:warning=Downloading ClickHouse binary from {url} (this may take a minute)..."); - - fs::create_dir_all(&ch_dir).expect("Failed to create clickhouse-local directory"); - - let timeout_secs: u64 = env::var("CARGO_HTTP_TIMEOUT") - .or_else(|_| env::var("HTTP_TIMEOUT")) - .ok() - .and_then(|s| s.parse().ok()) - .unwrap_or(300); // 5 minute timeout for ~160MB download - - let client = reqwest::blocking::Client::builder() - .timeout(std::time::Duration::from_secs(timeout_secs)) - .build() - .expect("Failed to create HTTP client"); - - let response = client - .get(&url) - .send() - .expect("Failed to download ClickHouse binary"); - - assert!( - response.status().is_success(), - "Failed to download ClickHouse binary: HTTP {}", - response.status() - ); - - let bytes = response - .bytes() - .expect("Failed to read ClickHouse binary response body"); - - // Write to a temporary file first, then rename (atomic on same filesystem). - let tmp_path = ch_dir.join("clickhouse.tmp"); - fs::write(&tmp_path, &bytes).expect("Failed to write ClickHouse binary"); - - // Make it executable (0o755). 
- let mut perms = fs::metadata(&tmp_path) - .expect("Failed to read tmp binary metadata") - .permissions(); - perms.set_mode(0o755); - fs::set_permissions(&tmp_path, perms).expect("Failed to set executable permissions"); - - fs::rename(&tmp_path, &binary_path).expect("Failed to rename ClickHouse binary into place"); + println!("cargo:rerun-if-env-changed=CLICKHOUSE_BINARY"); - println!("cargo:rustc-env=CLICKHOUSE_BINARY={}", binary_path.display()); - println!( - "cargo:warning=ClickHouse binary downloaded to {}", - binary_path.display() - ); + let binary = std::env::var("CLICKHOUSE_BINARY").unwrap_or_else(|_| "clickhouse".to_string()); + println!("cargo:rustc-env=CLICKHOUSE_BINARY={binary}"); } diff --git a/benchmarks/clickhouse-bench/src/lib.rs b/benchmarks/clickhouse-bench/src/lib.rs index 6f622b9c3a1..960675a251f 100644 --- a/benchmarks/clickhouse-bench/src/lib.rs +++ b/benchmarks/clickhouse-bench/src/lib.rs @@ -6,50 +6,13 @@ //! Uses `clickhouse-local` via `std::process::Command` to execute SQL queries //! against Parquet files on disk. //! -//! The ClickHouse binary is **automatically downloaded** at build time by `build.rs` -//! (similar to how `vortex-duckdb/build.rs` downloads the DuckDB dynamic library). -//! No manual installation is required. +//! The ClickHouse binary is resolved at build time via `build.rs`: +//! 1. `CLICKHOUSE_BINARY` env var — use the specified path. +//! 2. Falls back to `"clickhouse"` — resolved from `$PATH` at runtime. //! -//! ## Scan API Evaluation for ClickHouse Integration -//! -//! Per @gatesn's request in Discussion #6425, we evaluated whether the Vortex Scan API -//! (`vortex-scan/src/api.rs`) can support a good ClickHouse integration. -//! -//! ### Mapping -//! -//! The Scan API's four-layer abstraction maps naturally to ClickHouse: -//! -//! | Scan API | ClickHouse mapping | -//! |---|---| -//! | `DataSource` | Table metadata + connection config (`Send + Sync`, shareable) | -//! 
| `ScanRequest.projection` | `SELECT` column/expression pushdown (needs `Expression` → SQL converter) | -//! | `ScanRequest.filter` | `WHERE` clause pushdown (similar to `vortex-datafusion/convert/exprs.rs`) | -//! | `ScanRequest.limit` | `LIMIT N` pushdown (trivial) | -//! | `DataSourceScan` | Query planning + partition discovery (`system.parts` or file-level) | -//! | `Split` | Per-partition query execution unit | -//! | `Split::execute()` | Executes partition query, streams results as `SendableArrayStream` | -//! -//! ### Potential API Gaps -//! -//! 1. **No engine capability negotiation** — `DataSource` cannot declare which expression types -//! it supports for pushdown. Suggest adding `capabilities()` method. -//! 2. **`Split::execute()` is sync** — ClickHouse queries are inherently async (network I/O). -//! The pattern used by `LayoutReaderDataSource` (pre-compute `BoxFuture` in `scan()`) works -//! but should be documented as the recommended approach. -//! 3. **No column statistics API** — only `row_count_estimate()` exists. ClickHouse has rich -//! column stats (min/max/NDV) that could enable better query planning. -//! 4. **No transaction/snapshot semantics** — could lead to inconsistent reads across splits -//! on ClickHouse replicas. -//! -//! ### Conclusion -//! -//! The Scan API is a reasonable fit. None of the gaps are blockers. The recommended integration -//! order is: -//! 1. This PR: ClickBench baseline with `clickhouse-local` CLI (performance reference) -//! 2. `vortex-clickhouse` crate with type conversion (DType ↔ ClickHouse types) -//! 3. `ClickHouseDataSource` implementing `DataSource` trait (basic scan, no pushdown) -//! 4. Filter pushdown (`Expression` → ClickHouse WHERE clause) -//! 5. Projection pushdown and performance optimization +//! For local runs, install ClickHouse manually (e.g., `brew install clickhouse` +//! or download from ). +//! In CI, it is installed by the workflow before the benchmark step. 
use std::io::Write; use std::path::PathBuf; @@ -65,6 +28,9 @@ use vortex_bench::Benchmark; use vortex_bench::Format; /// Path to the ClickHouse binary, set by build.rs at compile time. +/// +/// This is either the value of the `CLICKHOUSE_BINARY` env var at build time, +/// or `"clickhouse"` (resolved from `$PATH` at runtime). const CLICKHOUSE_BINARY: &str = env!("CLICKHOUSE_BINARY"); /// A client that wraps `clickhouse-local` for running SQL benchmarks. @@ -77,18 +43,19 @@ pub struct ClickHouseClient { impl ClickHouseClient { /// Create a new client. Only Parquet format is supported. + /// + /// The ClickHouse binary is resolved from (in order): + /// 1. `CLICKHOUSE_BINARY` env var at build time + /// 2. `"clickhouse"` on `$PATH` pub fn new(benchmark: &dyn Benchmark, format: Format) -> Result { if format != Format::Parquet { anyhow::bail!("clickhouse-bench only supports Parquet format, got {format}"); } let binary = PathBuf::from(CLICKHOUSE_BINARY); - anyhow::ensure!( - binary.exists(), - "ClickHouse binary not found at '{}'. \ - This should have been downloaded by build.rs. Try `cargo clean -p clickhouse-bench`.", - binary.display() - ); + + // Verify the binary is usable (either absolute path exists, or resolvable via PATH). + Self::verify_binary(&binary)?; tracing::info!(binary = %binary.display(), "Using clickhouse-local"); @@ -100,6 +67,44 @@ impl ClickHouseClient { Ok(client) } + /// Check that the ClickHouse binary is available. + /// + /// For absolute paths, checks that the file exists on disk. + /// For bare names (e.g., `"clickhouse"`), tries to resolve via `$PATH` using `which`. + fn verify_binary(binary: &PathBuf) -> Result<()> { + if binary.is_absolute() { + anyhow::ensure!( + binary.exists(), + "ClickHouse binary not found at '{path}'. \ + Set CLICKHOUSE_BINARY env var to the correct path, or install ClickHouse \ + and ensure it is on $PATH.", + path = binary.display() + ); + } else { + // Try to find the binary on $PATH via `which`. 
+ let output = Command::new("which") + .arg(binary.as_os_str()) + .output() + .context("Failed to run `which` to locate clickhouse binary")?; + + anyhow::ensure!( + output.status.success(), + "ClickHouse binary '{name}' not found on $PATH. \ + Install ClickHouse (https://clickhouse.com/docs/en/install) or set \ + CLICKHOUSE_BINARY env var to an absolute path before building.", + name = binary.display() + ); + + let resolved = String::from_utf8_lossy(&output.stdout); + tracing::debug!( + resolved = resolved.trim(), + "Resolved clickhouse binary from PATH" + ); + } + + Ok(()) + } + /// Generate `CREATE VIEW ... AS SELECT * FROM file(...)` statements. /// /// We use a VIEW over the `file()` table function rather than `CREATE TABLE ... ENGINE = File()` @@ -172,8 +177,7 @@ impl ClickHouseClient { let time_instant = Instant::now(); - // The downloaded binary is the multi-tool `clickhouse` binary, - // so we always invoke it as `clickhouse local`. + // The `clickhouse` binary is a multi-tool; invoke it as `clickhouse local`. 
let mut child = Command::new(&self.binary) .args(["local", "--format", "TabSeparated"]) .stdin(Stdio::piped()) From 0e64800793df257bec45a94cd6c0de799d318385 Mon Sep 17 00:00:00 2001 From: fastio Date: Wed, 4 Mar 2026 10:38:59 +0800 Subject: [PATCH 4/4] bench(clickhouse): fix review issues and pin LTS version - Pass subcommand arg to clickhouse-bench in run-sql-bench.sh for consistency - Use BenchmarkArg + create_benchmark() in main.rs like other engines - Replace `which` with `clickhouse local --version` for binary verification - Pin ClickHouse to LTS release v25.8.18.1 from GitHub Releases --- .github/scripts/run-sql-bench.sh | 2 +- .github/workflows/sql-benchmarks.yml | 9 +++-- benchmarks/clickhouse-bench/src/lib.rs | 45 +++++++++++++------------ benchmarks/clickhouse-bench/src/main.rs | 19 +++++------ 4 files changed, 40 insertions(+), 35 deletions(-) diff --git a/.github/scripts/run-sql-bench.sh b/.github/scripts/run-sql-bench.sh index 73d2a26d962..9fd91b0dd7f 100755 --- a/.github/scripts/run-sql-bench.sh +++ b/.github/scripts/run-sql-bench.sh @@ -132,7 +132,7 @@ fi # ClickHouse-bench only runs for local benchmarks (clickhouse-local reads local files). if ! 
$is_remote && [[ "$has_clickhouse" == "true" ]] && [[ -f "target/release_debug/clickhouse-bench" ]]; then # shellcheck disable=SC2086 - target/release_debug/clickhouse-bench \ + target/release_debug/clickhouse-bench "$subcommand" \ -d gh-json \ $opts \ -o ch-results.json diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index b79f6d2d19a..424d16066aa 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -129,10 +129,13 @@ jobs: - name: Install ClickHouse if: contains(matrix.targets, 'clickhouse:') + env: + CLICKHOUSE_VERSION: "25.8.18.1" run: | - curl https://clickhouse.com/ | sh - sudo ./clickhouse install - echo "CLICKHOUSE_BINARY=$(which clickhouse)" >> $GITHUB_ENV + wget -qO- "https://github.com/ClickHouse/ClickHouse/releases/download/v${CLICKHOUSE_VERSION}-lts/clickhouse-common-static-${CLICKHOUSE_VERSION}-amd64.tgz" | tar xz + cp clickhouse-common-static-${CLICKHOUSE_VERSION}/usr/bin/clickhouse . + chmod +x clickhouse + echo "CLICKHOUSE_BINARY=$PWD/clickhouse" >> $GITHUB_ENV - name: Build binaries shell: bash diff --git a/benchmarks/clickhouse-bench/src/lib.rs b/benchmarks/clickhouse-bench/src/lib.rs index 960675a251f..9327776578b 100644 --- a/benchmarks/clickhouse-bench/src/lib.rs +++ b/benchmarks/clickhouse-bench/src/lib.rs @@ -70,7 +70,7 @@ impl ClickHouseClient { /// Check that the ClickHouse binary is available. /// /// For absolute paths, checks that the file exists on disk. - /// For bare names (e.g., `"clickhouse"`), tries to resolve via `$PATH` using `which`. + /// For bare names (e.g., `"clickhouse"`), tries to invoke it to verify it's resolvable. fn verify_binary(binary: &PathBuf) -> Result<()> { if binary.is_absolute() { anyhow::ensure!( @@ -80,28 +80,31 @@ impl ClickHouseClient { and ensure it is on $PATH.", path = binary.display() ); - } else { - // Try to find the binary on $PATH via `which`. 
- let output = Command::new("which") - .arg(binary.as_os_str()) - .output() - .context("Failed to run `which` to locate clickhouse binary")?; - - anyhow::ensure!( - output.status.success(), - "ClickHouse binary '{name}' not found on $PATH. \ - Install ClickHouse (https://clickhouse.com/docs/en/install) or set \ - CLICKHOUSE_BINARY env var to an absolute path before building.", - name = binary.display() - ); - - let resolved = String::from_utf8_lossy(&output.stdout); - tracing::debug!( - resolved = resolved.trim(), - "Resolved clickhouse binary from PATH" - ); } + // Verify the binary is actually usable by running `clickhouse local --version`. + let output = Command::new(binary.as_os_str()) + .args(["local", "--version"]) + .output() + .with_context(|| { + format!( + "ClickHouse binary '{name}' not found on $PATH. \ + Install ClickHouse (https://clickhouse.com/docs/en/install) or set \ + CLICKHOUSE_BINARY env var to an absolute path before building.", + name = binary.display() + ) + })?; + + anyhow::ensure!( + output.status.success(), + "ClickHouse binary at '{name}' failed to run: {stderr}", + name = binary.display(), + stderr = String::from_utf8_lossy(&output.stderr) + ); + + let version = String::from_utf8_lossy(&output.stdout); + tracing::debug!(version = version.trim(), "Verified clickhouse binary"); + Ok(()) } diff --git a/benchmarks/clickhouse-bench/src/main.rs b/benchmarks/clickhouse-bench/src/main.rs index 03e8db27fd6..bd8e7a7666d 100644 --- a/benchmarks/clickhouse-bench/src/main.rs +++ b/benchmarks/clickhouse-bench/src/main.rs @@ -6,13 +6,12 @@ use std::path::PathBuf; use clap::Parser; use clickhouse_bench::ClickHouseClient; use tokio::runtime::Runtime; -use vortex_bench::Benchmark; +use vortex_bench::BenchmarkArg; use vortex_bench::Engine; use vortex_bench::Format; use vortex_bench::Opt; use vortex_bench::Opts; -use vortex_bench::clickbench::ClickBenchBenchmark; -use vortex_bench::clickbench::Flavor; +use vortex_bench::create_benchmark; use 
vortex_bench::create_output_writer; use vortex_bench::display::DisplayFormat; use vortex_bench::runner::SqlBenchmarkRunner; @@ -21,11 +20,14 @@ use vortex_bench::setup_logging_and_tracing; /// ClickHouse (clickhouse-local) benchmark runner. /// -/// Runs ClickBench queries against Parquet data using clickhouse-local as a performance baseline. +/// Runs queries against Parquet data using clickhouse-local as a performance baseline. /// This allows comparing ClickHouse's native Parquet reading performance against other engines /// (DuckDB, DataFusion) on the same hardware and dataset. #[derive(Parser)] struct Args { + #[arg(value_enum)] + benchmark: BenchmarkArg, + #[arg(short, long, default_value_t = 5)] iterations: usize, @@ -63,10 +65,7 @@ fn main() -> anyhow::Result<()> { setup_logging_and_tracing(args.verbose, args.tracing)?; - let flavor = opts.get_as::<Flavor>("flavor").unwrap_or_default(); - let remote_data_dir = opts.get_as::<PathBuf>("remote-data-dir"); - let benchmark = - ClickBenchBenchmark::new(flavor, None, remote_data_dir)?.with_engine(Engine::ClickHouse); + let benchmark = create_benchmark(args.benchmark, &opts)?; let filtered_queries = filter_queries( benchmark.queries()?, @@ -83,7 +82,7 @@ let formats = vec![Format::Parquet]; let mut runner = SqlBenchmarkRunner::new( - &benchmark, + benchmark.as_ref(), Engine::ClickHouse, formats, args.track_memory, @@ -93,7 +92,7 @@ runner.run_all( &filtered_queries, args.iterations, - |format| ClickHouseClient::new(&benchmark, format), + |format| ClickHouseClient::new(benchmark.as_ref(), format), |ctx, _query_idx, _format, query| ctx.execute_query(query), )?;