Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 16 additions & 4 deletions .github/scripts/run-sql-bench.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,21 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright the Vortex contributors
#
# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets.
# This script is used by the sql-benchmarks.yml workflow.
# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench, clickhouse-bench)
# for the given targets. This script is used by the sql-benchmarks.yml workflow.
#
# Usage:
# run-sql-bench.sh <subcommand> <targets> [options]
#
# Arguments:
# subcommand The benchmark subcommand (e.g., tpch, clickbench, tpcds)
# targets Comma-separated list of engine:format pairs
# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet")
# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet,clickhouse:parquet")
#
# Options:
# --scale-factor <sf> Scale factor for the benchmark (e.g., 1.0, 10.0)
# --remote-storage <url> Remote storage URL (e.g., s3://bucket/path/)
# If provided, runs in remote mode (no lance support).
# If provided, runs in remote mode (no lance/clickhouse support).
# --benchmark-id <id> Benchmark ID for error messages (e.g., tpch-s3)

set -Eeu -o pipefail
Expand Down Expand Up @@ -78,6 +78,7 @@ fi
df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//')
ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//')
has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false")
# `$targets` is a single comma-separated line, so a bare `^clickhouse:` anchor would only
# match when ClickHouse is the FIRST target. Accept start-of-line or a preceding comma.
has_clickhouse=$(echo "$targets" | grep -qE '(^|,)clickhouse:' && echo "true" || echo "false")

# Build options string.
opts=""
Expand Down Expand Up @@ -127,3 +128,14 @@ if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/l

cat lance-results.json >> results.json
fi

# ClickHouse-bench only runs for local benchmarks (clickhouse-local reads local files).
# The run is skipped (without an error) unless all three conditions hold: the benchmark is
# not remote, a clickhouse target was requested, and the clickhouse-bench binary was built.
if ! $is_remote && [[ "$has_clickhouse" == "true" ]] && [[ -f "target/release_debug/clickhouse-bench" ]]; then
# $opts is expanded unquoted on purpose so that it word-splits into separate arguments.
# shellcheck disable=SC2086
target/release_debug/clickhouse-bench "$subcommand" \
-d gh-json \
$opts \
-o ch-results.json

# Append the ClickHouse results to the combined results file.
cat ch-results.json >> results.json
fi
2 changes: 1 addition & 1 deletion .github/workflows/bench.yml
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ jobs:
"id": "clickbench-nvme",
"subcommand": "clickbench",
"name": "Clickbench on NVME",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb,clickhouse:parquet",
"build_lance": true
},
{
Expand Down
15 changes: 14 additions & 1 deletion .github/workflows/sql-benchmarks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ on:
"id": "clickbench-nvme",
"subcommand": "clickbench",
"name": "Clickbench on NVME",
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb"
"targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb,clickhouse:parquet"
},
{
"id": "tpch-nvme",
Expand Down Expand Up @@ -127,6 +127,16 @@ jobs:

- uses: ./.github/actions/system-info

# Installs a pinned ClickHouse release, but only for matrix entries whose targets include a
# `clickhouse:` engine. The extracted binary is copied into the working directory and its
# absolute path is exported as CLICKHOUSE_BINARY through $GITHUB_ENV, so the later build
# step sees it (clickhouse-bench's build.rs reads that variable at compile time).
- name: Install ClickHouse
if: contains(matrix.targets, 'clickhouse:')
env:
CLICKHOUSE_VERSION: "25.8.18.1"
run: |
wget -qO- "https://github.com/ClickHouse/ClickHouse/releases/download/v${CLICKHOUSE_VERSION}-lts/clickhouse-common-static-${CLICKHOUSE_VERSION}-amd64.tgz" | tar xz
cp clickhouse-common-static-${CLICKHOUSE_VERSION}/usr/bin/clickhouse .
chmod +x clickhouse
echo "CLICKHOUSE_BINARY=$PWD/clickhouse" >> $GITHUB_ENV

- name: Build binaries
shell: bash
env:
Expand All @@ -136,6 +146,9 @@ jobs:
if [ "${{ matrix.build_lance }}" = "true" ]; then
packages="$packages --bin lance-bench"
fi
if echo "${{ matrix.targets }}" | grep -q 'clickhouse:'; then
packages="$packages --bin clickhouse-bench"
fi
cargo build $packages --profile release_debug

- name: Generate data
Expand Down
11 changes: 11 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ members = [
"encodings/zstd",
"encodings/bytebool",
# Benchmarks
"benchmarks/clickhouse-bench",
"benchmarks/lance-bench",
"benchmarks/compress-bench",
"benchmarks/datafusion-bench",
Expand Down
22 changes: 22 additions & 0 deletions benchmarks/clickhouse-bench/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[package]
name = "clickhouse-bench"
description = "ClickHouse (clickhouse-local) benchmark runner for Vortex"
authors.workspace = true
edition.workspace = true
homepage.workspace = true
license.workspace = true
readme.workspace = true
repository.workspace = true
rust-version.workspace = true
version.workspace = true
publish = false

[dependencies]
anyhow = { workspace = true }
clap = { workspace = true, features = ["derive"] }
tokio = { workspace = true, features = ["full"] }
tracing = { workspace = true }
vortex-bench = { workspace = true }

[lints]
workspace = true
18 changes: 18 additions & 0 deletions benchmarks/clickhouse-bench/build.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Build script that exports the ClickHouse binary path.
//!
//! Resolution order:
//! 1. `CLICKHOUSE_BINARY` env var — use as-is.
//! 2. Falls back to `"clickhouse"` (i.e., resolve from `$PATH` at runtime).
//!
//! Users must install ClickHouse themselves for local runs.
//! In CI, it is installed via the workflow before the benchmark step.

/// Emits the Cargo directives that bake the ClickHouse binary path into the crate.
///
/// The value of `CLICKHOUSE_BINARY` at build time is forwarded unchanged; when the variable
/// cannot be read, the bare name `clickhouse` is used instead.
fn main() {
    // Re-run this script whenever the override changes so the baked-in value stays current.
    println!("cargo:rerun-if-env-changed=CLICKHOUSE_BINARY");

    let binary = match std::env::var("CLICKHOUSE_BINARY") {
        Ok(path) => path,
        Err(_) => String::from("clickhouse"),
    };

    // Exposed to the crate as `env!("CLICKHOUSE_BINARY")`.
    println!("cargo:rustc-env=CLICKHOUSE_BINARY={binary}");
}
223 changes: 223 additions & 0 deletions benchmarks/clickhouse-bench/src/lib.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! ClickHouse Local context for benchmarks.
//!
//! Uses `clickhouse-local` via `std::process::Command` to execute SQL queries
//! against Parquet files on disk.
//!
//! The ClickHouse binary is resolved at build time via `build.rs`:
//! 1. `CLICKHOUSE_BINARY` env var — use the specified path.
//! 2. Falls back to `"clickhouse"` — resolved from `$PATH` at runtime.
//!
//! For local runs, install ClickHouse manually (e.g., `brew install clickhouse`
//! or download from <https://clickhouse.com/docs/en/install>).
//! In CI, it is installed by the workflow before the benchmark step.

use std::io::Write;
use std::path::PathBuf;
use std::process::Command;
use std::process::Stdio;
use std::time::Duration;
use std::time::Instant;

use anyhow::Context;
use anyhow::Result;
use tracing::trace;
use vortex_bench::Benchmark;
use vortex_bench::Format;

/// Path (or bare command name) of the ClickHouse binary, fixed at compile time.
///
/// `build.rs` exports this through `cargo:rustc-env`. It is the value of the
/// `CLICKHOUSE_BINARY` env var as seen during the build, or `"clickhouse"` when that
/// variable could not be read, in which case the name is resolved from `$PATH` when
/// the process is spawned.
const CLICKHOUSE_BINARY: &str = env!("CLICKHOUSE_BINARY");

/// A client that wraps `clickhouse-local` for running SQL benchmarks.
///
/// Every query is run in a fresh `clickhouse local` process, so the client itself only
/// holds the binary location and the SQL needed to recreate the benchmark tables.
#[derive(Debug)]
pub struct ClickHouseClient {
    /// The path to the `clickhouse` binary.
    binary: PathBuf,
    /// SQL statements to run before each query (CREATE VIEW statements).
    setup_sql: Vec<String>,
}

impl ClickHouseClient {
/// Build a client for `benchmark`, preparing the view definitions for its tables.
///
/// Only [`Format::Parquet`] is accepted.
///
/// The binary location comes from the compile-time `CLICKHOUSE_BINARY` constant:
/// 1. the `CLICKHOUSE_BINARY` env var at build time, or
/// 2. `"clickhouse"`, resolved on `$PATH`.
///
/// # Errors
///
/// Fails if `format` is not Parquet, if the binary cannot be verified, or if the
/// benchmark tables cannot be registered.
pub fn new(benchmark: &dyn Benchmark, format: Format) -> Result<Self> {
    anyhow::ensure!(
        format == Format::Parquet,
        "clickhouse-bench only supports Parquet format, got {format}"
    );

    // Fail fast if the binary is missing or broken, before doing any other work.
    let binary_path = PathBuf::from(CLICKHOUSE_BINARY);
    Self::verify_binary(&binary_path)?;
    tracing::info!(binary = %binary_path.display(), "Using clickhouse-local");

    let mut this = Self {
        binary: binary_path,
        setup_sql: Vec::new(),
    };
    this.register_tables(benchmark, format)?;

    Ok(this)
}

/// Check that the ClickHouse binary is available and runnable.
///
/// For absolute paths, first checks that the file exists on disk so the error can name
/// the missing path. In every case (absolute path or bare name such as `"clickhouse"`)
/// the binary is then invoked as `clickhouse local --version` to prove it can be
/// spawned and exits successfully.
///
/// Takes `&Path` rather than `&PathBuf`; existing `&PathBuf` call sites still work
/// through deref coercion.
///
/// # Errors
///
/// Fails if an absolute path does not exist, if the process cannot be spawned, or if
/// the version command exits with a non-zero status (its stderr is included).
fn verify_binary(binary: &std::path::Path) -> Result<()> {
    if binary.is_absolute() {
        anyhow::ensure!(
            binary.exists(),
            "ClickHouse binary not found at '{path}'. \
             Set CLICKHOUSE_BINARY env var to the correct path, or install ClickHouse \
             and ensure it is on $PATH.",
            path = binary.display()
        );
    }

    // Verify the binary is actually usable by running `clickhouse local --version`.
    let output = Command::new(binary)
        .args(["local", "--version"])
        .output()
        .with_context(|| {
            format!(
                "ClickHouse binary '{name}' not found on $PATH. \
                 Install ClickHouse (https://clickhouse.com/docs/en/install) or set \
                 CLICKHOUSE_BINARY env var to an absolute path before building.",
                name = binary.display()
            )
        })?;

    anyhow::ensure!(
        output.status.success(),
        "ClickHouse binary at '{name}' failed to run: {stderr}",
        name = binary.display(),
        stderr = String::from_utf8_lossy(&output.stderr)
    );

    let version = String::from_utf8_lossy(&output.stdout);
    tracing::debug!(version = version.trim(), "Verified clickhouse binary");

    Ok(())
}

/// Generate `CREATE VIEW ... AS SELECT * FROM file(...)` statements.
///
/// We use a VIEW over the `file()` table function rather than `CREATE TABLE ... ENGINE = File()`
/// because the `file()` function handles glob patterns (e.g., `*.parquet`) more reliably across
/// ClickHouse versions.
///
/// One statement per table spec is appended to `self.setup_sql`. The data path is escaped
/// before being embedded in the SQL string literal, so directories containing `'` or `\`
/// still produce valid SQL.
///
/// # Errors
///
/// Fails if the benchmark's data URL is not a valid `file://` URL, or if the
/// per-format data directory does not exist.
fn register_tables(&mut self, benchmark: &dyn Benchmark, format: Format) -> Result<()> {
    let data_url = benchmark.data_url();
    let base_dir = if data_url.scheme() == "file" {
        data_url
            .to_file_path()
            .map_err(|_| anyhow::anyhow!("Invalid file URL: {data_url}"))?
    } else {
        anyhow::bail!("clickhouse-bench only supports local file:// data URLs");
    };

    let format_dir = base_dir.join(format.name());
    if !format_dir.exists() {
        anyhow::bail!(
            "Data directory does not exist: {}. Run data generation first.",
            format_dir.display()
        );
    }

    for table_spec in benchmark.table_specs() {
        let name = table_spec.name;
        // Fall back to "every file with this format's extension" when the benchmark
        // does not define a table-specific pattern.
        let pattern = benchmark
            .pattern(name, format)
            .map(|p| p.to_string())
            .unwrap_or_else(|| format!("*.{}", format.ext()));

        let data_path = format!("{}/{}", format_dir.display(), pattern);

        tracing::info!(
            table = name,
            path = %data_path,
            "Registering ClickHouse table"
        );

        // Escape for a single-quoted ClickHouse string literal. Backslashes must be
        // doubled first so the backslash added for `'` is not itself re-escaped.
        let escaped_path = data_path.replace('\\', "\\\\").replace('\'', "\\'");

        let create_sql = format!(
            "CREATE VIEW IF NOT EXISTS {name} AS \
             SELECT * FROM file('{escaped_path}', Parquet);"
        );
        self.setup_sql.push(create_sql);
    }

    Ok(())
}

/// Execute a SQL query via `clickhouse-local`, returning `(row_count, timing)`.
///
/// The approach:
/// 1. Prepend all CREATE VIEW statements
/// 2. Append the benchmark query (adding a trailing `;` if it has none)
/// 3. Pipe the combined SQL into `clickhouse local` via stdin
/// 4. Count the non-empty lines of the TabSeparated stdout as result rows
///
/// The reported duration is wall-clock time from just before the process is spawned
/// until it has exited, so it includes process start-up and view creation.
///
/// # Errors
///
/// Fails if the process cannot be spawned or waited on, if it exits unsuccessfully
/// (its stderr is included in the error), or if the SQL could not be written to its
/// stdin.
pub fn execute_query(&self, query: &str) -> Result<(usize, Option<Duration>)> {
    trace!("execute clickhouse query: {query}");

    // Build the full SQL: setup views + the actual query.
    let setup_len: usize = self.setup_sql.iter().map(|stmt| stmt.len() + 1).sum();
    let mut full_sql = String::with_capacity(setup_len + query.len() + 1);
    for stmt in &self.setup_sql {
        full_sql.push_str(stmt);
        full_sql.push('\n');
    }
    full_sql.push_str(query);
    // Ensure we have a trailing semicolon.
    if !query.trim_end().ends_with(';') {
        full_sql.push(';');
    }

    let time_instant = Instant::now();

    // The `clickhouse` binary is a multi-tool; invoke it as `clickhouse local`.
    let mut child = Command::new(&self.binary)
        .args(["local", "--format", "TabSeparated"])
        .stdin(Stdio::piped())
        .stdout(Stdio::piped())
        .stderr(Stdio::piped())
        .spawn()
        .context("Failed to spawn clickhouse-local")?;

    // Write the SQL, but do not return on failure yet: the child must still be reaped,
    // and if it died early its stderr explains the failure better than a broken-pipe
    // error would. Taking the handle drops it at the end of the closure, closing the
    // pipe so the child sees EOF.
    let write_result = child
        .stdin
        .take()
        .context("Failed to open clickhouse-local stdin")
        .and_then(|mut stdin| {
            stdin
                .write_all(full_sql.as_bytes())
                .context("Failed to write SQL to clickhouse-local stdin")
        });

    let output = child
        .wait_with_output()
        .context("Failed to wait for clickhouse-local")?;

    let query_time = time_instant.elapsed();

    if !output.status.success() {
        let stderr = String::from_utf8_lossy(&output.stderr);
        anyhow::bail!(
            "clickhouse-local failed (exit {}): {stderr}",
            output.status.code().unwrap_or(-1)
        );
    }

    // The process succeeded; surface a write failure only now, so it never masks the
    // child's own diagnostics.
    write_result?;

    // Count non-empty lines in stdout as row count.
    let stdout = String::from_utf8_lossy(&output.stdout);
    let row_count = stdout.lines().filter(|line| !line.is_empty()).count();

    Ok((row_count, Some(query_time)))
}
}
Loading