Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
f5dd884
wip
a10y Feb 25, 2026
a208d42
try again
a10y Feb 25, 2026
34b1fbc
save
a10y Feb 26, 2026
1e9c62e
more
a10y Feb 26, 2026
b98be83
kernel gen with patches
a10y Feb 26, 2026
478812c
pass patches
a10y Feb 26, 2026
fe81c52
pass patches arg to bitunpack kernel
a10y Feb 26, 2026
afd8657
save
a10y Feb 27, 2026
3911f71
fix
a10y Feb 27, 2026
8f6ecd1
inject the patches
a10y Feb 27, 2026
be0262c
specialize
a10y Feb 27, 2026
2d76ad2
device
a10y Feb 27, 2026
e9529e3
add device printf for debug
a10y Feb 27, 2026
96cef63
woops
a10y Feb 27, 2026
274950b
fixup
a10y Feb 27, 2026
45a9dd2
more
a10y Feb 27, 2026
691cf71
clang-format
a10y Feb 27, 2026
cd35e85
lints
a10y Feb 27, 2026
85caa85
remove prints
a10y Feb 27, 2026
33f78c7
some fixes
a10y Feb 27, 2026
36d1c53
stop off-by-one-maxxing
a10y Feb 27, 2026
e9c22ff
format
a10y Feb 27, 2026
7860d89
more formatxxing
a10y Feb 27, 2026
48df407
update test to force multiple chunks
a10y Feb 27, 2026
2ff4f93
transpose patches bench
a10y Feb 27, 2026
6e25c1f
ref
a10y Feb 27, 2026
bdef786
fix xfer
a10y Feb 27, 2026
8d965de
tweak
a10y Feb 27, 2026
2bac265
fix lane sizing
a10y Feb 27, 2026
e2a83b3
format
a10y Feb 27, 2026
541ae31
remove GPUPatches passing + add __syncwarps
a10y Mar 2, 2026
507ee94
less register pressure
0ax1 Mar 3, 2026
0d409c3
pass patches by ref
a10y Mar 3, 2026
8ca6943
struct -> class
a10y Mar 3, 2026
709b8e5
more
a10y Mar 3, 2026
39b96c7
fix
a10y Mar 3, 2026
2006eea
add more unit tests
a10y Mar 3, 2026
1a6cd1f
fix test
a10y Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions vortex-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,7 @@ harness = false
[[bench]]
name = "throughput_cuda"
harness = false

[[bench]]
name = "transpose_patches"
harness = false
76 changes: 76 additions & 0 deletions vortex-cuda/benches/transpose_patches.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use futures::executor::block_on;
use vortex::buffer::Buffer;
use vortex::buffer::buffer;
use vortex::session::VortexSession;
use vortex_array::IntoArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::dtype::PType;
use vortex_array::patches::Patches;
use vortex_array::validity::Validity;
use vortex_cuda::CudaSession;
use vortex_cuda::transpose_patches;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_error::VortexExpect;

/// Criterion benchmark for `transpose_patches` on device-resident patch data.
///
/// Builds a fixed on-device `Patches` fixture once, then measures the
/// transpose kernel end-to-end (launch + await) per iteration.
fn benchmark_transpose(c: &mut Criterion) {
    // Execution context backing all device copies and kernel launches below.
    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
        .vortex_expect("failed to create execution context");

    // Fixture: 64Ki logical values with 1024 patches spread evenly across the
    // range (one patch every 64 positions).
    let patches = block_on(async {
        let host_indices: Buffer<u32> = (0..1024).map(|i| i * 64).collect();
        let host_values = buffer![-1.0f32; 1024];

        let dev_indices = cuda_ctx.copy_to_device(host_indices)?.await?;
        let dev_values = cuda_ctx.copy_to_device(host_values)?.await?;

        let indices_array =
            PrimitiveArray::from_buffer_handle(dev_indices, PType::U32, Validity::NonNullable)
                .into_array();
        let values_array =
            PrimitiveArray::from_buffer_handle(dev_values, PType::F32, Validity::NonNullable)
                .into_array();

        Patches::new(64 * 1024, 0, indices_array, values_array, None)
    })
    .unwrap();

    let mut group = c.benchmark_group("transpose");
    group.sample_size(100);
    group.measurement_time(Duration::from_secs(10));
    // Report throughput relative to the total bytes moved (indices + values).
    group.throughput(Throughput::Bytes(
        patches.indices().nbytes() + patches.values().nbytes(),
    ));

    group.bench_with_input(
        BenchmarkId::new("transpose_patches", 0),
        &patches,
        |b, patches| {
            b.iter(|| block_on(async { transpose_patches(patches, &mut cuda_ctx).await.unwrap() }))
        },
    );
}

// Register the benchmark with Criterion's harness.
criterion::criterion_group!(benches, benchmark_transpose);

// With CUDA present, emit the real Criterion `main` that runs the group.
#[cuda_available]
criterion::criterion_main!(benches);

// Without CUDA, emit a no-op `main` so the bench target still compiles and
// exits cleanly instead of failing at runtime on a missing device.
#[cuda_not_available]
fn main() {}
18 changes: 18 additions & 0 deletions vortex-cuda/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ fn main() {

let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set"));
generate_dynamic_dispatch_bindings(&kernels_src, &out_dir);
generate_patches_bindings(&kernels_src, &out_dir);

if !is_cuda_available() {
return;
Expand Down Expand Up @@ -202,6 +203,23 @@ fn generate_dynamic_dispatch_bindings(kernels_src: &Path, out_dir: &Path) {
.expect("Failed to write dynamic_dispatch.rs");
}

/// Generate Rust bindings for the shared `patches.h` header.
///
/// The header is read from `kernels_src` and the generated bindings are
/// written to `$OUT_DIR/patches.rs`. Panics (failing the build) if bindgen
/// cannot parse the header or the output file cannot be written.
fn generate_patches_bindings(kernels_src: &Path, out_dir: &Path) {
    let header = kernels_src.join("patches.h");
    // Re-run the build script whenever the header changes so bindings never go stale.
    println!("cargo:rerun-if-changed={}", header.display());

    let bindings = bindgen::Builder::default()
        .header(header.to_string_lossy())
        .derive_copy(true)
        .derive_debug(true)
        .generate()
        // Fixed copy-paste error: this message previously said "dynamic_dispatch",
        // which would misattribute a patches.h failure to the other generator.
        .expect("Failed to generate patches bindings");

    bindings
        .write_to_file(out_dir.join("patches.rs"))
        .expect("Failed to write patches.rs");
}

/// Check if CUDA is available based on nvcc.
fn is_cuda_available() -> bool {
Command::new("nvcc")
Expand Down
31 changes: 23 additions & 8 deletions vortex-cuda/cuda_kernel_generator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");

let local_func_params = format!(
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx)"
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx, GPUPatches& patches)"
);

writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;
Expand All @@ -141,12 +141,22 @@ fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, shared_out, reference, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
}

writeln!(output, "for (int i = 0; i < {shared_copy_ncount}; i++) {{")?;
output.indent(|output| {
writeln!(output, "auto idx = i * {thread_count} + thread_idx;")?;
writeln!(output, "out[idx] = shared_out[idx];")
})?;
writeln!(output, "}}")
writeln!(output, "__syncwarp();")?;
writeln!(output, "PatchesCursor<uint{bits}_t> cursor(patches, blockIdx.x, thread_idx, {thread_count});")?;
writeln!(output, "auto patch = cursor.next();")?;
writeln!(output, "for (int i = 0; i < {shared_copy_ncount}; i++) {{")?;
output.indent(|output| {
writeln!(output, "auto idx = i * {thread_count} + thread_idx;")?;
writeln!(output, "if (idx == patch.index) {{")?;
writeln!(output, " out[idx] = patch.value;")?;
writeln!(output, " patch = cursor.next();")?;
writeln!(output, "}} else {{")?;
writeln!(output, " out[idx] = shared_out[idx];")?;
writeln!(output, "}}")
})?;
writeln!(output, "}}")
})
})?;

writeln!(output, "}}")
Expand All @@ -161,7 +171,7 @@ fn generate_global_kernel_for_width<T: FastLanes, W: Write>(

let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
let func_params = format!(
"(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference)"
"(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference, GPUPatches patches)"
);

writeln!(
Expand All @@ -170,14 +180,18 @@ fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
)?;

output.indent(|output| {
// Create a new set of patches
writeln!(output, "int thread_idx = threadIdx.x;")?;
writeln!(
output,
"auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t)));"
)?;
writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?;

writeln!(output, "_{func_name}(in, out, reference, thread_idx);")
writeln!(
output,
"_{func_name}(in, out, reference, thread_idx, patches);"
)
})?;

writeln!(output, "}}")
Expand All @@ -195,6 +209,7 @@ pub fn generate_cuda_unpack_for_width<T: FastLanes, W: Write>(
writeln!(output, "#include <cuda_runtime.h>")?;
writeln!(output, "#include <stdint.h>")?;
writeln!(output, "#include \"fastlanes_common.cuh\"")?;
writeln!(output, "#include \"patches.cuh\"")?;
writeln!(output)?;

// First, emit all lane decoders.
Expand Down
1 change: 1 addition & 0 deletions vortex-cuda/kernels/src/bit_unpack.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "bit_unpack_16.cu"
#include "bit_unpack_32.cu"
#include "bit_unpack_64.cu"
#include "patches.h"

/// Decodes a single lane of packed data.
///
Expand Down
Loading
Loading