Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
f5dd884
wip
a10y Feb 25, 2026
a208d42
try again
a10y Feb 25, 2026
34b1fbc
save
a10y Feb 26, 2026
1e9c62e
more
a10y Feb 26, 2026
b98be83
kernel gen with patches
a10y Feb 26, 2026
478812c
pass patches
a10y Feb 26, 2026
fe81c52
pass patches arg to bitunpack kernel
a10y Feb 26, 2026
afd8657
save
a10y Feb 27, 2026
3911f71
fix
a10y Feb 27, 2026
8f6ecd1
inject the patches
a10y Feb 27, 2026
be0262c
specialize
a10y Feb 27, 2026
2d76ad2
device
a10y Feb 27, 2026
e9529e3
add device printf for debug
a10y Feb 27, 2026
96cef63
woops
a10y Feb 27, 2026
274950b
fixup
a10y Feb 27, 2026
45a9dd2
more
a10y Feb 27, 2026
691cf71
clang-format
a10y Feb 27, 2026
cd35e85
lints
a10y Feb 27, 2026
85caa85
remove prints
a10y Feb 27, 2026
33f78c7
some fixes
a10y Feb 27, 2026
36d1c53
stop off-by-one-maxxing
a10y Feb 27, 2026
e9c22ff
format
a10y Feb 27, 2026
7860d89
more formatxxing
a10y Feb 27, 2026
48df407
update test to force multiple chunks
a10y Feb 27, 2026
2ff4f93
transpose patches bench
a10y Feb 27, 2026
6e25c1f
ref
a10y Feb 27, 2026
bdef786
fix xfer
a10y Feb 27, 2026
8d965de
tweak
a10y Feb 27, 2026
2bac265
fix lane sizing
a10y Feb 27, 2026
e2a83b3
format
a10y Feb 27, 2026
541ae31
remove GPUPatches passing + add __syncwarps
a10y Mar 2, 2026
507ee94
less register pressure
0ax1 Mar 3, 2026
0d409c3
pass patches by ref
a10y Mar 3, 2026
8ca6943
struct -> class
a10y Mar 3, 2026
709b8e5
more
a10y Mar 3, 2026
39b96c7
fix
a10y Mar 3, 2026
2006eea
add more unit tests
a10y Mar 3, 2026
1a6cd1f
fix test
a10y Mar 3, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions vortex-cuda/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -88,3 +88,7 @@ harness = false
[[bench]]
name = "throughput_cuda"
harness = false

[[bench]]
name = "transpose_patches"
harness = false
76 changes: 76 additions & 0 deletions vortex-cuda/benches/transpose_patches.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![allow(clippy::unwrap_used)]
#![allow(clippy::cast_possible_truncation)]

use std::time::Duration;

use criterion::BenchmarkId;
use criterion::Criterion;
use criterion::Throughput;
use futures::executor::block_on;
use vortex::buffer::Buffer;
use vortex::buffer::buffer;
use vortex::session::VortexSession;
use vortex_array::IntoArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::dtype::PType;
use vortex_array::patches::Patches;
use vortex_array::validity::Validity;
use vortex_cuda::CudaSession;
use vortex_cuda::transpose_patches;
use vortex_cuda_macros::cuda_available;
use vortex_cuda_macros::cuda_not_available;
use vortex_error::VortexExpect;

/// Criterion benchmark for `transpose_patches` on device-resident patch data.
///
/// Builds a fixed on-device `Patches` fixture once, then measures the
/// transpose kernel end-to-end (launch + await) per iteration.
fn benchmark_transpose(c: &mut Criterion) {
    // Execution context backing all device copies and kernel launches below.
    let mut cuda_ctx = CudaSession::create_execution_ctx(&VortexSession::empty())
        .vortex_expect("failed to create execution context");

    // Fixture: 64Ki logical values with 1024 patches spread evenly across the
    // range (one patch every 64 positions).
    let patches = block_on(async {
        let host_indices: Buffer<u32> = (0..1024).map(|i| i * 64).collect();
        let host_values = buffer![-1.0f32; 1024];

        let dev_indices = cuda_ctx.copy_to_device(host_indices)?.await?;
        let dev_values = cuda_ctx.copy_to_device(host_values)?.await?;

        let indices_array =
            PrimitiveArray::from_buffer_handle(dev_indices, PType::U32, Validity::NonNullable)
                .into_array();
        let values_array =
            PrimitiveArray::from_buffer_handle(dev_values, PType::F32, Validity::NonNullable)
                .into_array();

        Patches::new(64 * 1024, 0, indices_array, values_array, None)
    })
    .unwrap();

    let mut group = c.benchmark_group("transpose");
    group.sample_size(100);
    group.measurement_time(Duration::from_secs(10));
    // Report throughput relative to the total bytes moved (indices + values).
    group.throughput(Throughput::Bytes(
        patches.indices().nbytes() + patches.values().nbytes(),
    ));

    group.bench_with_input(
        BenchmarkId::new("transpose_patches", 0),
        &patches,
        |b, patches| {
            b.iter(|| block_on(async { transpose_patches(patches, &mut cuda_ctx).await.unwrap() }))
        },
    );
}

// Register the benchmark with Criterion's harness.
criterion::criterion_group!(benches, benchmark_transpose);

// With CUDA present, emit the real Criterion `main` that runs the group.
#[cuda_available]
criterion::criterion_main!(benches);

// Without CUDA, emit a no-op `main` so the bench target still compiles and
// exits cleanly instead of failing at runtime on a missing device.
#[cuda_not_available]
fn main() {}
18 changes: 18 additions & 0 deletions vortex-cuda/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ fn main() {

let out_dir = PathBuf::from(env::var("OUT_DIR").expect("OUT_DIR not set"));
generate_dynamic_dispatch_bindings(&kernels_src, &out_dir);
generate_patches_bindings(&kernels_src, &out_dir);

if !is_cuda_available() {
return;
Expand Down Expand Up @@ -202,6 +203,23 @@ fn generate_dynamic_dispatch_bindings(kernels_src: &Path, out_dir: &Path) {
.expect("Failed to write dynamic_dispatch.rs");
}

/// Generate Rust bindings for the shared `patches.h` header.
///
/// The header is read from `kernels_src` and the generated bindings are
/// written to `$OUT_DIR/patches.rs`. Panics (failing the build) if bindgen
/// cannot parse the header or the output file cannot be written.
fn generate_patches_bindings(kernels_src: &Path, out_dir: &Path) {
    let header = kernels_src.join("patches.h");
    // Re-run the build script whenever the header changes so bindings never go stale.
    println!("cargo:rerun-if-changed={}", header.display());

    let bindings = bindgen::Builder::default()
        .header(header.to_string_lossy())
        .derive_copy(true)
        .derive_debug(true)
        .generate()
        // Fixed copy-paste error: this message previously said "dynamic_dispatch",
        // which would misattribute a patches.h failure to the other generator.
        .expect("Failed to generate patches bindings");

    bindings
        .write_to_file(out_dir.join("patches.rs"))
        .expect("Failed to write patches.rs");
}

/// Check if CUDA is available based on nvcc.
fn is_cuda_available() -> bool {
Command::new("nvcc")
Expand Down
31 changes: 23 additions & 8 deletions vortex-cuda/cuda_kernel_generator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");

let local_func_params = format!(
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx)"
"(const uint{bits}_t *__restrict in, uint{bits}_t *__restrict out, uint{bits}_t reference, int thread_idx, GPUPatches& patches)"
);

writeln!(output, "__device__ void _{func_name}{local_func_params} {{")?;
Expand All @@ -141,12 +141,22 @@ fn generate_device_kernel_for_width<T: FastLanes, W: Write>(
writeln!(output, "_bit_unpack_{bits}_{bit_width}bw_lane(in, shared_out, reference, thread_idx * {per_thread_loop_count} + {thread_lane});")?;
}

writeln!(output, "for (int i = 0; i < {shared_copy_ncount}; i++) {{")?;
output.indent(|output| {
writeln!(output, "auto idx = i * {thread_count} + thread_idx;")?;
writeln!(output, "out[idx] = shared_out[idx];")
})?;
writeln!(output, "}}")
writeln!(output, "__syncwarp();")?;
writeln!(output, "PatchesCursor<uint{bits}_t> cursor(patches, blockIdx.x, thread_idx, {thread_count});")?;
writeln!(output, "auto patch = cursor.next();")?;
writeln!(output, "for (int i = 0; i < {shared_copy_ncount}; i++) {{")?;
output.indent(|output| {
writeln!(output, "auto idx = i * {thread_count} + thread_idx;")?;
writeln!(output, "if (idx == patch.index) {{")?;
writeln!(output, " out[idx] = patch.value;")?;
writeln!(output, " patch = cursor.next();")?;
writeln!(output, "}} else {{")?;
writeln!(output, " out[idx] = shared_out[idx];")?;
writeln!(output, "}}")
})?;
writeln!(output, "}}")
})
})?;

writeln!(output, "}}")
Expand All @@ -161,7 +171,7 @@ fn generate_global_kernel_for_width<T: FastLanes, W: Write>(

let func_name = format!("bit_unpack_{bits}_{bit_width}bw_{thread_count}t");
let func_params = format!(
"(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference)"
"(const uint{bits}_t *__restrict full_in, uint{bits}_t *__restrict full_out, uint{bits}_t reference, GPUPatches patches)"
);

writeln!(
Expand All @@ -170,14 +180,18 @@ fn generate_global_kernel_for_width<T: FastLanes, W: Write>(
)?;

output.indent(|output| {
// Create a new set of patches
writeln!(output, "int thread_idx = threadIdx.x;")?;
writeln!(
output,
"auto in = full_in + (blockIdx.x * (128 * {bit_width} / sizeof(uint{bits}_t)));"
)?;
writeln!(output, "auto out = full_out + (blockIdx.x * 1024);")?;

writeln!(output, "_{func_name}(in, out, reference, thread_idx);")
writeln!(
output,
"_{func_name}(in, out, reference, thread_idx, patches);"
)
})?;

writeln!(output, "}}")
Expand All @@ -195,6 +209,7 @@ pub fn generate_cuda_unpack_for_width<T: FastLanes, W: Write>(
writeln!(output, "#include <cuda_runtime.h>")?;
writeln!(output, "#include <stdint.h>")?;
writeln!(output, "#include \"fastlanes_common.cuh\"")?;
writeln!(output, "#include \"patches.cuh\"")?;
writeln!(output)?;

// First, emit all lane decoders.
Expand Down
1 change: 1 addition & 0 deletions vortex-cuda/kernels/src/bit_unpack.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "bit_unpack_16.cu"
#include "bit_unpack_32.cu"
#include "bit_unpack_64.cu"
#include "patches.h"

/// Decodes a single lane of packed data.
///
Expand Down
Loading
Loading