txus
diff --git a/‎Makefile‎
Lines changed: 9 additions & 4 deletions b/‎Makefile‎
Lines changed: 9 additions & 4 deletions
diff --git a/‎include/tensor/ops.hpp‎
Lines changed: 13 additions & 1 deletion b/‎include/tensor/ops.hpp‎
Lines changed: 13 additions & 1 deletion
diff --git a/‎include/tensor/storage.hpp‎
Lines changed: 6 additions & 0 deletions b/‎include/tensor/storage.hpp‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎include/tensor/tensor.hpp‎
Lines changed: 36 additions & 73 deletions b/‎include/tensor/tensor.hpp‎
Lines changed: 36 additions & 73 deletions
diff --git a/‎src/llama/grouped_query_attention.cpp‎
Lines changed: 2 additions & 2 deletions b/‎src/llama/grouped_query_attention.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/llama/kv_cache.cpp‎
Lines changed: 2 additions & 2 deletions b/‎src/llama/kv_cache.cpp‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/llama/rope.cpp‎
Lines changed: 7 additions & 10 deletions b/‎src/llama/rope.cpp‎
Lines changed: 7 additions & 10 deletions
diff --git a/‎src/nn/CMakeLists.txt‎
Lines changed: 13 additions & 0 deletions b/‎src/nn/CMakeLists.txt‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎src/nn/cpu/softmax.cpp‎
Lines changed: 15 additions & 5 deletions b/‎src/nn/cpu/softmax.cpp‎
Lines changed: 15 additions & 5 deletions
@@ -46,10 +46,15 @@ tensor_cuda:
 	@cmake --build build --target test_tensor_cuda
 	@ctest --test-dir build -R "^TensorCUDA" --output-on-failure
 
-.PHONY: nn
-nn:
-	@cmake --build build --target test_nn
-	@ctest --test-dir build -R "^NN" --output-on-failure
+# Build the test_nn_cpu target, then run the ctest cases whose names start with "NNCPU".
+.PHONY: nn_cpu
+nn_cpu:
+	@cmake --build build --target test_nn_cpu
+	@ctest --test-dir build -R "^NNCPU" --output-on-failure
+
+# Build the test_nn_cuda target, then run the ctest cases whose names start with "NNCUDA".
+# NOTE(review): presumably requires a build tree configured with the CUDA backend enabled - confirm.
+.PHONY: nn_cuda
+nn_cuda:
+	@cmake --build build --target test_nn_cuda
+	@ctest --test-dir build -R "^NNCUDA" --output-on-failure
 
 .PHONY: llama
 llama:
 
@@ -66,6 +66,10 @@ template <typename T, typename D>
 Tensor<std::remove_const_t<T>, D> slice(const TensorView<T, D>& view, int dim, size_t start,
                                         size_t end);
 
+/// Free-function replacement for the former TensorView::repeat_interleave member.
+/// Returns a new tensor built from `view` with entries repeated `repeats` times along `dim`.
+/// NOTE(review): the removed member produced shape[dim] * repeats entries along `dim` and took
+/// `dim` as size_t; this overload takes `int` and its definition is not visible here - confirm
+/// whether negative `dim` values are supported.
+template <typename T, typename D>
+Tensor<std::remove_const_t<T>, D> repeat_interleave(const TensorView<T, D>& view, int dim,
+                                                    size_t repeats);
+
 template <typename T, typename D>
 Tensor<std::remove_const_t<T>, D> sum(const TensorView<T, D>& input, int dim, bool keepdim);
 template <typename T, typename D>
@@ -74,9 +78,17 @@ Tensor<std::remove_const_t<T>, D> max(const TensorView<T, D>& input, int dim, bo
 template <typename T, typename D>
 Tensor<int, D> argmax(const TensorView<T, D>& input, int dim, bool keepdim);
 
+// copies
+
+/// Converts a view with element type `TIn` into a new tensor with element type `TOut`.
+/// `TIn` is the first template parameter, so callers spell both types, e.g. `to<int, float>(v)`.
+/// NOTE(review): presumably a per-element static_cast, as the removed TensorView::to member
+/// did - the definition is not visible here.
+template <typename TIn, typename TOut, typename D>
+Tensor<TOut, D> to(const TensorView<TIn, D>& tensor);
+
+/// Returns a new tensor holding the contents of `tensor`; the element type drops any const
+/// qualifier, so the result is mutable even when copied from a `TensorView<const T, D>`.
+/// NOTE(review): memory layout of the result for non-contiguous views is not visible here.
+template <typename T, typename D>
+Tensor<std::remove_const_t<T>, D> copy(const TensorView<T, D>& tensor);
+
 // mutations
 
 template <typename T, typename D>
-void replace_from_(Tensor<T, D>& out, const TensorView<T, D>& input);
+void replace_from_(Tensor<T, D>& destination, const TensorView<T, D>& source);
 
 } // namespace tensor
@@ -132,6 +132,9 @@ template <typename T> class TensorStorage<T, CUDA> {
     return data_;
   }
 
+  T operator[](size_t idx);
+  const T operator[](size_t idx) const;
+
   void resize(size_t size);
   void fill(T value);
 };
@@ -169,6 +172,9 @@ template <typename T> class TensorStorage<const T, CUDA> {
     return data_;
   }
 
+  T operator[](size_t idx);
+  const T operator[](size_t idx) const;
+
   void resize(size_t size);
 };
 #endif
 
@@ -134,6 +134,28 @@ template <DType T, Device D> struct TensorView {
     return std::span<const T>(data, data_size);
   }
 
+#ifdef TENSOR_HAS_CUDA
+  /// Reads the element at linear offset `idx` of a CUDA-resident view and returns it by value.
+  ///
+  /// Every call performs one cudaMemcpy of sizeof(T) bytes from device to host, so this is
+  /// suited to printing and spot checks rather than loops over large tensors.
+  ///
+  /// @param idx linear offset into the underlying buffer (not a multi-dimensional index).
+  /// @throws std::out_of_range  if `idx` is negative or not below `data_size`.
+  /// @throws std::runtime_error if the device-to-host copy reports an error.
+  T operator[](int idx) const
+    requires std::same_as<D, device::CUDA>
+  {
+    // Valid offsets are [0, data_size); idx == data_size is already one past the end.
+    if (idx < 0 || static_cast<size_t>(idx) >= data_size) {
+      throw std::out_of_range("cannot index past the tensor size");
+    }
+    // T may be const-qualified (TensorView<const T, D>), so stage the copy in a mutable,
+    // value-initialised temporary that cudaMemcpy is allowed to write into.
+    std::remove_const_t<T> value{};
+    const cudaError_t status =
+        cudaMemcpy(&value, data + idx, sizeof(T), cudaMemcpyDeviceToHost);
+    if (status != cudaSuccess) {
+      // Never hand back a value that was not actually copied from the device.
+      throw std::runtime_error(cudaGetErrorString(status));
+    }
+    return value;
+  }
+#endif
+
+  /// Reads the element at linear offset `idx` of a CPU-resident view and returns it by value.
+  ///
+  /// @param idx linear offset into the underlying buffer (not a multi-dimensional index).
+  /// @throws std::out_of_range if `idx` is negative or not below `data_size`.
+  T operator[](int idx) const
+    requires std::same_as<D, device::CPU>
+  {
+    // Valid offsets are [0, data_size); idx == data_size is already one past the end.
+    if (idx < 0 || static_cast<size_t>(idx) >= data_size) {
+      throw std::out_of_range("cannot index past the tensor size");
+    }
+    return data[idx];
+  }
+
   [[nodiscard]] size_t total_elements() const {
     size_t out = 1;
     for (auto dim : shape) {
@@ -241,46 +263,6 @@ template <DType T, Device D> struct TensorView {
     transpose(0, 1);
   }
 
-  Tensor<std::remove_const_t<T>, D> repeat_interleave(size_t dim, size_t repeats) const {
-    assert(dim < shape.size());
-
-    Shape temp_shape;
-    Shape temp_stride;
-
-    for (size_t dim_ = 0; dim_ <= dim; ++dim_) {
-      temp_shape.push_back(shape[dim_]);
-      temp_stride.push_back(stride[dim_]);
-    }
-
-    temp_shape.push_back(repeats);
-    temp_stride.push_back(0);
-
-    for (size_t dim_ = dim + 1; dim_ < shape.size(); ++dim_) {
-      temp_shape.push_back(shape[dim_]);
-      temp_stride.push_back(stride[dim_]);
-    }
-
-    size_t temp_size = 1;
-    for (auto dim_ : temp_shape) {
-      temp_size *= dim_;
-    }
-
-    TensorView temp_view{data, temp_size, temp_shape, temp_stride};
-
-    Tensor<T, D> materialized = temp_view.copy();
-
-    Shape final_shape;
-    for (size_t dim_ = 0; dim_ < shape.size(); ++dim_) {
-      if (dim_ == dim) {
-        final_shape.push_back(shape[dim_] * repeats); // Expanded dimension
-      } else {
-        final_shape.push_back(shape[dim_]);
-      }
-    }
-
-    return materialized.view().reshape(final_shape);
-  }
-
   [[nodiscard]] bool is_contiguous() const {
     if (shape.empty()) {
       return true;
@@ -296,7 +278,10 @@ template <DType T, Device D> struct TensorView {
     return true;
   }
 
-  template <DType OutT, typename Func> Tensor<OutT, D> map(Func func) const {
+  template <DType OutT, typename Func>
+  Tensor<OutT, D> map(Func func) const
+    requires std::same_as<D, device::CPU>
+  {
     Tensor<OutT, D> result{shape};
 
     auto result_span = result.span();
@@ -317,7 +302,10 @@ template <DType T, Device D> struct TensorView {
     return result;
   }
 
-  template <typename Func> void each(Func func) const {
+  template <typename Func>
+  void each(Func func) const
+    requires std::same_as<D, device::CPU>
+  {
     size_t total_elems = total_elements();
 
     for (size_t linear_idx = 0; linear_idx < total_elems; ++linear_idx) {
@@ -332,10 +320,6 @@ template <DType T, Device D> struct TensorView {
     }
   }
 
-  template <DType OutT> Tensor<OutT, D> to() const {
-    return map<OutT>([](T val) { return static_cast<OutT>(val); });
-  }
-
   void check_for_nans() const {
     for (size_t i = 0; i < span().size(); ++i) {
       if (std::isnan(span()[i])) {
@@ -349,10 +333,6 @@ template <DType T, Device D> struct TensorView {
     }
   }
 
-  Tensor<std::remove_const_t<T>, D> copy() const {
-    return map<std::remove_const_t<T>>([](T val) { return val; });
-  }
-
   Tensor<std::remove_const_t<T>, D> contiguous() const {
     Tensor<std::remove_const_t<T>, D> result{shape};
     auto dst_span = result.span();
@@ -406,18 +386,6 @@ template <DType T, Device D> struct TensorView {
     return out;
   }
 
-  Tensor<std::remove_const_t<T>, D> cos() const {
-    return map<std::remove_const_t<T>>([](T val) { return std::cos(val); });
-  }
-
-  Tensor<std::remove_const_t<T>, D> sin() const {
-    return map<std::remove_const_t<T>>([](T val) { return std::sin(val); });
-  }
-
-  Tensor<std::remove_const_t<T>, D> exp() const {
-    return map<std::remove_const_t<T>>([](T val) { return std::exp(val); });
-  }
-
   T item() const {
     assert(data_size == 1);
     return data[0];
@@ -491,11 +459,6 @@ template <DType T, Device D> class Tensor {
     return TensorView<const T, D>{data(), size(), shape(), get_all_strides(shape())};
   }
 
-  // Copy to a new mutable tensor
-  Tensor<std::remove_const_t<T>, D> copy() const {
-    return view().copy();
-  }
-
   void fill_(T value)
     requires(!std::is_const_v<T>)
   {
@@ -547,17 +510,17 @@ template <DType T, Device D> class Tensor {
     span()[idx] = value;
   }
 
-  T item() const {
-    assert(shape().size() == 0);
-    return storage_.data()[0];
-  }
-
   T at(int idx) const {
-    if (idx > size()) {
+    // Bounds-checked read of element `idx` from storage_. Valid indices are [0, size()):
+    // idx == size() is already one past the last element, so it is rejected together with
+    // negative values. Throws std::out_of_range on violation.
+    if (idx < 0 || static_cast<size_t>(idx) >= size()) {
       throw std::out_of_range("cannot index past the tensor size");
     }
     return storage_[idx];
   }
+
+  /// Returns the single value held by a rank-0 tensor.
+  /// The rank check is an assert, so it is compiled out when NDEBUG is defined.
+  T item() const {
+    assert(shape().size() == 0);
+    // Read through at() (i.e. storage_'s operator[]) instead of dereferencing data() directly.
+    // NOTE(review): presumably so device-resident storage can be read from the host - confirm.
+    return at(0);
+  }
 };
 
 } // namespace tensor
@@ -619,7 +582,7 @@ template <tensor::DType T, tensor::Device D> struct fmt::formatter<tensor::Tenso
     const auto& strides = tensor_view.stride;
     if (dim == shape.size()) {
       // Base case: actually print one scalar
-      return fmt::format_to(out, "{}", tensor_view.span()[offset]);
+      return fmt::format_to(out, "{}", tensor_view[offset]);
     }
 
     auto dim_size = shape[dim];
 
@@ -107,8 +107,8 @@ GroupedQueryAttention<T, D>::forward(const TensorView<T, D>& inputs,
   keys_v = keys.view();
 
   // repeat-expand to (batch, [num_kv_groups * group_size], seq_len, head_dim)
-  keys = keys_v.repeat_interleave(1, group_size);
-  values = values_v.repeat_interleave(1, group_size);
+  keys = repeat_interleave(keys_v, 1, group_size);
+  values = repeat_interleave(values_v, 1, group_size);
 
   auto transposed_keys_ = keys.view();
   transposed_keys_.transpose(2, 3); // (batch, [num_kv_groups*group_size], head_dim, kvs_len)
 
@@ -30,8 +30,8 @@ KVCache<T, D>::forward(tensor::TensorView<T, D> new_keys, tensor::TensorView<T,
     all_keys = cat(already_cached_keys.view(), new_keys, 1);
     all_values = cat(already_cached_values.view(), new_values, 1);
   } else { // prefill
-    all_keys = new_keys.copy();
-    all_values = new_values.copy();
+    all_keys = copy(new_keys);
+    all_values = copy(new_values);
   }
 
   replace_from_(k_cache, all_keys.view());
 
@@ -15,7 +15,7 @@ precompute_rope_values(size_t head_dim, float theta_base, size_t context_length)
 
   // compute the inverse frequencies
   Tensor<int, D> range = arange<int, D>(0, head_dim, 2);
-  auto range_float = range.view().template to<float>();
+  auto range_float = to<int, float>(range.view());
 
   auto scaled = div(range_float.view(), float(head_dim));
 
@@ -47,7 +47,7 @@ precompute_rope_values(size_t head_dim, float theta_base, size_t context_length)
       // Medium frequency: smooth interpolation
       float smooth =
           (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor);
-      float scaled_inv_freq = ((1.0 - smooth) * (inv_f / factor)) + (smooth * inv_f);
+      float scaled_inv_freq = ((1.0 - smooth) * (inv_f / factor)) + (smooth * inv_f); // NOLINT
       inv_freq_.span()[i] = scaled_inv_freq;
     }
   }
@@ -86,21 +86,18 @@ Tensor<std::remove_const_t<T>, D> RoPE<T, D>::forward(TensorView<T, D> inputs,
 
   assert(head_dim % 2 == 0);
 
-  // Copy inputs to a tensor (stay in bfloat16)
-  Tensor<T, D> inputs_t = inputs.copy();
-
   // Slice and convert cos/sin to bfloat16
   auto adj_cos_ = slice(cos.view(), 0, position_offset, position_offset + seq_len);
-  auto adj_cos_bf16 = adj_cos_.view().template to<T>();
+  auto adj_cos_bf16 = to<float, T>(adj_cos_.view());
   auto adj_cos = adj_cos_bf16.view().reshape({1, 1, seq_len, head_dim});
 
   auto adj_sin_ = slice(sin.view(), 0, position_offset, position_offset + seq_len);
-  auto adj_sin_bf16 = adj_sin_.view().template to<T>();
+  auto adj_sin_bf16 = to<float, T>(adj_sin_.view());
   auto adj_sin = adj_sin_bf16.view().reshape({1, 1, seq_len, head_dim});
 
   // Split input into halves
-  auto first_half = slice(inputs_t.view(), -1, 0, head_dim / 2);
-  auto second_half = slice(inputs_t.view(), -1, head_dim / 2, head_dim);
+  auto first_half = slice(inputs, -1, 0, head_dim / 2);
+  auto second_half = slice(inputs, -1, head_dim / 2, head_dim);
 
   // Negate second half
   auto second_half_neg = mul(second_half.view(), T(-1.0));
@@ -109,7 +106,7 @@ Tensor<std::remove_const_t<T>, D> RoPE<T, D>::forward(TensorView<T, D> inputs,
   auto rotated = cat(second_half_neg.view(), first_half.view(), -1);
 
   // Apply rotation: inputs * cos + rotated * sin
-  auto input_cos = mul(inputs_t.view(), adj_cos.view());
+  auto input_cos = mul(inputs, adj_cos.view());
   auto rotated_sin = mul(rotated.view(), adj_sin.view());
 
   auto out = add(input_cos.view(), rotated_sin.view());
 
@@ -19,6 +19,19 @@ target_sources(nn_core
 
 add_subdirectory(cpu)
 
+# The CUDA backend defaults to OFF on Apple platforms and ON everywhere else; either default
+# can be overridden with -DNN_BUILD_CUDA=<ON|OFF> at configure time.
+if(APPLE)
+  option(NN_BUILD_CUDA "Build nn_cuda library" OFF)
+else()
+  option(NN_BUILD_CUDA "Build nn_cuda library" ON)
+endif()
+
+# When enabled: turn on the CUDA language, require the CUDA toolkit (configure fails if it is
+# missing), and pull in the cuda/ subdirectory.
+if(NN_BUILD_CUDA)
+  enable_language(CUDA)
+  find_package(CUDAToolkit REQUIRED)
+  add_subdirectory(cuda)
+  # INTERFACE: NN_HAS_CUDA is defined for targets that link nn_core, not for nn_core's own
+  # sources. NOTE(review): confirm that is intended, and that NN_HAS_CUDA (rather than
+  # TENSOR_HAS_CUDA, which tensor.hpp tests) is the macro the nn sources check.
+  target_compile_definitions(nn_core INTERFACE NN_HAS_CUDA)
+endif()
+
 source_group(
   TREE "${PROJECT_SOURCE_DIR}/include"
   PREFIX "Header Files"
 
@@ -7,19 +7,29 @@ using namespace tensor;
 template <typename T, typename D>
 Tensor<std::remove_const_t<T>, D> Softmax::operator()(const TensorView<T, D>& input,
                                                       int dim) const {
-  Tensor<float, D> f32 = input.template to<float>();
+  // Softmax along `dim`: the input is converted to float for the intermediate math and the
+  // result is converted back to T at the end. No diagnostic output is emitted here; dumping
+  // every intermediate tensor to stdout on each call belongs in a test or behind a debug flag.
+  Tensor<float, D> f32 = to<T, float>(input);
 
-  auto maxes = max(f32.view(), dim, true);
+  // Subtract the per-slice maximum (keepdim = true) before exponentiating.
+  auto maxes = tensor::max(f32.view(), dim, true);
 
   auto scaled = sub(f32.view(), maxes.view());
 
-  auto expd = scaled.view().exp();
+  auto expd = tensor::exp(scaled.view());
 
   auto expd_sum = sum(expd.view(), dim, true);
 
-  auto out = div(expd.view(), expd_sum.view());
+  // Normalise by the per-slice sum of exponentials.
+  auto out = tensor::div(expd.view(), expd_sum.view());
 
-  return out.view().template to<T>();
+  return to<float, T>(out.view());
 }
 
 template Tensor<bfloat16, CPU> Softmax::operator()(const TensorView<bfloat16, CPU>& input,