\n Write a GPU program that performs element-wise addition of two vectors containing 32-bit floating point numbers.\n The program should take two input vectors of equal length and produce a single output vector containing their sum.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in vector C
\n
\n\n
Example 1:
\n
\nInput: A = [1.0, 2.0, 3.0, 4.0]\n B = [5.0, 6.0, 7.0, 8.0]\nOutput: C = [6.0, 8.0, 10.0, 12.0]\n
\n\n
Example 2:
\n
\nInput: A = [1.5, 1.5, 1.5]\n B = [2.3, 2.3, 2.3]\nOutput: C = [3.8, 3.8, 3.8]\n
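A minimal CUDA sketch of the element-wise addition described above. The exact solve signature is not given in this statement, so the parameter list below (device pointers plus a length N) is an assumption; the kernel simply maps one thread to one output element.

#include <cuda_runtime.h>

// One thread per element; A, B, C are assumed to be device pointers.
__global__ void vector_add_kernel(const float* A, const float* B, float* C, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) C[i] = A[i] + B[i];
}

// Hypothetical host wrapper illustrating the launch pattern.
void solve(const float* A, const float* B, float* C, int N) {
    int threads = 256;
    int blocks = (N + threads - 1) / threads;
    vector_add_kernel<<<blocks, threads>>>(A, B, C, N);
    cudaDeviceSynchronize();
}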
\n Implement a program that performs the Rectified Linear Unit (ReLU) activation function on a vector of 32-bit floating point numbers.\n The ReLU function sets all negative values to zero and leaves positive values unchanged: $$\\text{ReLU}(x) = \\max(0, x)$$\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n Implement a program that performs the leaky ReLU activation function on a vector of floating-point numbers. The leaky ReLU function is defined as:\n $$ f(x) = \\begin{cases}\n x & \\text{if } x > 0 \\\\\n \\alpha x & \\text{if } x \\leq 0\n \\end{cases} $$\n where $\\alpha$ is a small positive constant (0.01 in this problem).\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in vector output
\n
Use $\\alpha = 0.01$ as the leaky coefficient
\n
\n\n
Example 1:
\n
\n Input: x = [1.0, -2.0, 3.0, -4.0]\n Output: y = [1.0, -0.02, 3.0, -0.04]
\n\n
Example 2:
\n
\n Input: x = [-1.5, 0.0, 2.5, -3.0]\n Output: y = [-0.015, 0.0, 2.5, -0.03]
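A short CUDA sketch of the leaky ReLU above, assuming x and y are device pointers of length N (the real signature may differ); it hard-codes the required α = 0.01.

__global__ void leaky_relu_kernel(const float* x, float* y, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        float v = x[i];
        y[i] = (v > 0.0f) ? v : 0.01f * v;   // alpha = 0.01
    }
}
// Launch with e.g. <<<(N + 255) / 256, 256>>>.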
\n Implement a program that performs R rounds of parallel hashing on an array of 32-bit integers using the provided hash function.\n The hash should be applied R times iteratively (the output of one round becomes the input to the next).\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n Write a program that multiplies two matrices of 32-bit floating point numbers on a GPU.\n Given matrix $A$ of dimensions $M \\times N$ and matrix $B$ of dimensions $N \\times K$, compute\n the product matrix $C = A \\times B$, which will have dimensions $M \\times K$.\n All matrices are stored in row-major format.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
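A naive CUDA sketch of the row-major matrix product above, with one thread per output element of C; the dimension arguments M, N, K are assumed to be passed alongside the device pointers.

__global__ void matmul_kernel(const float* A, const float* B, float* C,
                              int M, int N, int K) {
    int row = blockIdx.y * blockDim.y + threadIdx.y;   // 0 .. M-1
    int col = blockIdx.x * blockDim.x + threadIdx.x;   // 0 .. K-1
    if (row < M && col < K) {
        float acc = 0.0f;
        for (int n = 0; n < N; ++n)
            acc += A[row * N + n] * B[n * K + col];    // A is M x N, B is N x K
        C[row * K + col] = acc;
    }
}
// Launch with a 2D grid, e.g. dim3 block(16, 16); dim3 grid((K + 15) / 16, (M + 15) / 16).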
\n Implement a program that copies an $N \\times N$ matrix of 32-bit floating point numbers from input array $A$ to output array $B$ on the GPU. The program should perform a direct element-wise copy so that $B_{i,j} = A_{i,j}$ for all valid indices.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in matrix B
\n
\n\n
Example 1:
\n
\nInput: A = [[1.0, 2.0],\n [3.0, 4.0]]\nOutput: B = [[1.0, 2.0],\n [3.0, 4.0]]\n
\n Write a program that transposes a matrix of 32-bit floating point numbers on a GPU. The\n transpose of a matrix switches its rows and columns. Given a matrix $A$ of dimensions $rows \\times cols$, the transpose $A^T$ will have dimensions $cols \\times rows$. All matrices are stored in row-major format.\n
\n\n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the matrix output
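A simple CUDA sketch of the transpose above (no shared-memory tiling), assuming device pointers plus rows/cols arguments.

__global__ void transpose_kernel(const float* input, float* output, int rows, int cols) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (r < rows && c < cols)
        output[c * rows + r] = input[r * cols + c];   // output is cols x rows, row-major
}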
\n Run inference on a PyTorch model. Given an input tensor and a trained torch.nn.Linear model, compute the forward pass and store the result in the output tensor.\n
\n\n
\n The model performs a linear transformation: output = input @ weight.T + bias where weight has shape [output_size, input_size] and bias has shape [output_size].\n
\n\n
Implementation Requirements
\n
\n
Use PyTorch's built-in functions and operations
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output tensor
\n
The model is already loaded and ready for inference
\n Implement the SiLU (Sigmoid Linear Unit) activation function forward pass for 1D input vectors.\n Given an input tensor of shape [N] where N is the number of elements, compute the output using the elementwise formula.\n
\n\n
\n SiLU is defined as:\n $$\n \\begin{align}\n \\sigma(x) &= \\frac{1}{1 + e^{-x}} \\\\\n \\text{SiLU}(x) &= x \\cdot \\sigma(x)\n \\end{align}\n $$\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output tensor
\n Implement the Swish-Gated Linear Unit (SWiGLU) activation function forward pass for 1D input vectors.\n Given an input tensor of shape [N] where N is the number of elements, compute the output using the elementwise formula.\n The input and output tensor must be of type float32.\n
\n\n
\n SWiGLU is defined as:\n
\n
Split input $x$ into two halves: $x_1$ and $x_2$
\n
Compute SiLU on the first half:\n $$\n \\text{SiLU}(x_1) = x_1 \\cdot \\sigma(x_1), \\quad\n \\sigma(x) = \\frac{1}{1 + e^{-x}}\n $$\n
\n Implement a GPU program that performs clipping on 1D input vectors.\n Given an input tensor of shape [N] where N is the number of elements,\n compute the output by clipping each element to a specified range [lo, hi].\n The input and output tensor must be of type float32.\n
\n\n
\n Clipping is defined as:\n
\n
For each element x in the input tensor, \"clip\" the element so that it falls within the allowed range [lo, hi].\n
\n
This operation ensures all values are within the specified range and is commonly used in ML for activation stabilization and pre-quantization.
\n \n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output tensor
\n
\n\n
Example 1:
\n
\nInput: [1.5, -2.0, 3.0, 4.5], lo = 0.0, hi = 3.5\nOutput: [1.5, 0.0, 3.0, 3.5]\n
\n\n
Example 2:
\n
\nInput: [-1.0, 2.0, 5.0], lo = -0.5, hi = 2.5\nOutput: [-0.5, 2.0, 2.5]\n
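A one-thread-per-element CUDA sketch of the clipping above, assuming lo and hi arrive as scalar arguments.

__global__ void clip_kernel(const float* input, float* output, int N, float lo, float hi) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) output[i] = fminf(fmaxf(input[i], lo), hi);   // clamp to [lo, hi]
}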
\n Write a GPU program that interleaves two arrays of 32-bit floating point numbers.\n Given two input arrays A and B, each of length N,\n produce an output array of length 2N where elements alternate between the two inputs:\n [A[0], B[0], A[1], B[1], A[2], B[2], ...]\n
\n\n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output array
\n
\n\n
Example 1:
\n
\nInput: A = [1.0, 2.0, 3.0], B = [4.0, 5.0, 6.0]\nOutput: [1.0, 4.0, 2.0, 5.0, 3.0, 6.0]\n
\n\n
Example 2:
\n
\nInput: A = [10.0, 20.0], B = [30.0, 40.0]\nOutput: [10.0, 30.0, 20.0, 40.0]\n
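A CUDA sketch of the interleave above; each thread writes the pair (A[i], B[i]) into positions 2i and 2i+1 of the output (signature assumed).

__global__ void interleave_kernel(const float* A, const float* B, float* output, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        output[2 * i]     = A[i];
        output[2 * i + 1] = B[i];
    }
}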
\n Implement the Gaussian Error Gated Linear Unit (GEGLU) activation function forward pass for 1D input\n vectors. Given an input tensor of shape [N] where N is the number of elements, compute the output\n using the elementwise formula. The input and output tensor must be of type float32.\n
\n\n
\n GEGLU is defined as:\n
\n
Split input $x$ into two halves: $x_1$ and $x_2$
\n
Compute GELU on the second half:\n $$\n \\text{GELU}(x_2) = \\frac{1}{2} x_2 \\left(1 + \\text{erf}\\left(\\frac{x_2}{\\sqrt{2}}\\right)\\right)\n $$\n
\n Implement a GPU program that converts an RGB image to grayscale on the GPU.\n Given an input RGB image represented as a 1D array of 32-bit floating point values,\n compute the corresponding grayscale image using the standard RGB to grayscale conversion formula.\n
\n\n
The conversion formula is: gray = 0.299 × R + 0.587 × G + 0.114 × B
\n\n
The input array input contains height × width × 3 elements, where the RGB values for each pixel are stored consecutively (R, G, B, R, G, B, ...). The output array output should contain height × width grayscale values.
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the array output
\n
Use the exact coefficients: 0.299 for red, 0.587 for green, 0.114 for blue
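A CUDA sketch of the conversion above with one thread per pixel and the required coefficients; width and height are assumed to be passed so the pixel count can be formed.

__global__ void grayscale_kernel(const float* input, float* output, int num_pixels) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_pixels) {
        float r = input[3 * i];
        float g = input[3 * i + 1];
        float b = input[3 * i + 2];
        output[i] = 0.299f * r + 0.587f * g + 0.114f * b;
    }
}
// Launched with num_pixels = width * height.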
\n Write a GPU program that applies the sigmoid activation function element-wise to a vector of\n 32-bit floating point numbers. For each element x in the input vector X,\n compute sigmoid(x) = 1 / (1 + exp(-x)) and store the result in the output vector\n Y. The sigmoid function maps any real number to the range (0, 1).\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in vector Y
\n
\n\n
Example 1:
\n
\nInput: X = [0.0, 1.0, -1.0, 2.0]\nOutput: Y = [0.5, 0.7311, 0.2689, 0.8808]\n
\n\n
Example 2:
\n
\nInput: X = [0.5, -0.5, 3.0, -3.0]\nOutput: Y = [0.6225, 0.3775, 0.9526, 0.0474]\n
\n\n
Constraints
\n
\n
1 ≤ N ≤ 100,000,000
\n
Input values are finite 32-bit floating point numbers
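A CUDA sketch of the element-wise sigmoid above (signature assumed), using expf for the single-precision exponential.

__global__ void sigmoid_kernel(const float* X, float* Y, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) Y[i] = 1.0f / (1.0f + expf(-X[i]));
}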
\n Write a program to invert the colors of an image. The image is\n represented as a 1D array of RGBA (Red, Green, Blue, Alpha) values, where each\n component is an 8-bit unsigned integer (unsigned char).\n
\n\n
\n Color inversion is performed by subtracting each color component (R, G, B)\n from 255. The Alpha component should remain unchanged.\n
\n\n
\n The input array\n image will contain width * height * 4 elements. The\n first 4 elements represent the RGBA values of the top-left pixel, the next 4\n elements represent the pixel to its right, and so on.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
\n The final result must be stored in the array\n image\n
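A CUDA sketch of the in-place inversion above; one thread handles one RGBA pixel and leaves the alpha byte untouched (width and height arguments assumed).

__global__ void invert_kernel(unsigned char* image, int num_pixels) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < num_pixels) {
        int base = 4 * i;
        image[base]     = 255 - image[base];       // R
        image[base + 1] = 255 - image[base + 1];   // G
        image[base + 2] = 255 - image[base + 2];   // B
        // image[base + 3] (alpha) is left unchanged
    }
}
// Launched with num_pixels = width * height.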
\n Implement a program that performs element-wise addition of two $N \\times N$ matrices containing 32-bit floating point numbers on a GPU.\n The program should take two input matrices of equal dimensions and produce a single output matrix containing their element-wise sum.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in matrix C
\n
\n\n
Example 1:
\n
\nInput: A = [[1.0, 2.0],\n [3.0, 4.0]]\n B = [[5.0, 6.0],\n [7.0, 8.0]]\nOutput: C = [[6.0, 8.0],\n [10.0, 12.0]]\n
\n Implement a program that performs a 1D convolution operation. Given an input array and a kernel (filter), compute the convolved\n output. The convolution should be performed with a \"valid\" boundary condition, meaning the kernel is only applied\n where it fully overlaps with the input.\n
\n\n\n\n
\n The input consists of two arrays:\n
\n
input: A 1D array of 32-bit floating-point numbers.
\n
kernel: A 1D array of 32-bit floating-point numbers representing the convolution kernel.
\n
\nThe output should be written to the output array, which will have a size of input_size - kernel_size + 1.\n\n\n
\n The convolution operation is defined mathematically as:\n
\n Implement a program for multi-head self-attention. Given three input matrices $Q$ (queries), $K$ (keys), and $V$ (values) of size $N \\times d_{\\text{model}}$, compute:\n $$ \\text{MultiHead}(Q,K,V) = \\text{Concat}(\\text{head}_1,\\ldots,\\text{head}_h) $$\n where each head computes:\n $$ \\text{head}_i = \\text{softmax}\\left(\\frac{Q_iK_i^T}{\\sqrt{d_k}}\\right)V_i $$\n with $d_k = d_{\\text{model}}/h$ and $Q_i, K_i, V_i$ being the i-th head's partition of the input matrices.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output array
\n Implement the k-means clustering algorithm for 2D points. Given arrays of x and y coordinates for data points, initial centroids, and other parameters, assign each point to the nearest centroid and update the centroids iteratively. The final centroids and labels should be stored in the output arrays.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in labels, final_centroid_x, and final_centroid_y
\n Implement a radix sort algorithm that sorts an array of 32-bit unsigned integers on a GPU.\n The program should take an input array of unsigned integers and sort them in ascending order using the radix sort algorithm.\n The input parameter contains the unsorted array, and the sorted result should be stored in the output array.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final sorted result must be stored in the output array
\n
Use radix sort algorithm (not other sorting algorithms)
Implement a GPU program that computes the Fast Fourier Transform (FFT) of a complex-valued 1-D signal. Given an input signal array containing N complex numbers stored as interleaved real/imaginary pairs, compute the discrete Fourier transform and store the result in the spectrum array. The FFT converts a time-domain signal into its frequency-domain representation using the formula: $$ X_k = \\sum_{n=0}^{N-1} x_n \\cdot e^{-j 2\\pi kn / N} \\quad \\text{for } k = 0, 1, \\ldots, N-1 $$ The FFT algorithm reduces the computational complexity from O(N²) to O(N log N) by exploiting symmetries in the twiddle factors.
\n\n
Implementation Requirements
\n
\n
External libraries (cuFFT etc.) are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the spectrum array
\n
The kernel must be entirely GPU-resident; no host-side FFT calls
\n
Both input and output use interleaved real/imaginary layout: [real₀, imag₀, real₁, imag₁, ...]
\n Implement a program that finds the shortest path in an unweighted 2D grid using Breadth-First Search (BFS). Given a grid with obstacles and start/end positions, return the minimum number of steps needed to reach the destination.\n
\n\n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
Return the shortest path length, or -1 if no path exists
\n
Grid cells with value 0 are free, cells with value 1 are obstacles
\n
Movement is allowed in 4 directions: up, down, left, right
Implement Causal (masked) Self-Attention for a given set of matrices. Given the query matrix Q of size M×d, key matrix K of size M×d, and value matrix V of size M×d, your program should compute the output matrix using the formula: $$\\text{Attention}_{\\text{causal}}(Q, K, V) = \\text{softmax}\\Bigl(\\text{masked}\\Bigl( \\frac{QK^T}{\\sqrt{d}} \\Bigr)\\Bigr)V$$
\n\n\n
where masked applies a causal mask that sets all positions corresponding to keys after the current query to $-\\infty$, i.e., for query i and key j:
$$
\\text{masked}(a_{ij}) =
\\begin{cases}
a_{ij}, & j \\le i \\\\
-\\infty, & j > i
\\end{cases}
$$
The softmax function is applied row-wise. Q, K, V, and output are all of data type float32; M and d are of data type int32.
\n\n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The\n solve function signature must remain unchanged\n
\n
The final result must be stored in the output matrix\n output\n
Implement Linear Attention for a given set of matrices, following the method described in "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention". Given the query matrix Q of size M×d, key matrix K of size M×d, and value matrix V of size M×d, your program should compute the output matrix using the formula:
$$
\\text{LinearAttention}(Q, K, V) = \\frac{\\phi(Q) \\left(\\phi(K)^T V \\right)}{\\phi(Q) \\left(\\sum_j \\phi(K_j) \\right)}
$$
\n\n
\n where $ \\phi(x) $ is a feature map applied element-wise, for example:\n $$\n \\phi(x) = \\text{ELU}(x) + 1 =\n \\begin{cases}\n x + 1, & x > 0 \\\\\n e^x, & x \\le 0\n \\end{cases}\n $$\n All matrices Q, K, V, and output are of type float32, and M and d are of type int32.\n
\n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The\n solve function signature must remain unchanged\n
\n
The final result must be stored in the output matrix\n output\n
\n Implement Sliding Window Self-Attention for a given set of matrices.\n Before introducing the sliding window version, let's first recall standard Self-Attention.\n
\n\n
1. Standard Softmax Attention
\n
\n Given query matrix Q, key matrix K, and value matrix V, each position i attends to all positions j using a softmax-weighted sum:\n
\n In other words, each query computes similarity with all keys, applies a softmax to get attention weights, and then computes a weighted sum of values.\n
\n\n
2. Sliding Window Self-Attention
\n
\n Sliding Window Attention modifies standard attention by restricting each query to attend only to a local window around its position.\n
\n\n
\n
For each position i, only consider the keys and values within a window of size window_size around i (positions [i-window_size, ..., i+window_size]).
\n
Compute similarity scores between Qi and the keys in this window:
\n Given a weighted directed graph of N vertices represented as an\n N × N distance matrix, compute the shortest path distance between\n every pair of vertices using the Floyd-Warshall algorithm. The matrix is stored as a flat array in\n row-major order: dist[i * N + j] is the weight of the directed edge from vertex\n i to vertex j. A value of +infinity means no direct edge\n exists. The diagonal is always zero. For each intermediate vertex k from 0 to N - 1\n (in order), update all pairs:\n
\n Implement a single GPT-2 transformer decoder block. Given an input tensor\n $x$ of shape (seq_len, 768) and a packed weight buffer containing\n all block parameters, compute the output using pre-norm architecture with\n multi-head self-attention and a feed-forward network with GELU activation.\n
\n\n\n\n
The block uses GPT-2's pre-norm architecture: LayerNorm is applied\nbefore each sub-layer (attention and feed-forward), not after. At a high level:
$$
\\begin{aligned}
x' &= x + \\text{MultiHeadAttn}\\!\\left(\\text{LN}_1(x)\\right) \\\\[4pt]
\\text{output} &= x' + \\text{FeedForward}\\!\\left(\\text{LN}_2(x')\\right)
\\end{aligned}
$$
Layer Norm 1: $x_{\\text{norm}} = \\text{LN}_1(x)$ with parameters $\\gamma_1, \\beta_1$
\n
QKV Projection: $QKV = x_{\\text{norm}} \\cdot W_{qkv} + b_{qkv}$, split into $Q, K, V$ each of shape (seq_len, 768)
\n
Multi-Head Attention: Reshape $Q, K, V$ into 12 heads of dimension 64, compute per-head scaled dot-product attention (no causal mask), then concatenate heads into $A$
\n
Output Projection: $P = A \\cdot W_{\\text{attn}} + b_{\\text{attn}}$
\n
Residual 1: $x' = x + P$
\n
Layer Norm 2: $h_{\\text{norm}} = \\text{LN}_2(x')$ with parameters $\\gamma_2, \\beta_2$
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output tensor
\n
LayerNorm uses $\\epsilon = 10^{-5}$
\n
Use the GELU tanh approximation: $\\text{GELU}(x) = 0.5\\,x\\!\\left(1 + \\tanh\\!\\left(\\sqrt{\\tfrac{2}{\\pi}}\\left(x + 0.044715\\,x^3\\right)\\right)\\right)$
\n
\n\n
Weight Layout
\n
All block parameters are packed into a single contiguous weights buffer\n(7,087,872 floats) in the following order. Index into the buffer using the offsets below\n(e.g. $W_{qkv}[i][j]$ is at weights[1536 + i * 2304 + j]).\nAll 2D matrices are stored in row-major order.
\n\n
Parameter                   Shape          Size        Offset
$\\gamma_1$ (LN1 weight)    (768,)         768         0
$\\beta_1$ (LN1 bias)       (768,)         768         768
$W_{qkv}$                   (768, 2304)    1,769,472   1,536
$b_{qkv}$                   (2304,)        2,304       1,771,008
$W_{\\text{attn}}$          (768, 768)     589,824     1,773,312
$b_{\\text{attn}}$          (768,)         768         2,363,136
$\\gamma_2$ (LN2 weight)    (768,)         768         2,363,904
$\\beta_2$ (LN2 bias)       (768,)         768         2,364,672
$W_{fc}$                    (768, 3072)    2,359,296   2,365,440
$b_{fc}$                    (3072,)        3,072       4,724,736
$W_{\\text{proj}}$          (3072, 768)    2,359,296   4,727,808
$b_{\\text{proj}}$          (768,)         768         7,087,104
\n\n
Example
\n
With seq_len = 4, x uniformly drawn from [−1, 1], and weights randomly initialized (see Weight Layout for the packing structure):
\n Implement a single Llama-style transformer decoder block. Given an input tensor $x$ of shape\n (seq_len, 512), a packed weight buffer, and precomputed RoPE tables, compute the\n output using pre-norm architecture with Grouped Query Attention (GQA), Rotary Position Embeddings\n (RoPE), and a SwiGLU feed-forward network.\n
\n\n\n\n
\n The block follows Llama's pre-norm architecture. Unlike GPT-2, it uses\n RMSNorm (no mean subtraction, no additive bias), Grouped Query\n Attention with 8 query heads and 2 key/value heads, Rotary Position\n Embeddings applied to Q and K, and a SwiGLU feed-forward network.\n None of the linear projections have bias terms.\n
where $M_{\\text{causal}}$ is the upper-triangular causal mask ($-\\infty$ above the diagonal)\nand $\\text{SiLU}(x) = x \\cdot \\sigma(x)$.
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output tensor
\n
RMSNorm uses $\\varepsilon = 10^{-5}$, no additive bias
\n
Apply causal masking: position $i$ attends only to positions $\\le i$
\n
Repeat K and V heads $4\\times$ (GQA groups) before computing attention
\n
cos and sin have shape (seq_len, 32); apply them to both Q and K heads independently
\n
\n\n
Weight Layout
\n
All parameters are packed into a single contiguous weights buffer\n(2,819,072 floats) in the order below. All 2-D matrices are stored row-major\nwith shape (out_dim, in_dim). There are no bias terms.
\n\n
Parameter                   Shape          Size       Offset
$w_1$ (RMSNorm 1 scale)     (512,)         512        0
$W_Q$                       (512, 512)     262,144    512
$W_K$                       (128, 512)     65,536     262,656
$W_V$                       (128, 512)     65,536     328,192
$W_O$                       (512, 512)     262,144    393,728
$w_2$ (RMSNorm 2 scale)     (512,)         512        655,872
$W_{\\text{gate}}$          (1408, 512)    720,896    656,384
$W_{\\text{up}}$            (1408, 512)    720,896    1,377,280
$W_{\\text{down}}$          (512, 1408)    720,896    2,098,176
\n\n
Example
\n
With seq_len = 4, x drawn uniformly from [−1, 1], and randomly\ninitialized weights:
\n Write a program that performs a 2D convolution operation on the GPU. Given an input matrix and a kernel (filter), compute the convolved\n output. The convolution should be performed with a \"valid\" boundary condition, meaning the kernel is only applied\n where it fully overlaps with the input.\n
\n\n\n\n
\n The input consists of:\n
\n
input: A 2D matrix of 32-bit floating-point numbers, represented as a 1D array in row-major order.\n
\n
kernel: A 2D kernel (filter) of 32-bit floating-point numbers, also represented as a 1D array in\n row-major order.
\n
\n\n\n
\n The output should be written to the output matrix (also a 1D array in row-major order). The output matrix will have dimensions:\n
\n Implement a program that performs a 3D convolution operation. Given a 3D input volume and a 3D kernel (filter), compute the convolved\n output. The convolution should use a \"valid\" boundary condition (no padding).\n
\n\n
\n For a 3D convolution, the output at position $(i,j,k)$ is given by:\n
\n Write a GPU program that computes the histogram of an array of 32-bit integers.\n The histogram should count the number of occurrences of each integer value in the range [0, num_bins).\n You are given an input array input of length N and the number of bins num_bins.\n
\n\n
\n The result should be an array of integers of length\nnum_bins, where each element represents\nthe count of occurrences of its corresponding index in the input array.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The\n solve function signature must remain unchanged\n
\n
The final result must be stored in the\n histogram array.\n
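A CUDA sketch of the histogram above using a global atomic per input element; it assumes the histogram buffer is zeroed before the kernel runs. Per-block shared-memory histograms would reduce contention but are omitted for brevity.

__global__ void histogram_kernel(const int* input, int* histogram, int N, int num_bins) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) {
        int v = input[i];
        if (v >= 0 && v < num_bins)
            atomicAdd(&histogram[v], 1);   // one increment per occurrence
    }
}
// histogram must be zero-initialised first, e.g. cudaMemset(histogram, 0, num_bins * sizeof(int)).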
\n Write a GPU program that computes the prefix sum (cumulative sum) of an array of 32-bit floating point numbers.\n For an input array [a, b, c, d, ...], the prefix sum is [a, a+b, a+b+c, a+b+c+d, ...].\n
\n\n\n\n
Implementation Requirements
\n
\n
Use only GPU native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n Implement a GPU program that computes the dot product of two vectors containing 32-bit floating point numbers.\n The dot product is the sum of the products of the corresponding elements of two vectors.\n
\n
\n Mathematically, the dot product of two vectors $A$ and $B$ of length $n$ is defined as:\n $$\n A \\cdot B = \\sum_{i=0}^{n-1} A_i \\cdot B_i = A_0 \\cdot B_0 + A_1 \\cdot B_1 + \\ldots + A_{n-1} \\cdot B_{n-1}\n $$\n
\n
Implementation Requirements
\n
\n
Use only GPU native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
\n
\n
Example 1:
\n
Input: A = [1.0, 2.0, 3.0, 4.0]\n B = [5.0, 6.0, 7.0, 8.0]\n Output: result = 70.0 (1.0*5.0 + 2.0*6.0 + 3.0*7.0 + 4.0*8.0)
\n
Example 2:
\n
Input: A = [0.5, 1.5, 2.5]\n B = [2.0, 3.0, 4.0]\n Output: result = 15.5 (0.5*2.0 + 1.5*3.0 + 2.5*4.0)
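A CUDA sketch of the dot product above: each block reduces its partial products in shared memory, then contributes one atomicAdd to the scalar result (assumed to be a zero-initialised device pointer). The block size is fixed at 256 to match the shared-memory array.

__global__ void dot_kernel(const float* A, const float* B, float* result, int N) {
    __shared__ float partial[256];
    int tid = threadIdx.x;
    int i = blockIdx.x * blockDim.x + tid;
    partial[tid] = (i < N) ? A[i] * B[i] : 0.0f;
    __syncthreads();
    for (int s = blockDim.x / 2; s > 0; s >>= 1) {   // tree reduction within the block
        if (tid < s) partial[tid] += partial[tid + s];
        __syncthreads();
    }
    if (tid == 0) atomicAdd(result, partial[0]);     // one atomic per block
}
// Launch with 256 threads per block; *result must start at 0.0f.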
\n Implement a GPU program that performs sparse matrix-vector multiplication.\n Given a sparse matrix $A$ of dimensions $M \\times N$ and a dense vector $x$ of length $N$,\n compute the product vector $y = A \\times x$, which will have length $M$. A is stored in row-major order.\n nnz is the number of non-zero elements in A.\n
\n\n
\n Mathematically, the operation is defined as:\n $$\n y_i = \\sum_{j=0}^{N-1} A_{ij} \\cdot x_j \\quad \\text{for} \\quad i = 0, 1, \\ldots, M-1\n $$\n
\n\n
\n The matrix $A$ is approximately 60 - 70% sparse.\n
\n\n
Implementation Requirements
\n
\n
Use only GPU native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n Implement a basic General Matrix Multiplication (GEMM). Given matrix $A$ of dimensions $M \\times K$, matrix $B$ of dimensions $K \\times N$, input/output matrix $C$ of dimensions $M \\times N$, and scalar multipliers $ \\alpha $ and $ \\beta $, compute the operation:\n $$ C = \\alpha \\cdot (A \\times B) + \\beta \\cdot C_{initial} $$\n
\n
\n The input matrices $A$, $B$, and the initial state of $C$ contain 16-bit floating-point numbers (FP16/half). All matrices are stored in row-major order. The scalars $ \\alpha $ and $ \\beta $ are 32-bit floats.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries other than WMMA are not permitted).
\n
The solve function signature must remain unchanged.
\n
Accumulation during multiplication should use FP32 for better precision before converting the final result to FP16.
\n
The final result must be stored back into matrix C as half.
\n Implement a GPU program to calculate the categorical cross-entropy loss for a batch of predictions.\n Given a matrix of predicted logits $Z$ of size $N \\times C$ and a vector of true class labels true_labels of size $N$, compute the average cross-entropy loss over the batch.\n The loss for a single sample $j$ with logits $z_j = [z_{j1}, \\ldots, z_{jC}]$ and true label $y_j$ is calculated using the numerically stable formula:\n $$ \\text{Loss}_j = \\log\\left(\\sum_{k=1}^{C} e^{z_{jk}}\\right) - z_{j, y_j} $$\n The final output stored in the loss variable should be the average loss over the $N$ samples:\n $$ L = \\frac{1}{N} \\sum_{j=1}^{N} \\text{Loss}_j $$\n The input parameters are logits, true_labels, N (number of samples), and C (number of classes). The result should be stored in loss (a pointer to a single float).\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result (average loss) must be stored in loss
\n
\n\n
Example 1:
\n
Input: N = 2, C = 3\n logits = [[1.0, 2.0, 0.5], [0.1, 3.0, 1.5]]\n true_labels = [1, 1]\nOutput: loss = [0.3548926]
\n\n\n
Example 2:
\n
Input: N = 3, C = 4\n logits = [[-0.5, 1.5, 0.0, 1.0], [2.0, -1.0, 0.5, 0.5], [0.0, 0.0, 0.0, 0.0]]\n true_labels = [3, 0, 1]\nOutput: loss = [0.98820376]
\n Implement a GPU program to calculate the Mean Squared Error (MSE) between\n predicted values and target values. Given two arrays of equal length,\n predictions and targets, compute: $$ \\text{MSE} =\n \\frac{1}{N} \\sum_{i=1}^{N} (predictions_i - targets_i)^2 $$ where N is the\n number of elements in each array.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted.
\n
The solve function signature must remain unchanged.
\n
The final result must be stored in the mse variable.
\n Implement a program that applies a Gaussian blur filter to a 2D image. Given an input image represented as a floating-point array and a Gaussian kernel, the program should compute the convolution of the image with the kernel.\n All inputs and outputs are stored in row-major order.\n
\n\n
\n The Gaussian blur is performed by convolving each pixel with a weighted average of its neighbors, where the weights are determined by the Gaussian kernel. For each output pixel at position (i, j), the value is calculated as:\n\n $$ output[i, j] = \\sum_{m=-k_h/2}^{k_h/2} \\sum_{n=-k_w/2}^{k_w/2} input[i+m, j+n] \\times kernel[m+k_h/2, n+k_w/2] $$\n\n where $k_h$ and $k_w$ are the kernel height and width.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output array
\n
Handle boundary conditions by using zero-padding (treat values outside the image boundary as zeros)
\n Implement a GPU program that, given a 1D array input of 32-bit floating point numbers of length N, selects the k largest elements and writes them in descending order to the output array of length k.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output array
\n Implement a batched matrix multiplication in FP32. Given a batch of matrices A of shape [B, M, K] and a batch of matrices B of shape [B, K, N], compute the output batch C of shape [B, M, N] such that for each batch index b:\n $$\n C_b = A_b \\times B_b\n $$\n All matrices are stored in row-major order and use 32-bit floating point numbers (FP32).\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n Implement a quantized matrix multiplication program for 8-bit signed integer matrices. Given two input matrices A of dimensions $M \\times K$ and B of dimensions $K \\times N$, quantization scales scale_A, scale_B, output scale scale_C, zero-points zero_point_A, zero_point_B, zero_point_C, compute:\n $$\n C_{\\text{quant}}(i, j) = \\mathrm{clamp}\\left(\n \\mathrm{round}\\left(\n \\frac{\n \\sum_{k=0}^{K-1} (A_{ik} - z_A)(B_{kj} - z_B) \\cdot s_A s_B\n }{s_C}\n \\right) + z_C,\\ -128,\\ 127\n \\right)\n $$\n where s_A = scale_A, z_A = zero_point_A, etc.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output matrix C as int8
\n
After accumulation in int32 and scaling in float32, values must be rounded to the nearest integer, shifted by zero_point_C, and clamped to the [-128, 127] range
\n
\n\n
Example 1:
\n
\n Input:\n A = [[1, 2],\n [3, 4]]\n B = [[5, 6],\n [7, 8]]\n M = 2, N = 2, K = 2\n scale_A = 0.1, scale_B = 0.2, scale_C = 0.05\n zero_point_A = 0, zero_point_B = 0, zero_point_C = 0\n\n Output:\n C = [[19, 22],\n [43, 50]]\n
\n\n
Example 2:
\n
\n Input:\n A = [[1, 2]]\n B = [[3],\n [4]]\n M = 1, N = 1, K = 2\n scale_A = 1.0, scale_B = 1.0, scale_C = 1.0\n zero_point_A = 1, zero_point_B = 3, zero_point_C = 5\n\n Output:\n C = [[6]]\n
\n Solve the Ordinary Least Squares (OLS) regression problem on a GPU. Given a feature matrix $X$ of size $n\\_samples \\times n\\_features$ and a target vector $y$ of size $n\\_samples$, compute the coefficient vector $\\beta$ that minimizes the sum of squared residuals:\n $$ \\min_{\\beta} ||X\\beta - y||^2 $$\n\n The closed-form solution to OLS is:\n $$ \\beta = (X^TX)^{-1}X^Ty $$\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted.
\n
The solve function signature must remain unchanged.
\n
The final coefficients must be stored in the beta vector.
\n
Assume that the feature matrix $X$ is full rank (i.e., $X^TX$ is invertible).
\n Solve the logistic regression problem on a GPU. Given a feature matrix $X$ of size $n\\_samples \\times n\\_features$ and a binary target vector $y$ of size $n\\_samples$ (containing only 0s and 1s), compute the coefficient vector $\\beta$ that maximizes the log-likelihood:\n $$ \\max_{\\beta} \\sum_{i=1}^{n} \\left[ y_i \\log(p_i) + (1-y_i) \\log(1-p_i) \\right] $$\n\n where $p_i = \\sigma(X_i^T \\beta)$ and $\\sigma(z) = \\frac{1}{1 + e^{-z}}$ is the sigmoid function.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final coefficients must be stored in the beta vector
\n
The target vector y contains only binary values (0 and 1)
\n Implement Monte Carlo integration on a GPU. Given a set of function values $y_i = f(x_i)$ sampled at random points $x_i$ uniformly distributed in the interval $[a, b]$, estimate the definite integral:\n $$ \\int_a^b f(x) \\, dx \\approx (b - a) \\cdot \\frac{1}{n} \\sum_{i=1}^{n} y_i $$\n\n The Monte Carlo method approximates the integral by computing the average of the function values and multiplying by the interval width.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the result variable
\n
Solutions are tested with absolute tolerance of 1e-2 and relative tolerance of 1e-2
\n
\n\n
Example:
\n
\nInput: a = 0, b = 2, n_samples = 8\n y_samples = [0.0625, 0.25, 0.5625, 1.0, 1.5625, 2.25, 3.0625, 4.0]\nOutput: result = 3.1875\n
\n\n
Constraints
\n
\n
1 ≤ n_samples ≤ 100,000,000
-1000.0 ≤ a < b ≤ 1000.0
-10000.0 ≤ function values ≤ 10000.0
\n
The tolerance is set to 1e-2 to account for the inherent randomness in Monte Carlo methods and floating-point precision variations.
\n\n
Performance is measured with n_samples = 10,000,000
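A CUDA sketch of the estimator above: a grid-stride loop accumulates a per-thread sum of the samples, and each thread folds its contribution into the result with a single atomicAdd, pre-scaled by (b - a) / n_samples. The signature and the zero-initialised result pointer are assumptions.

__global__ void monte_carlo_kernel(const float* y_samples, float* result,
                                   int n_samples, float scale) {
    int start = blockIdx.x * blockDim.x + threadIdx.x;
    int stride = gridDim.x * blockDim.x;
    float local = 0.0f;
    for (int i = start; i < n_samples; i += stride)   // grid-stride loop
        local += y_samples[i];
    atomicAdd(result, local * scale);                 // scale = (b - a) / n_samples
}
// Host side (assumed): float scale = (b - a) / (float)n_samples; *result starts at 0.0f.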
\n Implement a GPU program that raises a square matrix $A$ of size $N \\times N$ to an integer power $P$. \n The solve function receives a flattened input matrix input (row-major order), an empty output matrix output of the same size, the dimension N, and the exponent P. \n You must compute $\\text{output} = A^{P}$ where matrix multiplication is standard dense multiplication over 32-bit floating point numbers.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted.
\n
The solve function signature must remain unchanged.
\n
The final result must be written to the output array in row-major order.
Implement a GPU program that, for N three-dimensional points stored on the device, fills indices[i] with the index j ≠ i of the point closest to points[i]. Comparing squared Euclidean distance is sufficient; you do not need to compute square roots.
\n\n
Implementation Requirements
\n
\n
The solve function signature must remain unchanged
\n
External libraries are not permitted
\n
The final result must be stored in the indices array
\n
\n\n
Example 1:
\n
Input: points = [(0,0,0), (1,0,0), (5,5,5)]
       indices = [-1, -1, -1]
       N = 3
Output: indices = [1, 0, 1]   # 0 and 1 are mutually nearest; 2 is closest to 1
\n\n
Constraints
\n
\n
1 ≤ N ≤ 100,000
\n
Coordinates are 32-bit floats in the range [-1000, 1000]
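A brute-force CUDA sketch of the nearest-neighbour search above (O(N²) total work, one thread per query point). It assumes the points are stored as interleaved x, y, z floats; the actual layout may differ.

__global__ void nearest_kernel(const float* points, int* indices, int N) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= N) return;
    float xi = points[3 * i], yi = points[3 * i + 1], zi = points[3 * i + 2];
    float best = 3.402823e38f;   // ~FLT_MAX
    int best_j = -1;
    for (int j = 0; j < N; ++j) {
        if (j == i) continue;
        float dx = points[3 * j]     - xi;
        float dy = points[3 * j + 1] - yi;
        float dz = points[3 * j + 2] - zi;
        float d2 = dx * dx + dy * dy + dz * dz;   // squared distance is sufficient
        if (d2 < best) { best = d2; best_j = j; }
    }
    indices[i] = best_j;
}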
\n Implement batch normalization forward pass for 2D input tensors. Given an input tensor of shape [N, C] where N is the batch size and C is the number of features, compute the normalized output using learnable scale (gamma) and shift (beta) parameters.\n
\n Implement a 2D max pooling operation for image/feature map downsampling.\n The program should take an input tensor and produce an output tensor by applying max pooling with specified kernel size, stride, and padding.\n
\n\n\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in tensor output
\n
\n\n
Max Pooling Operation
\n
\n For each output position (n, c, h_out, w_out), compute the maximum value over the corresponding input window:\n \n output[n, c, h_out, w_out] = max(input[n, c, h:h+kernel_size, w:w+kernel_size])\n \n where h = h_out * stride and w = w_out * stride\n
Write a GPU program that counts the number of elements equal to the integer value k in an array of 32-bit integers. You are given an input array input of length N and an integer k.
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
\n
\n\n
Example 1:
\n
\nInput: [1, 2, 3, 4, 1], k = 1\nOutput: 2\n
\n\n
Example 2:
\n
\nInput: [5, 10, 5, 2], k = 11\nOutput: 0\n
\n\n
Constraints
\n
\n
1 ≤ N ≤ 100,000,000
\n
1 ≤ input[i], k ≤ 100,000
\n\n
Performance is measured with K = 501,010, N = 100,000,000
Write a GPU program that counts the number of elements equal to the integer value k in a 2D array of 32-bit integers. You are given an input 2D array input of dimensions N x M and an integer k.
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
Write a GPU program that counts the number of elements equal to the integer value p in a 3D array of 32-bit integers. You are given an input 3D array input of dimensions N x M x K and an integer p.
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
\n
\n\n
Example 1:
\n
\nInput: input [[[1, 2, 3],\n [4, 5, 1]],\n [[1, 1, 1],\n [2, 2, 2]]]\n N = 2, M = 2, K = 3\n p = 1\nOutput: output = 5\n
\n\n
Example 2:
\n
\nInput: input [[[5, 10],\n [5, 2],\n [2, 2]]]\n N = 1, M = 3, K = 2\n p = 1\nOutput: output = 0\n
\n\n
Constraints
\n
\n
1 ≤ N, M, K ≤ 1,000
\n
1 ≤ input[i], p ≤ 100
\n\n
Performance is measured with K = 500, M = 500, N = 500
Implement a program that computes the sum of a subarray of 32-bit integers. You are given an input array input of length N, and two indices S and E. S and E are inclusive, 0-based start and end indices; compute the sum of input[S..E].
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
\n
\n\n
Example 1:
\n
\nInput: input = [1, 2, 1, 3, 4], S = 1, E = 3\nOutput: output = 6\n
\n\n
Example 2:
\n
\nInput: input = [1, 2, 3, 4], S = 0, E = 3\nOutput: output = 10\n
Implement a program that computes the sum of a 2D subarray of 32-bit integers. You are given an input 2D array input of dimensions N x M, two row indices S_ROW and E_ROW, and two column indices S_COL and E_COL. S_ROW, E_ROW, S_COL, and E_COL are inclusive, 0-based start and end indices; compute the sum of input[S_ROW..E_ROW][S_COL..E_COL].
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
Implement a program that computes the sum of a 3D subarray of 32-bit integers. You are given an input 3D array input of dimensions N x M x K, two depth indices S_DEP and E_DEP, two row indices S_ROW and E_ROW, and two column indices S_COL and E_COL. All six are inclusive, 0-based start and end indices; compute the sum of input[S_DEP..E_DEP][S_ROW..E_ROW][S_COL..E_COL].
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
\n Write a GPU program that performs parallel reduction on an array of 32-bit floating point numbers to compute their sum.\n The program should take an input array and produce a single output value containing the sum of all elements.\n
\n\n
Implementation Requirements
\n
\n
Use only GPU native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
\n Implement RMS Normalization forward pass for 1D input vectors. Given an input tensor of shape [N] where N is the number of elements, compute the normalized output using a scalar scale (gamma) and shift (beta) parameter.\n
\n Implement a program that computes the maximum sum of any contiguous subarray of length exactly window_size. You are given an array input of length N consisting of 32-bit signed integers, and an integer window_size.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output variable
Implement Attention with Linear Biases (ALiBi), following the method described in "Train Short, Test Long: Attention with Linear Biases Enables Input Length Extrapolation", for a given set of matrices. Given the query matrix Q of size M×d, key matrix K of size N×d, and value matrix V of size N×d, your program should compute the output matrix using the formula:
\n where α is a slope controlling the linear bias and Δ = i - j represents the relative position between query i and key j.\n The softmax function is applied row-wise. Q, K, V, output, and α are all of data type float32;\n M, N, d are of data type int32.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The\n solve function signature must remain unchanged\n
\n
The final result must be stored in the output matrix\n output\n
\n Implement a batched matrix multiplication in FP16. Given a batch of matrices A of shape [B, M, K] and a batch of matrices B of shape [B, K, N], compute the output batch C of shape [B, M, N] such that for each batch index b:\n $$\n C_b = A_b \\times B_b\n $$\n All matrices are stored in row-major order and use 16-bit floating point numbers (FP16/half). Accumulation during multiplication should use FP32 for better precision before converting the final result to FP16.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
Accumulation during multiplication should use FP32 for better precision before converting the final result to FP16
\n
The final result must be stored in the C array as half
\n Implement a GPU program that computes the dot product of two vectors containing 16-bit floating point numbers (FP16/half).\n The dot product is the sum of the products of the corresponding elements of two vectors.\n
\n
\n Mathematically, the dot product of two vectors $A$ and $B$ of length $n$ is defined as:\n $$\n A \\cdot B = \\sum_{i=0}^{n-1} A_i \\cdot B_i = A_0 \\cdot B_0 + A_1 \\cdot B_1 + \\ldots + A_{n-1} \\cdot B_{n-1}\n $$\n
\n
\n All inputs are stored as 16-bit floating point numbers (FP16/half). For best precision, accumulation during multiplication should use FP32 before converting the final result to FP16.\n
\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
Accumulation during multiplication should use FP32 for better precision before converting the final result to FP16
\n
The final result must be stored in the output variable as half
\n
\n
Example 1:
\n
Input: A = [1.0, 2.0, 3.0, 4.0]\n B = [5.0, 6.0, 7.0, 8.0]\n Output: result = 70.0 (1.0*5.0 + 2.0*6.0 + 3.0*7.0 + 4.0*8.0)
\n
Example 2:
\n
Input: A = [0.5, 1.5, 2.5]\n B = [2.0, 3.0, 4.0]\n Output: result = 15.5 (0.5*2.0 + 1.5*3.0 + 2.5*4.0)
\n Write a program that computes the softmax function for an array of 32-bit floating-point numbers on a GPU. The softmax function is defined as follows:\n
\n\n
\n For an input array $x$ of length $n$, the softmax of $x$, denoted $\\sigma(x)$, is an array of length $n$ where the $i$-th element is:\n
\n Your solution should handle potential overflow issues by using the \"max trick\". Subtract the maximum value of the input array from each element before exponentiation.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the array output
\n Write a GPU program that implements top-p (nucleus) sampling for LLM inference.\n
\n\n
\n Top-p sampling is a text generation technique where you sample from the smallest set of tokens whose cumulative probability exceeds threshold p.\n This balances randomness and quality better than pure top-k or greedy sampling.\n
\n\n
\n Given logits (unnormalized scores) from a language model:\n
\n
Convert logits to probabilities using softmax
\n
Sort tokens by probability (descending)
\n
Find the smallest set where cumulative probability ≥ p (the "nucleus")
\n
Renormalize the nucleus probabilities to sum to 1
\n
Sample a token from the nucleus using the provided random seed
\n \n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
Ensure numerical stability when computing softmax
\n
\n\n
Example 1:
\n
\nInput:\n logits = [1.0, 2.0, 3.0, 0.5]\n p = 0.9\n seed = 42\n\nOutput:\n sampled_token = 2 or 1\n (tokens with highest probabilities, sampled randomly)\n
\n\n
Example 2:
\n
\nInput:\n logits = [10.0, 1.0, 1.0]\n p = 0.5\n seed = 123\n\nOutput:\n sampled_token = 0\n (single token dominates the probability mass)\n
\n Implement a GPU program that computes the Rotary Positional Embedding (RoPE) for a batch of query vectors.\n RoPE is a method for encoding positional information in transformer models by rotating the query and key vectors using precomputed cosine and sine components.\n
\n
\n Mathematically, given a query vector $x$ and corresponding cosine and sine vectors, the operation is defined as:\n $$\n \\text{RoPE}(x) = x \\odot \\cos + \\text{rotate\\_half}(x) \\odot \\sin\n $$\n
\n
\n Where $\\odot$ denotes element-wise multiplication. The $\\text{rotate\\_half}(x)$ operation swaps the first and second halves of the vector and negates the first half. For a vector of dimension $d$:\n $$\n \\text{rotate\\_half}([x_1, \\dots, x_{d/2}, x_{d/2+1}, \\dots, x_d]) = [-x_{d/2+1}, \\dots, -x_d, x_1, \\dots, x_{d/2}]\n $$\n
\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The input tensors Q, cos, and sin have shape (M, D), where M is the number of tokens and D is the head dimension
\n
D (head dimension) is guaranteed to be an even number
\n
The final result must be stored in the output variable with the same shape (M, D)
\n
\n
Example 1:
\n
Input: Q = [[1.0, 2.0, 3.0, 4.0],\n [1.0, 1.0, 1.0, 1.0]]\n Cos = [[1.0, 1.0, 1.0, 1.0],\n [0.0, 0.0, 0.0, 0.0]]\n Sin = [[0.0, 0.0, 0.0, 0.0],\n [1.0, 1.0, 1.0, 1.0]]\nOutput: result = [[1.0, 2.0, 3.0, 4.0],\n [-1.0, -1.0, 1.0, 1.0]]\n (Row 0 is identity via Cos; Row 1 is rotated via Sin)
\n
Constraints
\n
\n
Q, cos, and sin have identical dimensions
\n
D % 2 == 0
\n
1 ≤ M, D ≤ 10,000
\n\n
Performance is measured with D = 128, M = 1,048,576
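A CUDA sketch of the rotation above, one thread per element of the (M, D) output; the rotate_half term is reconstructed on the fly from the definition given. Parameter names are assumptions.

__global__ void rope_kernel(const float* Q, const float* cos_t, const float* sin_t,
                            float* output, int M, int D) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= M * D) return;
    int m = idx / D, d = idx % D, half = D / 2;
    const float* q = Q + m * D;
    // rotate_half(x)[d] = -x[d + D/2] for the first half, x[d - D/2] for the second half
    float rot = (d < half) ? -q[d + half] : q[d - half];
    output[idx] = q[d] * cos_t[idx] + rot * sin_t[idx];
}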
\n Implement a GPU program that \"dequantizes\" a weight matrix on the GPU. You are given an input matrix X of shape [M, N] containing quantized values and a scale matrix S of shape [ceil(M/T), ceil(N/T)], where T is the tile size.\n
\n
\n For each element $X_{i,j}$, the corresponding scale factor is $S_{row, col}$ where $row = \\lfloor i / T \\rfloor$ and $col = \\lfloor j / T \\rfloor$.\n The output $Y_{i,j}$ should be computed as:\n $$\n Y_{i,j} = X_{i,j} \\times S_{row, col}\n $$\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the output buffer Y
\n
\n\n
Example 1:
\n
\nInput:\nM = 4, N = 4, TILE_SIZE = 2\nX = [\n [10, 10, 5, 5],\n [10, 10, 5, 5],\n [ 2, 2, 8, 8],\n [ 2, 2, 8, 8]\n]\nS = [\n [0.5, 2.0],\n [4.0, 0.25]\n]\n\nOutput:\nY = [\n [ 5.0, 5.0, 10.0, 10.0],\n [ 5.0, 5.0, 10.0, 10.0],\n [ 8.0, 8.0, 2.0, 2.0],\n [ 8.0, 8.0, 2.0, 2.0]\n]\nExplanation:\nTile (0,0) of X is multiplied by S[0,0] (0.5).\nTile (0,1) of X is multiplied by S[0,1] (2.0).\nTile (1,0) is multiplied by S[1,0] (4.0).\nTile (1,1) is multiplied by S[1,1] (0.25).\n
\n\n
Constraints
\n
\n
1 ≤ M, N ≤ 8192
\n
TILE_SIZE ∈ {16, 32, 64, 128}
\n\n
Performance is measured with M = 8,192, N = 8,192, TILE_SIZE = 128
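A CUDA sketch of the tile-wise dequantization above. The statement does not fix the storage type of the quantized matrix X, so it is read here as float; the point of the example is the indexing into the per-tile scale matrix S.

__global__ void dequantize_kernel(const float* X, const float* S, float* Y,
                                  int M, int N, int T) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= M * N) return;
    int i = idx / N, j = idx % N;
    int scale_cols = (N + T - 1) / T;                     // ceil(N / T) columns of S
    Y[idx] = X[idx] * S[(i / T) * scale_cols + (j / T)];  // per-tile scale factor
}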
\n Implement a GPU program that performs Top-K Gating for Mixture of Experts (MoE) models. Given a logit matrix of shape [M, E] where M is the number of tokens and E is the number of experts, identify the k largest values in each row, extract their indices, and apply softmax to get mixing weights.\n
\n\n
\n For each row i, the operation computes:\n $$\n \\begin{align}\n \\text{indices}_i, \\text{vals}_i &= \\text{TopK}(\\text{logits}_i, k) \\\\\n \\text{vals}_i &= \\text{logits}_i[\\text{indices}_i] \\\\\n \\text{weights}_i &= \\text{Softmax}(\\text{vals}_i)\n \\end{align}\n $$\n
\n\n
\n The selected experts must remain ordered by descending logit value, matching the order returned by\n topk. The topk_weights array must correspond positionally to\n topk_indices in that same order.\n
\n\n
Implementation Requirements
\n
\n
External libraries are not permitted
\n
The solve function signature must remain unchanged
\n
The final result must be stored in the topk_weights and topk_indices arrays
\n
\n\n
Example 1:
\n
\nInput:\n logits = [[1.0, 2.0, 3.0, 4.0],\n [4.0, 3.0, 2.0, 1.0]]\n M = 2, E = 4, k = 2\n\nOutput:\n topk_weights = [[0.7311, 0.2689],\n [0.7311, 0.2689]]\n topk_indices = [[3, 2],\n [0, 1]]\n\nExplanation:\nRow 0: Top-2 values are 4.0 and 3.0 at indices 3 and 2.\n Softmax([4.0, 3.0]) = [0.7311, 0.2689]\nRow 1: Top-2 values are 4.0 and 3.0 at indices 0 and 1.\n Softmax([4.0, 3.0]) = [0.7311, 0.2689]\n
\n\n
Constraints
\n
\n
1 ≤ M ≤ 10,000 (number of tokens)
1 ≤ E ≤ 256 (number of experts)
1 ≤ k ≤ E (top-k selection, typically k=2)
\n Given a 2D grid of 32-bit floating point values, apply one iteration of the 5-point Jacobi stencil:\n each interior cell of the output is set to the average of its four cardinal neighbors (top, bottom,\n left, right) from the input grid. Boundary cells (first/last row and column) are copied unchanged\n from the input to the output.\n
\n\n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in output
\n
Read exclusively from input and write exclusively to output (do not update input)
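A CUDA sketch of one Jacobi iteration as specified above, reading only from input and writing only to output; the grid is assumed to be rows x cols in row-major order.

__global__ void jacobi_kernel(const float* input, float* output, int rows, int cols) {
    int r = blockIdx.y * blockDim.y + threadIdx.y;
    int c = blockIdx.x * blockDim.x + threadIdx.x;
    if (r >= rows || c >= cols) return;
    int idx = r * cols + c;
    if (r == 0 || r == rows - 1 || c == 0 || c == cols - 1) {
        output[idx] = input[idx];   // boundary cells are copied unchanged
    } else {
        output[idx] = 0.25f * (input[idx - cols] + input[idx + cols] +
                               input[idx - 1]    + input[idx + 1]);
    }
}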
Implement a GPU program that computes the softmax attention operation for a given set of matrices. Given the query matrix Q of size M×d, key matrix K of size N×d, and value matrix V of size N×d, your program should compute the output matrix using the formula: $$\\text{Attention}(Q, K, V) = \\text{softmax}\\Bigl( \\frac{QK^T}{\\sqrt{d}} \\Bigr)V,$$ where the softmax function is applied row-wise.
\n
Implementation Requirements
\n
\n
Use only GPU native features (external libraries are not permitted)
\n
The\n solve function signature must remain unchanged\n
\n
The final result must be stored in the output matrix\n output\n
\n Given an array of N 32-bit floating point values and an integer array\n flags of the same length, where flags[i] = 1 marks the start of a new\n segment and flags[i] = 0 continues the current segment, compute the\n exclusive prefix sum within each segment and store the result in\n output. The first element is always a segment start\n (flags[0] = 1). Within each segment, output[i] equals the sum of all\n values elements in the same segment that appear before index i, so the\n first element of every segment is always 0.0.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n Given two sorted arrays A of length M and B of length\n N, both containing 32-bit floating-point values in non-decreasing order, produce a\n single sorted array C of length M + N containing all elements of\n A and B in non-decreasing order.\n
\n\n
Implementation Requirements
\n
\n
Use only GPU native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final merged result must be stored in C
\n
\n\n
Example
\n
\nInput:\n A = [1.0, 3.0, 5.0, 7.0], M = 4\n B = [2.0, 4.0, 6.0, 8.0], N = 4\n\nOutput:\n C = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]\n
\n\n
\nInput:\n A = [-1.0, 1.0, 3.0], M = 3\n B = [2.0], N = 1\n\nOutput:\n C = [-1.0, 1.0, 2.0, 3.0]\n
\n\n
Constraints
\n
\n
1 ≤ M, N ≤ 50,000,000
\n
M + N ≤ 50,000,000
\n
Both A and B are sorted in non-decreasing order
\n
Elements are 32-bit floats
\n
Performance is measured with M = 25,000,000, N = 25,000,000
\n Given a 1D array A of N 32-bit floating point numbers, compact all\n positive elements (A[i] > 0) to the front of the output array out,\n preserving their original relative order. Fill any remaining positions with 0.0.\n Stream compaction is a fundamental GPU primitive used throughout rendering, sparse computation,\n and collision detection.\n
\n\n
Implementation Requirements
\n
\n
Use only native GPU features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
\n The first k positions of out must contain the k elements of\n A where A[i] > 0, in their original order\n
\n
Positions k through N−1 of out must be 0.0
\n
Elements where A[i] = 0.0 are not selected
\n
\n\n
Example
\n
\nInput: A = [1.0, -2.0, 3.0, 0.0, -1.0, 4.0]\nOutput: out = [1.0, 3.0, 4.0, 0.0, 0.0, 0.0]\n
\n\n
Constraints
\n
\n
1 ≤ N ≤ 100,000,000
\n
−1000.0 ≤ A[i] ≤ 1000.0
\n
out is pre-allocated with N elements, initialised to 0.0
\n Implement a GPU program that multiplies a sparse matrix A of dimensions M × N\n by a dense matrix B of dimensions N × K, producing a dense output matrix\n C of dimensions M × K.\n All matrices are stored in row-major order using 32-bit floats.\n The matrix A is approximately 60–70% sparse (i.e., 60–70% of elements are zero),\n and nnz gives the number of non-zero elements in A.\n
\n\n
\n Mathematically, the operation is defined as:\n $$\n C_{ij} = \\sum_{k=0}^{N-1} A_{ik} \\cdot B_{kj} \\quad \\text{for} \\quad i = 0, \\ldots, M-1,\\; j = 0, \\ldots, K-1\n $$\n
\n\n
Implementation Requirements
\n
\n
Use only GPU native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\nRun batched autoregressive inference for a 10-parameter transformer that adds two 10-digit\nnumbers. Given prompts of shape [batch_size, 31] (int32) and a 10-float weight\nbuffer, write output logits of shape [batch_size, 11, 10] — one logit\nrow per decode step over the 10-digit vocabulary (0–9). All tensors are float32 except\nthe int32 prompts.\n
\n\n
\nThe model comes from the\nAdderBoard\ncompetition for the smallest autoregressive transformer that adds 10-digit numbers at\n≥99% accuracy. It encodes carry propagation in 10 learned parameters via RoPE geometry,\ntied embeddings, and SwiGLU gating.\n
\n\n\n\n
Model Architecture
\n\n
Single-layer pre-norm transformer. Hidden dim 2, 1 head, head dim 2, vocab 10 (digits\n0–9), tied input/output embeddings.
\n\n
Each step runs the full sequence [batch_size, seq_len, 2] through:
where a_rev and b_rev are the digits in least-significant-first order,\nzero-padded to 10 digits. The model then generates 11 output tokens (digits of the sum, also\nleast-significant-first).
\n\n
Implementation Requirements
\n
\n
Implement solve(prompts, output, weights, batch_size) with the exact signature shown (JAX exception: solve(prompts, weights, batch_size) returns the output tensor directly)
\n
Do not use any external libraries beyond what the framework provides
\n
The function must write logits into the output buffer (except JAX, which returns it)
\n Compute the 2D Discrete Fourier Transform (2D DFT) of a complex-valued signal stored on the GPU.\n Given a 2D complex input signal of shape M × N, compute its 2D DFT spectrum\n using the row-column decomposition: apply a 1D DFT along each row, then a 1D DFT along each\n column of the result. All values are 32-bit floating point.\n
\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The final result must be stored in spectrum
\n
\n The input and output are stored as 1D arrays of interleaved real and imaginary parts in\n row-major order: element x[m, n] has its real part at index\n 2*(m*N + n) and imaginary part at index 2*(m*N + n) + 1\n
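A minimal CUDA sketch of the row pass; the column pass is the same computation with the roles of m and n swapped. Because every output element of a column depends on every input element of that column, the column pass should read from a temporary buffer (or a separate output buffer) rather than overwriting data that other threads still need.

```cuda
// 1D DFT along each row of an M x N complex signal stored interleaved
// (real at 2*(m*N + n), imaginary at 2*(m*N + n) + 1).
__global__ void dft_rows(const float* in, float* out, int M, int N) {
    const float PI = 3.14159265358979f;
    int m = blockIdx.y;                                    // row index
    int k = blockIdx.x * blockDim.x + threadIdx.x;         // output frequency within the row
    if (m >= M || k >= N) return;
    float re = 0.0f, im = 0.0f;
    for (int n = 0; n < N; ++n) {
        float xr  = in[2 * (m * N + n)];
        float xi  = in[2 * (m * N + n) + 1];
        float ang = -2.0f * PI * (float)k * (float)n / (float)N;
        float c = cosf(ang), s = sinf(ang);
        re += xr * c - xi * s;                             // complex multiply-accumulate
        im += xr * s + xi * c;
    }
    out[2 * (m * N + k)]     = re;
    out[2 * (m * N + k) + 1] = im;
}
```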
\nImplement Grouped Query Attention (GQA), the attention mechanism used in modern large language\nmodels such as LLaMA-3, Mistral, and Gemma. GQA reduces the KV-cache memory footprint during\ninference by sharing key and value heads across groups of query heads. Given query tensor\nQ with num_q_heads heads and key/value tensors K,\nV each with num_kv_heads heads, compute scaled dot-product attention\nwhere every group of num_q_heads / num_kv_heads consecutive query heads attends to\nthe same key and value head. All tensors use float32.\n
\n\n\n\n
Implementation Requirements
\n
\n
Implement the function solve(Q, K, V, output, num_q_heads, num_kv_heads, seq_len, head_dim).
\n
Do not change the function signature or use external libraries beyond the standard GPU frameworks.
\n
Write the result into the provided output buffer.
\n
num_q_heads is always divisible by num_kv_heads.
\n
Use scaled dot-product attention with scale factor 1 / sqrt(head_dim) and a softmax over the key dimension.
\n
\n\n
Example
\n
\n With num_q_heads = 4, num_kv_heads = 2 (groups of 2), seq_len = 3,\n head_dim = 4:\n
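A minimal CUDA sketch of the full (non-causal) GQA computation with one thread per (query head, query position). The layout Q[h, s, d] at offset (h * seq_len + s) * head_dim + d (and likewise for K, V, output) and the bound head_dim <= 128 for the local accumulator are assumptions, since the statement does not fix them here.

```cuda
__global__ void gqa_forward(const float* Q, const float* K, const float* V, float* output,
                            int num_q_heads, int num_kv_heads, int seq_len, int head_dim) {
    int h = blockIdx.y;                                    // query head
    int q = blockIdx.x * blockDim.x + threadIdx.x;         // query position
    if (h >= num_q_heads || q >= seq_len) return;
    int kv = h / (num_q_heads / num_kv_heads);             // key/value head shared by this group
    float scale = rsqrtf((float)head_dim);

    // Pass 1: maximum score for a numerically stable softmax.
    float m = -INFINITY;
    for (int s = 0; s < seq_len; ++s) {
        float dot = 0.0f;
        for (int d = 0; d < head_dim; ++d)
            dot += Q[(h * seq_len + q) * head_dim + d] * K[(kv * seq_len + s) * head_dim + d];
        m = fmaxf(m, dot * scale);
    }

    // Pass 2: softmax-weighted sum of the shared value head.
    float acc[128];
    for (int d = 0; d < head_dim; ++d) acc[d] = 0.0f;
    float denom = 0.0f;
    for (int s = 0; s < seq_len; ++s) {
        float dot = 0.0f;
        for (int d = 0; d < head_dim; ++d)
            dot += Q[(h * seq_len + q) * head_dim + d] * K[(kv * seq_len + s) * head_dim + d];
        float w = expf(dot * scale - m);
        denom += w;
        for (int d = 0; d < head_dim; ++d)
            acc[d] += w * V[(kv * seq_len + s) * head_dim + d];
    }
    for (int d = 0; d < head_dim; ++d)
        output[(h * seq_len + q) * head_dim + d] = acc[d] / denom;
}
```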
\n Implement a weight-only INT4 quantized matrix multiplication (W4A16), a core kernel used in\n modern LLM inference. Given a float16 activation matrix x of shape\n M × K and a weight matrix stored in packed INT4 format, compute the output\n matrix y = x × W^T of shape M × N, where\n W is the dequantized float16 weight matrix of shape N × K.\n
\n\n
\n Packing format: Each byte of w_q stores two INT4 weights. The\n high nibble (bits 7–4) holds weight w[n, 2i] and the low nibble (bits\n 3–0) holds w[n, 2i+1]. INT4 values are stored unsigned in the range\n [0, 15] with an offset of 8, so the signed weight is nibble − 8,\n giving values in [−8, 7].\n
\n\n
\n Dequantization: Weights are dequantized group-wise. Each contiguous block of\n group_size weights along the K dimension shares one float16 scale:\n
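A minimal CUDA sketch with one thread per output element. Because the group-wise dequantization formula is not reproduced above, this assumes scales are stored row-major as scales[n, k / group_size] (shape N × K/group_size) and that the dequantized weight is scale × (nibble − 8); treat both as assumptions.

```cuda
#include <cuda_fp16.h>

// y[m, n] = sum_k x[m, k] * dequant(W[n, k]); two INT4 weights per byte of w_q.
__global__ void w4a16_gemm(const half* x, const unsigned char* w_q, const half* scales,
                           half* y, int M, int N, int K, int group_size) {
    int m = blockIdx.y * blockDim.y + threadIdx.y;
    int n = blockIdx.x * blockDim.x + threadIdx.x;
    if (m >= M || n >= N) return;
    float acc = 0.0f;
    for (int k = 0; k < K; ++k) {
        unsigned char packed = w_q[n * (K / 2) + k / 2];
        int nibble = (k % 2 == 0) ? (packed >> 4) : (packed & 0x0F);   // high nibble holds w[n, 2i]
        float scale = __half2float(scales[n * (K / group_size) + k / group_size]);
        float w = (float)(nibble - 8) * scale;                         // unsigned nibble -> signed weight
        acc += __half2float(x[m * K + k]) * w;
    }
    y[m * N + n] = __float2half(acc);
}
```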
\n Given two matrices a and x, each of shape [B, L] (batch size × sequence length),\n compute the linear recurrence h of shape [B, L] defined by:\n h[b, 0] = x[b, 0] and h[b, t] = a[b, t] × h[b, t−1] + x[b, t] for t ≥ 1.\n All values are float32. This operation is the core computational primitive of\n State Space Models (SSMs) such as Mamba, S4, and H3.\n
\n\n\n\n
Implementation Requirements
\n
\n
Use only native features (external libraries are not permitted)
\n
The solve function signature must remain unchanged
\n
The result must be stored in the output tensor h
\n
\n\n
Examples
\n\n
Example 1 — exponential decay (a = 0.5, single impulse):
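A minimal CUDA sketch (parameter names assumed): the recurrence is inherently sequential in t, so one thread handles each batch row and walks it left to right. For very long sequences a parallel associative scan over the pairs (a, x) would expose more parallelism.

```cuda
// h[b, 0] = x[b, 0]; h[b, t] = a[b, t] * h[b, t-1] + x[b, t] for t >= 1.
__global__ void linear_recurrence(const float* a, const float* x, float* h, int B, int L) {
    int b = blockIdx.x * blockDim.x + threadIdx.x;
    if (b >= B) return;
    float prev = x[b * L];                // h[b, 0]
    h[b * L] = prev;
    for (int t = 1; t < L; ++t) {
        prev = a[b * L + t] * prev + x[b * L + t];
        h[b * L + t] = prev;
    }
}
```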
\n Implement the SwiGLU MLP block — the feedforward network used in LLaMA, Mistral, Gemma, and most\n modern large language models. Given an input matrix x of shape\n [M, d_model] and three weight matrices W_gate, W_up\n (each [d_model, d_ffn]), and W_down ([d_ffn, d_model]),\n compute:\n output = (SiLU(x × W_gate) ⊙ (x × W_up)) × W_down,\n where SiLU(z) = z × sigmoid(z) and ⊙ denotes element-wise\n multiplication. All tensors are float32.\n
\n\n\n\n
Implementation Requirements
\n
\n
Implement the solve function with the signature unchanged.
\n
Do not use external libraries beyond the framework provided.
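A minimal two-kernel CUDA sketch: the first kernel fuses the gate and up projections with the SiLU gating into an intermediate [M, d_ffn] buffer (assumed to be allocated by the host code), the second applies the down projection. Weight layouts follow the statement: W_gate and W_up are [d_model, d_ffn], W_down is [d_ffn, d_model], all row-major.

```cuda
// inter[m, f] = SiLU(x @ W_gate)[m, f] * (x @ W_up)[m, f]
__global__ void swiglu_gate_up(const float* x, const float* W_gate, const float* W_up,
                               float* inter, int M, int d_model, int d_ffn) {
    int m = blockIdx.y * blockDim.y + threadIdx.y;
    int f = blockIdx.x * blockDim.x + threadIdx.x;
    if (m >= M || f >= d_ffn) return;
    float g = 0.0f, u = 0.0f;
    for (int k = 0; k < d_model; ++k) {
        float xv = x[m * d_model + k];
        g += xv * W_gate[k * d_ffn + f];
        u += xv * W_up[k * d_ffn + f];
    }
    float silu = g / (1.0f + expf(-g));       // SiLU(g) = g * sigmoid(g)
    inter[m * d_ffn + f] = silu * u;
}

// output = inter @ W_down
__global__ void swiglu_down(const float* inter, const float* W_down, float* output,
                            int M, int d_ffn, int d_model) {
    int m = blockIdx.y * blockDim.y + threadIdx.y;
    int d = blockIdx.x * blockDim.x + threadIdx.x;
    if (m >= M || d >= d_model) return;
    float acc = 0.0f;
    for (int k = 0; k < d_ffn; ++k)
        acc += inter[m * d_ffn + k] * W_down[k * d_model + d];
    output[m * d_model + d] = acc;
}
```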
\n Implement a LoRA (Low-Rank Adaptation) linear layer forward pass. Given an input matrix\n x of shape batch × d_in, a base weight matrix W of\n shape d_out × d_in, a LoRA down-projection matrix A of shape\n rank × d_in, and a LoRA up-projection matrix B of shape\n d_out × rank, compute\n output = x × W^T + lora_scale × (x × A^T) × B^T.\n All tensors are float32.\n
\n\n\n\n
Implementation Requirements
\n
\n
Implement the solve function; do not change its signature.
\n
Do not use external libraries beyond those provided.
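A minimal fused CUDA sketch with one thread per output element; passing lora_scale as a kernel argument is an assumption since the exact signature is not shown here. The LoRA term recomputes x × A^T for every output column, which is wasteful; because rank is small, a real kernel would materialize x × A^T once in a separate pass.

```cuda
// output[b, o] = sum_k x[b, k] * W[o, k]
//              + lora_scale * sum_r ( sum_k x[b, k] * A[r, k] ) * B[o, r]
__global__ void lora_forward(const float* x, const float* W, const float* A, const float* B,
                             float* output, float lora_scale,
                             int batch, int d_in, int d_out, int rank) {
    int b = blockIdx.y * blockDim.y + threadIdx.y;
    int o = blockIdx.x * blockDim.x + threadIdx.x;
    if (b >= batch || o >= d_out) return;
    float acc = 0.0f;
    for (int k = 0; k < d_in; ++k)                    // base path: x @ W^T
        acc += x[b * d_in + k] * W[o * d_in + k];
    for (int r = 0; r < rank; ++r) {                  // LoRA path: lora_scale * (x @ A^T) @ B^T
        float xa = 0.0f;
        for (int k = 0; k < d_in; ++k)
            xa += x[b * d_in + k] * A[r * d_in + k];
        acc += lora_scale * xa * B[o * rank + r];
    }
    output[b * d_out + o] = acc;
}
```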
\n Implement the token verification step of speculative decoding. A draft model proposes $T$ tokens;\n the target model evaluates them in one forward pass and accepts or rejects each. Given $B$\n sequences, produce the verified output tokens. Probability tensors are float32;\n token tensors are int32.\n
\n\n
\n Notation for each sequence $b$, at each draft position $i = 0, \\ldots, T{-}1$:\n
\n
\n
$t_i = \\texttt{draft_tokens}[b, i]$ — the token proposed by the draft model
\n
$p_i(v) = \\texttt{draft_probs}[b, i, v]$ — draft model's probability for token $v$
\n
$q_i(v) = \\texttt{target_probs}[b, i, v]$ — target model's probability for token $v$
\n
$u_i = \\texttt{uniform_samples}[b, i]$ — pre-generated $U[0,1)$ sample for position $i$
\n
\n\n\n\n
\n For each sequence $b$, process positions $i = 0, 1, \\ldots, T{-}1$ left-to-right:\n
If $u_i < \\alpha_i$: accept $t_i$, continue to position $i{+}1$.
\n
If $u_i \\ge \\alpha_i$: reject, stop. Sample replacement from:\n $$\\text{adj}(v) = \\frac{\\max(0,\\; q_i(v) - p_i(v))}{\\sum_{v'} \\max(0,\\; q_i(v') - p_i(v'))}$$\n using inverse CDF with $r = \\texttt{uniform_samples}[b, T]$. If $\\text{adj}$ is all zeros, use uniform $1/V$.\n
\n
If all $T$ tokens accepted: sample a bonus token from $q_{T-1}$ using $\\texttt{uniform_samples}[b, T]$.
\n\n
\n Write results into output_tokens[b, :] (shape $[B, T{+}1]$): accepted/resampled tokens\n fill positions $0$ through the accepted count (inclusive), remaining positions are zero.\n
Do not change the function signature or use external libraries beyond the standard GPU frameworks.
\n
Write results into the provided output_tokens buffer (shape [B, T+1], int32).
\n
Memory layout is row-major: draft_probs[b, i, v] is at offset b*T*V + i*V + v.
\n
\n Inverse CDF sampling: given distribution $\\text{adj}$ (already normalized), find the\n smallest index $k$ where $\\sum_{v=0}^{k} \\text{adj}(v) \\ge r$, where\n $r = \\texttt{uniform_samples}[b, T]$. Clamp the result to $[0, V-1]$.\n
\n
\n If the adjusted distribution is all zeros (i.e., $q_i \\le p_i$ everywhere), fall back to\n the uniform distribution over $V$ tokens.\n
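A minimal CUDA sketch with one thread per sequence and serial vocabulary loops. It assumes uniform_samples has shape [B, T+1] in row-major order and uses the acceptance ratio defined above; a faster kernel would parallelize the vocabulary reductions across a warp or block.

```cuda
__global__ void verify_draft_tokens(const int* draft_tokens, const float* draft_probs,
                                    const float* target_probs, const float* uniform_samples,
                                    int* output_tokens, int B, int T, int V) {
    int b = blockIdx.x * blockDim.x + threadIdx.x;
    if (b >= B) return;
    int n_out = 0;
    bool rejected = false;
    for (int i = 0; i < T && !rejected; ++i) {
        int t = draft_tokens[b * T + i];
        const float* p = draft_probs  + (size_t)(b * T + i) * V;   // draft distribution at position i
        const float* q = target_probs + (size_t)(b * T + i) * V;   // target distribution at position i
        float alpha = (p[t] > 0.0f) ? fminf(1.0f, q[t] / p[t]) : 1.0f;
        if (uniform_samples[b * (T + 1) + i] < alpha) {
            output_tokens[b * (T + 1) + n_out++] = t;              // accept the draft token
        } else {
            rejected = true;                                        // reject and resample once
            float norm = 0.0f;
            for (int v = 0; v < V; ++v) norm += fmaxf(0.0f, q[v] - p[v]);
            float r = uniform_samples[b * (T + 1) + T];
            float cdf = 0.0f;
            int chosen = V - 1;                                     // clamp to [0, V-1]
            for (int v = 0; v < V; ++v) {
                float pv = (norm > 0.0f) ? fmaxf(0.0f, q[v] - p[v]) / norm
                                         : 1.0f / V;                // all-zero fallback: uniform
                cdf += pv;
                if (cdf >= r) { chosen = v; break; }
            }
            output_tokens[b * (T + 1) + n_out++] = chosen;
        }
    }
    if (!rejected) {                                                // all T accepted: bonus token
        const float* q_last = target_probs + (size_t)(b * T + (T - 1)) * V;
        float r = uniform_samples[b * (T + 1) + T];
        float cdf = 0.0f;
        int chosen = V - 1;
        for (int v = 0; v < V; ++v) { cdf += q_last[v]; if (cdf >= r) { chosen = v; break; } }
        output_tokens[b * (T + 1) + n_out++] = chosen;
    }
    for (int k = n_out; k < T + 1; ++k)                             // zero the remaining slots
        output_tokens[b * (T + 1) + k] = 0;
}
```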
\n Implement a causal depthwise 1D convolution over a batched sequence tensor\n x of shape (B, L, D), producing an output of the same shape.\n In a depthwise convolution, each channel d is convolved independently using its\n own kernel weight[d, :] — there is no mixing across channels.\n The convolution is causal: output position l may only depend on\n input positions 0, 1, …, l (past and present), never future positions.\n This operation is a key component of state-space models such as Mamba, where it is applied\n before the selective scan to mix local context within each feature channel.\n
\n\n\n\n
\n Formally, for each batch element b, sequence position l, and channel d:\n
\n $$y[b, l, d] = \sum_{k=0}^{K-1} \text{weight}[d, k] \cdot x[b, l - k, d]$$\n where K is the kernel length and positions l − k < 0 are treated as zero (zero-pad the left boundary).\n The tensor layout is channels-last: x[b, l, d] is stored at offset\n b × L × D + l × D + d.\n
\n\n
Implementation Requirements
\n
\n
The solve function signature must remain unchanged
\n
The result must be written into the output tensor
\n
Use only native features (external libraries are not permitted)
\n
Input positions before the start of the sequence (i.e. indices l − k < 0) must be treated as zero
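A minimal CUDA sketch with one thread per output element (b, l, d). The tap ordering assumes weight[d, 0] multiplies the current position and weight[d, k] multiplies the input k steps in the past, matching the l − k indexing used above; K denotes the kernel length (weight is [D, K]).

```cuda
__global__ void causal_depthwise_conv1d(const float* x, const float* weight, float* output,
                                        int B, int L, int D, int K) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= B * L * D) return;
    int d = idx % D;                           // channels-last layout: d is fastest
    int l = (idx / D) % L;
    int b = idx / (D * L);
    float acc = 0.0f;
    for (int k = 0; k < K; ++k) {
        int src = l - k;                       // causal: never look ahead
        if (src >= 0)                          // positions before the sequence start are zero
            acc += weight[d * K + k] * x[(b * L + src) * D + d];
    }
    output[idx] = acc;
}
```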
\n Implement decaying causal attention. Given query matrix Q, key matrix K,\n and value matrix V, each of shape seq_len × d_model, and a scalar\n decay factor gamma ∈ (0, 1], compute the unnormalized causal attention output\n where position n attends to all past positions m ≤ n with weight\n gamma^(n−m):\n
\n Unlike standard softmax attention, there is no normalization — the weights decay geometrically from\n the current position backward. This is the parallel form of the Retention mechanism (RetNet), used\n as a recurrence-friendly alternative to attention in sequence models.\n
\n\n\n\n
Implementation Requirements
\n
\n
Implement the solve function; do not change its signature.
\n
Do not use external libraries beyond those provided.
\n
Write the result into output.
\n
\n\n
Example
\n
Example 1 — with seq_len = 2, d_model = 4, gamma = 0.5:
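A minimal CUDA sketch with one thread per output element (n, d). It computes output[n] = sum over m ≤ n of gamma^(n−m) · (Q[n] · K[m]) · V[m]; following the "unnormalized" wording above it applies no softmax and no 1/sqrt(d_model) factor, so if the intended definition includes a scale, fold it into the score.

```cuda
__global__ void retention_parallel(const float* Q, const float* K, const float* V,
                                   float* output, float gamma, int seq_len, int d_model) {
    int n = blockIdx.y;                                   // current (query) position
    int d = blockIdx.x * blockDim.x + threadIdx.x;        // output feature
    if (n >= seq_len || d >= d_model) return;
    float acc = 0.0f;
    float decay = 1.0f;                                   // gamma^(n - m), starting at m = n
    for (int m = n; m >= 0; --m) {
        float score = 0.0f;
        for (int k = 0; k < d_model; ++k)
            score += Q[n * d_model + k] * K[m * d_model + k];
        acc += decay * score * V[m * d_model + d];
        decay *= gamma;                                   // one more step into the past
    }
    output[n * d_model + d] = acc;
}
```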
\n Implement the forward pass of a State Space Model (SSM) selective scan, the core operation in\n Mamba-style sequence models. Given an input sequence u, time-step parameters\n delta, state-transition matrix A, input projection B,\n output projection C, and skip-connection weights skip, compute the\n output sequence y in float32.\n
\n\n\n\n
Implementation Requirements
\n
\n Implement the function solve(u, delta, A, B, C, skip, y, batch, seq_len, d_model, d_state)\n with the signature unchanged. Do not use external libraries beyond the allowed framework.\n Write the result into the pre-allocated output tensor y.\n
\n
\n For each batch b, position t, and channel d, the computation is:\n
\n The initial hidden state $h_{b,-1,d,n} = 0$ for all $b, d, n$.\n All channels d are independent: they share the same B and C\n projections but have separate state-transition rows in A.\n
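A minimal CUDA sketch with one thread per (batch, channel) pair, scanning sequentially over t and holding the per-channel state vector in local memory (assumes d_state ≤ 64). The update below uses the common Mamba discretization, h = exp(delta·A)·h + delta·B·u and y = C·h + skip·u; since the statement's exact update equations are not reproduced here, treat the discretization and the tensor layouts as assumptions.

```cuda
__global__ void selective_scan(const float* u, const float* delta, const float* A,
                               const float* B, const float* C, const float* skip,
                               float* y, int batch, int seq_len, int d_model, int d_state) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    if (idx >= batch * d_model) return;
    int b = idx / d_model, d = idx % d_model;
    float h[64];
    for (int n = 0; n < d_state; ++n) h[n] = 0.0f;        // h[b, -1, d, :] = 0
    for (int t = 0; t < seq_len; ++t) {
        float dt = delta[(b * seq_len + t) * d_model + d];
        float ut = u[(b * seq_len + t) * d_model + d];
        float yt = 0.0f;
        for (int n = 0; n < d_state; ++n) {
            // Assumed discretization: h = exp(dt * A[d, n]) * h + dt * B[b, t, n] * u[b, t, d]
            h[n] = expf(dt * A[d * d_state + n]) * h[n]
                 + dt * B[(b * seq_len + t) * d_state + n] * ut;
            yt  += C[(b * seq_len + t) * d_state + n] * h[n];
        }
        y[(b * seq_len + t) * d_model + d] = yt + skip[d] * ut;
    }
}
```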
\nImplement decode-phase multi-head attention where the key and value caches are stored as\nint8 with per-token scale factors. This memory layout halves KV-cache bandwidth\nversus float32 and is used in production LLM serving systems such as TensorRT-LLM\nand vLLM. Given a query tensor Q for a single new token, int8 key cache\nK_int8, int8 value cache V_int8, and per-token scales\nk_scale and v_scale, dequantize the caches and compute scaled\ndot-product attention to produce output. All non-integer tensors use\nfloat32.\n
\n\n
Implementation Requirements
\n
\n
Implement the function solve(Q, K_int8, V_int8, k_scale, v_scale, output, num_heads, seq_len, head_dim).
\n
Do not change the function signature or use external libraries beyond the standard GPU frameworks.
\n
Write the result into the provided output buffer.
\n
Dequantize using per-token scales: K_float[h, s, d] = K_int8[h, s, d] × k_scale[h, s] (and analogously for V).
\n
Use scaled dot-product attention with scale factor 1 / sqrt(head_dim) and a softmax over the sequence dimension.
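A minimal CUDA sketch of the decode step with one thread per head. The layouts Q[h, d], K_int8/V_int8[h, s, d], k_scale/v_scale[h, s], output[h, d] (all row-major) and the bound head_dim ≤ 128 for the local accumulator are assumptions; keys and values are dequantized on the fly with their per-token scales.

```cuda
__global__ void int8_kv_decode_attention(const float* Q, const signed char* K_int8,
                                         const signed char* V_int8, const float* k_scale,
                                         const float* v_scale, float* output,
                                         int num_heads, int seq_len, int head_dim) {
    int h = blockIdx.x * blockDim.x + threadIdx.x;
    if (h >= num_heads) return;
    float scale = rsqrtf((float)head_dim);

    // Pass 1: maximum score for a numerically stable softmax.
    float m = -INFINITY;
    for (int s = 0; s < seq_len; ++s) {
        float dot = 0.0f;
        for (int d = 0; d < head_dim; ++d)
            dot += Q[h * head_dim + d]
                 * (float)K_int8[(h * seq_len + s) * head_dim + d] * k_scale[h * seq_len + s];
        m = fmaxf(m, dot * scale);
    }

    // Pass 2: softmax-weighted sum over the dequantized value cache.
    float acc[128];
    for (int d = 0; d < head_dim; ++d) acc[d] = 0.0f;
    float denom = 0.0f;
    for (int s = 0; s < seq_len; ++s) {
        float dot = 0.0f;
        for (int d = 0; d < head_dim; ++d)
            dot += Q[h * head_dim + d]
                 * (float)K_int8[(h * seq_len + s) * head_dim + d] * k_scale[h * seq_len + s];
        float w = expf(dot * scale - m);
        denom += w;
        for (int d = 0; d < head_dim; ++d)
            acc[d] += w * (float)V_int8[(h * seq_len + s) * head_dim + d] * v_scale[h * seq_len + s];
    }
    for (int d = 0; d < head_dim; ++d)
        output[h * head_dim + d] = acc[d] / denom;
}
```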