From 70b251d8861c69c6e64f8dfeb4a0348c3a140be6 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 9 Feb 2026 17:47:09 +0100 Subject: [PATCH 01/43] implement racing for strong branching --- cpp/src/dual_simplex/pseudo_costs.cpp | 186 +++++++++++++++++++++++--- cpp/src/linear_programming/pdlp.cu | 3 +- 2 files changed, 168 insertions(+), 21 deletions(-) diff --git a/cpp/src/dual_simplex/pseudo_costs.cpp b/cpp/src/dual_simplex/pseudo_costs.cpp index 682bdaa6f9..13a0308b92 100644 --- a/cpp/src/dual_simplex/pseudo_costs.cpp +++ b/cpp/src/dual_simplex/pseudo_costs.cpp @@ -31,10 +31,17 @@ void strong_branch_helper(i_t start, const std::vector& root_soln, const std::vector& root_vstatus, const std::vector& edge_norms, - pseudo_costs_t& pc) + pseudo_costs_t& pc, + std::vector& ds_obj_down, + std::vector& ds_obj_up, + std::vector& ds_status_down, + std::vector& ds_status_up, + std::atomic* concurrent_halt) { lp_problem_t child_problem = original_lp; + assert(concurrent_halt != nullptr && "Concurrent halt pointer cannot be nullptr"); + constexpr bool verbose = false; f_t last_log = tic(); i_t thread_id = omp_get_thread_num(); @@ -55,7 +62,7 @@ void strong_branch_helper(i_t start, child_settings.set_log(false); f_t lp_start_time = tic(); f_t elapsed_time = toc(start_time); - if (elapsed_time > settings.time_limit) { break; } + if (elapsed_time > settings.time_limit || *concurrent_halt == 1) { break; } child_settings.time_limit = std::max(0.0, settings.time_limit - elapsed_time); child_settings.iteration_limit = 200; lp_solution_t solution(original_lp.num_rows, original_lp.num_cols); @@ -89,6 +96,7 @@ void strong_branch_helper(i_t start, if (branch == 0) { pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); + ds_status_down[k] = status; if (verbose) { settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n", thread_id, @@ -100,6 +108,7 @@ void strong_branch_helper(i_t start, } } else { pc.strong_branch_up[k] = std::max(obj - root_obj, 
0.0); + ds_status_up[k] = status; if (verbose) { settings.log.printf( "Thread id %2d remaining %d variable %d branch %d obj %e change down %e change up %e " @@ -109,14 +118,18 @@ void strong_branch_helper(i_t start, j, branch, obj, - pc.strong_branch_down[k], - pc.strong_branch_up[k], + ds_obj_down[k], + ds_obj_up[k], toc(start_time)); } } - if (toc(start_time) > settings.time_limit) { break; } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { + break; + } + } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { + break; } - if (toc(start_time) > settings.time_limit) { break; } const i_t completed = pc.num_strong_branches_completed++; @@ -131,7 +144,12 @@ void strong_branch_helper(i_t start, child_problem.lower[j] = original_lp.lower[j]; child_problem.upper[j] = original_lp.upper[j]; - if (toc(start_time) > settings.time_limit) { break; } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { + if (*concurrent_halt == 1) { + std::cout << "Concurrent halt reached in Dual Simplex" << std::endl; + } + break; + } } } @@ -292,6 +310,40 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data return mps_model; } +// Merge a single strong branching result from Dual Simplex and PDLP. +// Rules: +// 1. If both found optimal -> keep DS (higher quality vertex solution) +// 2. Else if Dual Simplex found infeasible -> declare infeasible +// 3. Else if one is optimal -> keep the optimal one +// 4. Else if Dual Simplex hit iteration limit -> keep DS +// 5. 
Else if none converged -> NaN (original objective) +// Return {value, source} where source is 0 if Dual Simplex, 1 if PDLP, 2 if both +template +static std::pair merge_sb_result(f_t ds_val, + dual::status_t ds_status, + f_t pdlp_dual_obj, + bool pdlp_optimal) +{ + // Dual simplex always maintains dual feasibility, so OPTIMAL and ITERATION_LIMIT both qualify + + // Rule 1: Both optimal -> keep DS + if (ds_status == dual::status_t::OPTIMAL && pdlp_optimal) { return {ds_val, 0}; } + + // Rule 2: Dual Simplex found infeasible -> declare infeasible + if (ds_status == dual::status_t::DUAL_UNBOUNDED) { return {std::numeric_limits::infinity(), 0}; } + + // Rule 3: Only one converged -> keep that + if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {ds_val, 0}; } + if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, 1}; } + + // Rule 4: Dual Simplex hit iteration limit -> keep DS + if (ds_status == dual::status_t::ITERATION_LIMIT) { return {ds_val, 0}; } + + // Rule 5: None converged -> NaN + return {std::numeric_limits::quiet_NaN(), 2}; +} + + template void strong_branching(const user_problem_t& original_problem, const lp_problem_t& original_lp, @@ -310,10 +362,26 @@ void strong_branching(const user_problem_t& original_problem, pc.strong_branch_up.assign(fractional.size(), 0); pc.num_strong_branches_completed = 0; - if (settings.mip_batch_pdlp_strong_branching) { - settings.log.printf("Batch PDLP strong branching enabled\n"); + settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", + settings.num_threads, + fractional.size()); + + // Race both batch PDLP and parallel Dual Simplex + std::atomic concurrent_halt{0}; + + std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); + std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); + + auto pdlp_thread = std::thread([&]() { + + if (settings.mip_batch_pdlp_strong_branching == 0) + return; + + 
settings.log.printf("Racing batch PDLP and Dual Simplex for strong branching\n"); f_t start_batch = tic(); + pdlp_solver_settings_t pdlp_settings; + pdlp_settings.concurrent_halt = &concurrent_halt; // Use original_problem to create the BatchLP problem csr_matrix_t A_row(original_problem.A.m, original_problem.A.n, 0); @@ -332,7 +400,7 @@ void strong_branching(const user_problem_t& original_problem, const auto mps_model = simplex_problem_to_mps_data_model(original_problem); const auto solutions = - batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values); + batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch @@ -377,14 +445,19 @@ void strong_branching(const user_problem_t& original_problem, ? solutions.get_dual_objective_value(k + fractional.size()) : std::numeric_limits::quiet_NaN(); - pc.strong_branch_down[k] = obj_down - root_obj; - pc.strong_branch_up[k] = obj_up - root_obj; + pdlp_obj_down[k] = obj_down - root_obj; + pdlp_obj_up[k] = obj_up - root_obj; } - } else { - settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", - settings.num_threads, - fractional.size()); - f_t strong_branching_start_time = tic(); + + // Batch PDLP finished – tell Dual Simplex to stop + concurrent_halt.store(1); + }); + + std::vector ds_status_down(fractional.size(), dual::status_t::UNSET); + std::vector ds_status_up(fractional.size(), dual::status_t::UNSET); + std::vector ds_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); + std::vector ds_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); + f_t dual_simplex_strong_branching_time = tic(); #pragma omp parallel num_threads(settings.num_threads) { @@ -418,10 +491,85 @@ void strong_branching(const user_problem_t& original_problem, root_soln, root_vstatus, edge_norms, - pc); + pc, + 
ds_obj_down, + ds_obj_up, + ds_status_down, + ds_status_up, + &concurrent_halt); } } - settings.log.printf("Strong branching completed in %.2fs\n", toc(strong_branching_start_time)); + + if (settings.mip_batch_pdlp_strong_branching == 1) { + if (concurrent_halt.load() == 1) { + settings.log.printf("Batch PDLP finished before Dual Simplex\n"); + } + else { + settings.log.printf("Dual Simplex finished before Batch PDLP\n"); + } + } + + // Dual Simplex finished all subproblems – tell Batch PDLP to stop + concurrent_halt.store(1); + + pdlp_thread.join(); + + settings.log.printf("Strong branching took %.2fs\n", toc(dual_simplex_strong_branching_time)); + + + // Collect Dual Simplex statistics + i_t ds_optimal_count = 0; + i_t ds_dual_feasible_only_count = 0; + for (i_t k = 0; k < fractional.size(); k++) { + if (ds_status_down[k] == dual::status_t::OPTIMAL) ds_optimal_count++; + if (ds_status_up[k] == dual::status_t::OPTIMAL) ds_optimal_count++; + if (ds_status_down[k] == dual::status_t::ITERATION_LIMIT) ds_dual_feasible_only_count++; + if (ds_status_up[k] == dual::status_t::ITERATION_LIMIT) ds_dual_feasible_only_count++; + } + + settings.log.printf( + "Dual Simplex found %d/%d optimal solutions and %d/%d dual feasible only solutions\n", + ds_optimal_count, + fractional.size() * 2, + ds_dual_feasible_only_count, + fractional.size() * 2); + + if (settings.mip_batch_pdlp_strong_branching == 1) { + // Collect Batch PDLP statistics + i_t pdlp_optimal_count = 0; + for (i_t k = 0; k < fractional.size(); k++) { + if (!std::isnan(pdlp_obj_down[k])) pdlp_optimal_count++; + if (!std::isnan(pdlp_obj_up[k])) pdlp_optimal_count++; + } + + settings.log.printf( + "Batch PDLP found %d/%d optimal solutions\n", + pdlp_optimal_count, + fractional.size() * 2); + } + + i_t merged_from_ds = 0; + i_t merged_from_pdlp = 0; + i_t merged_nan = 0; + for (i_t k = 0; k < fractional.size(); k++) { + const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], 
pdlp_obj_down[k], !std::isnan(pdlp_obj_down[k])); + pc.strong_branch_down[k] = value_down; + if (source_down == 0) merged_from_ds++; + else if (source_down == 1) merged_from_pdlp++; + else merged_nan++; + const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], !std::isnan(pdlp_obj_up[k])); + pc.strong_branch_up[k] = value_up; + if (source_up == 0) merged_from_ds++; + else if (source_up == 1) merged_from_pdlp++; + else merged_nan++; + } + + if (settings.mip_batch_pdlp_strong_branching == 1) { + settings.log.printf( + "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN)\n", + merged_from_ds, + merged_from_pdlp, + merged_nan); } pc.update_pseudo_costs_from_strong_branching(fractional, root_soln); diff --git a/cpp/src/linear_programming/pdlp.cu b/cpp/src/linear_programming/pdlp.cu index 8a05f1b2a1..69e822a1f0 100644 --- a/cpp/src/linear_programming/pdlp.cu +++ b/cpp/src/linear_programming/pdlp.cu @@ -444,8 +444,7 @@ std::optional> pdlp_solver_t } // Check for concurrent limit - if (settings_.method == method_t::Concurrent && settings_.concurrent_halt != nullptr && - *settings_.concurrent_halt == 1) { + if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Concurrent Limit reached, returning current solution" << std::endl; From 6848df1fb82940a7f0fad7f30a401c4220eea59c Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 9 Feb 2026 17:55:22 +0100 Subject: [PATCH 02/43] race on by default --- cpp/include/cuopt/linear_programming/mip/solver_settings.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index c5c26884f5..1034af41b2 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -96,7 
+96,7 @@ class mip_solver_settings_t { i_t reduced_cost_strengthening = -1; f_t cut_change_threshold = 1e-3; f_t cut_min_orthogonality = 0.5; - i_t mip_batch_pdlp_strong_branching = 0; + i_t mip_batch_pdlp_strong_branching = 1; i_t num_gpus = 1; bool log_to_console = true; From fc7aa0468c3e54c124c7932b2468e2e06024240d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 17 Feb 2026 14:11:54 +0100 Subject: [PATCH 03/43] tmp --- .../linear_programming/cuopt/run_pdlp.cu | 2 +- compile.sh | 2 + .../initial_scaling.cu | 2 +- cpp/src/pdlp/pdhg.cu | 35 ++++- cpp/src/pdlp/pdlp.cu | 142 +++++++++++++++++- .../restart_strategy/pdlp_restart_strategy.cu | 6 +- cpp/src/pdlp/solve.cu | 3 + .../adaptive_step_size_strategy.cu | 108 +++++++++---- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 2 +- .../solver_settings/solver_settings.py | 38 +++++ .../linear_programming/data_definition.py | 5 + run_multiple.sh | 3 + test.py | 12 ++ 13 files changed, 318 insertions(+), 42 deletions(-) create mode 100755 compile.sh create mode 100755 run_multiple.sh create mode 100755 test.py diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index 229c72a49b..c3d6ad42f4 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -107,7 +107,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - settings.presolve = program.get("--presolve"); + //settings.presolve = program.get("--presolve"); return settings; } diff --git a/compile.sh b/compile.sh new file mode 100755 index 0000000000..bedf3a7506 --- /dev/null +++ b/compile.sh @@ -0,0 +1,2 @@ +./build.sh libcuopt libmps_parser --cache-tool=ccache --skip-tests-build -a -l=OFF +./build.sh cuopt cuopt_mps_parser \ No newline at end of file diff --git 
a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu index 031cd9c3b6..5a08c3bb53 100644 --- a/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu +++ b/cpp/src/pdlp/initial_scaling_strategy/initial_scaling.cu @@ -545,7 +545,7 @@ void pdlp_initial_scaling_strategy_t::scale_problem() #ifdef CUPDLP_DEBUG_MODE print("constraint_lower_bound", op_problem_scaled_.constraint_lower_bounds); print("constraint_upper_bound", op_problem_scaled_.constraint_upper_bounds); - std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds); + std::vector variable_bounds = host_copy(op_problem_scaled_.variable_bounds, stream_view_); std::vector lower_bounds; std::vector upper_bounds; for (const auto& variable_bound : variable_bounds) { diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index 9a44bd31e3..c5efdcd722 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -567,6 +567,12 @@ struct primal_reflected_major_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_major_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_major_projection"); + const f_t next = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; @@ -576,6 +582,9 @@ struct primal_reflected_major_projection_bulk_op { potential_next_primal[idx] = next_clamped; dual_slack[idx] = (next_clamped - next) / step_size; reflected_primal[idx] = f_t(2.0) * next_clamped - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN 
after primal_reflected_major_projection"); } }; @@ -599,6 +608,12 @@ struct dual_reflected_major_projection_bulk_op { const f_t current_dual = dual_solution[idx]; const f_t Ax = dual_gradient[idx]; + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_major_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_major_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_major_projection"); + cuopt_assert(!isnan(Ax), "dual_gradient is NaN in dual_reflected_major_projection"); + const f_t tmp = current_dual / step_size - Ax; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], @@ -607,6 +622,9 @@ struct dual_reflected_major_projection_bulk_op { potential_next_dual[idx] = next_dual; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_major_projection"); } }; @@ -631,12 +649,19 @@ struct primal_reflected_projection_bulk_op { const f_t obj_coef = objective_coefficients[var_idx]; const f_t aty_val = current_AtY[idx]; + cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(primal_val), "primal_solution is NaN in primal_reflected_projection"); + cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + f_t reflected = primal_val - step_size * (obj_coef - aty_val); const f_t2 bounds = variable_bounds[var_idx]; reflected = cuda::std::max(cuda::std::min(reflected, get_upper(bounds)), get_lower(bounds)); reflected_primal[idx] = f_t(2.0) * reflected - primal_val; + + cuopt_assert(!isnan(reflected_primal[idx]), + "reflected_primal is NaN after primal_reflected_projection"); } }; @@ -659,13 +684,21 @@ struct dual_reflected_projection_bulk_op { const f_t step_size = dual_step_size[batch_idx]; const f_t current_dual = 
dual_solution[idx]; - const f_t tmp = current_dual / step_size - dual_gradient[idx]; + + cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); + cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + + const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = cuda::std::max(-constraint_upper_bounds[constraint_idx], cuda::std::min(tmp, -constraint_lower_bounds[constraint_idx])); const f_t next_dual = (tmp - tmp_proj) * step_size; reflected_dual[idx] = f_t(2.0) * next_dual - current_dual; + + cuopt_assert(!isnan(reflected_dual[idx]), + "reflected_dual is NaN after dual_reflected_projection"); } }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 67e001db29..e1ab866b5b 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -34,8 +34,10 @@ #include #include +#include #include +#include #include #include @@ -1406,10 +1408,27 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, const f_t interaction, f_t* fixed_point_error) { + cuopt_assert(!isnan(norm_squared_delta_primal), "norm_squared_delta_primal must not be NaN"); + cuopt_assert(!isnan(norm_squared_delta_dual), "norm_squared_delta_dual must not be NaN"); + cuopt_assert(!isnan(primal_weight), "primal_weight must not be NaN"); + cuopt_assert(!isnan(step_size), "step_size must not be NaN"); + cuopt_assert(!isnan(interaction), "interaction must not be NaN"); + cuopt_assert(norm_squared_delta_primal >= f_t(0.0), "norm_squared_delta_primal must be >= 0"); + cuopt_assert(norm_squared_delta_dual >= f_t(0.0), "norm_squared_delta_dual must be >= 0"); + cuopt_assert(primal_weight > f_t(0.0), "primal_weight must be > 0"); + cuopt_assert(step_size > f_t(0.0), "step_size must be > 0"); + const f_t movement = norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const 
f_t computed_interaction = f_t(2.0) * interaction * step_size; + //printf("movement %lf\n", movement); + //printf("computed_interaction %lf\n", computed_interaction); + + cuopt_assert( + movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); + *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); #ifdef CUPDLP_DEBUG_MODE @@ -1790,6 +1809,68 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // Sync to make sure all previous cuSparse operations are finished before setting the // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + + // Validate reflected solutions have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_reflected_primal().data(), + pdhg_solver_.get_reflected_primal().data() + + pdhg_solver_.get_reflected_primal().size(), + is_nan_or_inf{}), + "reflected_primal contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_reflected_dual().data(), + pdhg_solver_.get_reflected_dual().data() + + pdhg_solver_.get_reflected_dual().size(), + is_nan_or_inf{}), + "reflected_dual contains NaN or Inf in compute_fixed_error"); + + // Validate primal/dual solutions have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_primal_solution().data(), + pdhg_solver_.get_primal_solution().data() + + pdhg_solver_.get_primal_solution().size(), + is_nan_or_inf{}), + "primal_solution contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_dual_solution().data(), + pdhg_solver_.get_dual_solution().data() + + pdhg_solver_.get_dual_solution().size(), + is_nan_or_inf{}), + "dual_solution contains NaN or Inf in compute_fixed_error"); + + // Validate deltas have no NaN/Inf + cuopt_assert( + 
!thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), + pdhg_solver_.get_saddle_point_state().get_delta_primal().data() + + pdhg_solver_.get_saddle_point_state().get_delta_primal().size(), + is_nan_or_inf{}), + "delta_primal contains NaN or Inf in compute_fixed_error"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), + pdhg_solver_.get_saddle_point_state().get_delta_dual().data() + + pdhg_solver_.get_saddle_point_state().get_delta_dual().size(), + is_nan_or_inf{}), + "delta_dual contains NaN or Inf in compute_fixed_error"); + + // Validate primal_weight and step_size have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + primal_weight_.data(), + primal_weight_.data() + primal_weight_.size(), + is_nan_or_inf{}), + "primal_weight_ contains NaN or Inf in compute_fixed_error"); + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + step_size_.data(), + step_size_.data() + step_size_.size(), + is_nan_or_inf{}), + "step_size_ contains NaN or Inf in compute_fixed_error"); + // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1813,6 +1894,49 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte RAFT_CUDA_TRY(cudaStreamSynchronize( stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); + + // Host-side diagnostic: copy small device arrays and verify movement + interaction >= 0 + { + const auto bs = climber_strategies_.size(); + std::vector h_nsq_dp(bs), h_nsq_dd(bs), h_pw(bs), h_ss(bs), h_inter(bs); + RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dp.data(), + step_size_strategy_.get_norm_squared_delta_primal().data(), + bs * sizeof(f_t), + 
cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dd.data(), + step_size_strategy_.get_norm_squared_delta_dual().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy( + h_pw.data(), primal_weight_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY( + cudaMemcpy(h_ss.data(), step_size_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); + RAFT_CUDA_TRY(cudaMemcpy(h_inter.data(), + step_size_strategy_.get_interaction().data(), + bs * sizeof(f_t), + cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < bs; ++i) { + const f_t movement = h_nsq_dp[i] * h_pw[i] + h_nsq_dd[i] / h_pw[i]; + const f_t comp_inter = f_t(2.0) * h_inter[i] * h_ss[i]; + if (movement + comp_inter < f_t(0.0)) { + fprintf(stderr, + "DIAGNOSTIC [%zu]: movement=%.17e comp_inter=%.17e sum=%.17e " + "norm_sq_dx=%.17e norm_sq_dy=%.17e pw=%.17e ss=%.17e interaction=%.17e\n", + i, + (double)movement, + (double)comp_inter, + (double)(movement + comp_inter), + (double)h_nsq_dp[i], + (double)h_nsq_dd[i], + (double)h_pw[i], + (double)h_ss[i], + (double)h_inter[i]); + } + cuopt_assert(movement + comp_inter >= f_t(0.0), + "Host check: movement + computed_interaction must be >= 0"); + } + } + #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif @@ -1847,9 +1971,15 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte #endif for (size_t i = 0; i < climber_strategies_.size(); ++i) { + cuopt_assert(!std::isnan(restart_strategy_.fixed_point_error_[i]), + "fixed_point_error_ must not be NaN after compute_fixed_error"); + cuopt_assert(restart_strategy_.fixed_point_error_[i] >= f_t(0.0), + "fixed_point_error_ must be >= 0 after compute_fixed_error"); if (has_restarted[i]) { restart_strategy_.initial_fixed_point_error_[i] = restart_strategy_.fixed_point_error_[i]; - has_restarted[i] = false; + cuopt_assert(!std::isnan(restart_strategy_.initial_fixed_point_error_[i]), + "initial_fixed_point_error_ must not be NaN after assignment"); + 
has_restarted[i] = false; } } } @@ -1869,6 +1999,7 @@ void pdlp_solver_t::transpose_primal_dual_to_row( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -1945,6 +2076,7 @@ void pdlp_solver_t::transpose_primal_dual_back_to_col( rmm::device_uvector dual_slack_transposed( is_dual_slack_empty ? 0 : primal_size_h_ * climber_strategies_.size(), stream_view_); + RAFT_CUBLAS_TRY(cublasSetStream(handle_ptr_->get_cublas_handle(), stream_view_)); CUBLAS_CHECK(cublasDgeam(handle_ptr_->get_cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, @@ -2632,7 +2764,7 @@ void pdlp_solver_t::compute_initial_step_size() rmm::device_uvector d_atq(n, stream_view_); std::mt19937 gen(1); - std::normal_distribution dist(0.0, 1.0); + std::normal_distribution dist(f_t(0.0), f_t(1.0)); for (int i = 0; i < m; ++i) z[i] = dist(gen); @@ -2684,7 +2816,7 @@ void pdlp_solver_t::compute_initial_step_size() vecATQ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); // z = A @ A_t_q RAFT_CUSPARSE_TRY( @@ -2697,7 +2829,7 @@ void pdlp_solver_t::compute_initial_step_size() vecZ, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view_.buffer_non_transpose.data(), - stream_view_)); + stream_view_.value())); // sigma_max_sq = dot(q, z) RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), m, @@ -2706,7 +2838,7 @@ void pdlp_solver_t::compute_initial_step_size() d_z.data(), primal_stride, sigma_max_sq.data(), - stream_view_)); + stream_view_.value())); cub::DeviceTransform::Transform( cuda::std::make_tuple(d_q.data(), d_z.data()), diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index bc12fb360f..a6304a8568 100644 --- 
a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -1995,14 +1995,14 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; auto highest_negInf_primal = thrust::find(handle_ptr_->get_thrust_policy(), - thrust::make_reverse_iterator(thrust::device_ptr(end)), - thrust::make_reverse_iterator(thrust::device_ptr(start)), + thrust::device_ptr(end), + thrust::device_ptr(start), -std::numeric_limits::infinity()); // Set ranges accordingly i_t index_start_primal = 0; i_t index_end_primal = primal_size_h_ + dual_size_h_; - if (highest_negInf_primal != thrust::make_reverse_iterator(thrust::device_ptr(start))) { + if (highest_negInf_primal != thrust::device_ptr(start)) { cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*highest_negInf_primal)) == -std::numeric_limits::infinity(), "Incorrect primal reverse iterator"); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index a2766be98a..374c9ff513 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -793,6 +793,7 @@ optimization_problem_solution_t run_batch_pdlp( // If need warm start, solve the LP alone if (primal_dual_init || primal_weight_init) { + std::cout << "Solving LP for warm start" << std::endl; pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -841,6 +842,8 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } + std::cout << "Solving batch PDLP" << std::endl; + for (int i = 0; i < max_batch_size; i += optimal_batch_size) { const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) diff --git 
a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 3c1b85aeac..47ba16a297 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,6 +28,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -80,7 +82,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -90,7 +92,7 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); RAFT_CUDA_TRY(cub::DeviceSegmentedReduce::Sum( @@ -100,10 +102,10 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_)); + stream_view_.value())); dot_product_bytes = std::max(dot_product_bytes, byte_needed); - dot_product_storage.resize(dot_product_bytes, stream_view_); + dot_product_storage.resize(dot_product_bytes, stream_view_.value()); } } @@ -143,7 +145,7 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), + <<>>(thrust::raw_pointer_cast(swap_pairs.data()), static_cast(swap_pairs.size()), make_span(interaction_), make_span(norm_squared_delta_primal_), @@ -159,9 +161,9 @@ void adaptive_step_size_strategy_t::resize_context(i_t new_size) cuopt_assert(new_size > 0, "New size must be greater than 0"); cuopt_assert(new_size < 
batch_size, "New size must be less than batch size"); - interaction_.resize(new_size, stream_view_); - norm_squared_delta_primal_.resize(new_size, stream_view_); - norm_squared_delta_dual_.resize(new_size, stream_view_); + interaction_.resize(new_size, stream_view_.value()); + norm_squared_delta_primal_.resize(new_size, stream_view_.value()); + norm_squared_delta_dual_.resize(new_size, stream_view_.value()); } template @@ -276,19 +278,19 @@ i_t adaptive_step_size_strategy_t::get_valid_step_size() const template f_t adaptive_step_size_strategy_t::get_interaction(i_t i) const { - return interaction_.element(i, stream_view_); + return interaction_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_primal(i_t i) const { - return norm_squared_delta_primal_.element(i, stream_view_); + return norm_squared_delta_primal_.element(i, stream_view_.value()); } template f_t adaptive_step_size_strategy_t::get_norm_squared_delta_dual(i_t i) const { - return norm_squared_delta_dual_.element(i, stream_view_); + return norm_squared_delta_dual_.element(i, stream_view_.value()); } template @@ -337,7 +339,7 @@ void adaptive_step_size_strategy_t::compute_step_sizes( pdhg_solver.get_saddle_point_state()); // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction - <<<1, 1, 0, stream_view_>>>(this->view(), + <<<1, 1, 0, stream_view_.value()>>>(this->view(), primal_step_size.data(), dual_step_size.data(), pdhg_solver.get_d_total_pdhg_iterations().data()); @@ -345,7 +347,27 @@ void adaptive_step_size_strategy_t::compute_step_sizes( } graph.launch(total_pdlp_iterations); // Steam sync so that next call can see modification made to host var valid_step_size - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); +} + +template +__global__ void validate_interaction_and_movement_outputs( + raft::device_span norm_squared_delta_primal, + 
raft::device_span norm_squared_delta_dual, + raft::device_span interaction) +{ + const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= norm_squared_delta_primal.size()) { return; } + cuopt_assert(!isnan(norm_squared_delta_primal[idx]), + "norm_squared_delta_primal is NaN after reduction"); + cuopt_assert(!isnan(norm_squared_delta_dual[idx]), + "norm_squared_delta_dual is NaN after reduction"); + cuopt_assert(!isnan(interaction[idx]), + "interaction is NaN after reduction"); + cuopt_assert(norm_squared_delta_primal[idx] >= f_t(0.0), + "norm_squared_delta_primal must be >= 0 after reduction"); + cuopt_assert(norm_squared_delta_dual[idx] >= f_t(0.0), + "norm_squared_delta_dual must be >= 0 after reduction"); } template @@ -382,7 +404,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // We need to make sure both dot products happens after previous operations (next_primal/dual) // Thus, we add another node in the main stream before starting the SpMVs - if (!batch_mode_) deltas_are_done_.record(stream_view_); + if (!batch_mode_) deltas_are_done_.record(stream_view_.value()); // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . 
(A @ x' - // A@x)) to save one SpMV @@ -406,7 +428,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.next_AtY, CUSPARSE_SPMV_CSR_ALG2, (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + stream_view_.value())); } else { // TODO later batch mode: handle if not all restart RAFT_CUSPARSE_TRY( @@ -420,7 +442,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cusparse_view.batch_next_AtYs, CUSPARSE_SPMM_CSR_ALG3, (f_t*)cusparse_view.buffer_transpose_batch.data(), - stream_view_)); + stream_view_.value())); } // Compute Ay' - Ay = next_Aty - current_Aty @@ -433,6 +455,31 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::minus<>{}, stream_view_.value()); + // Validate tmp_primal (A^T @ delta_y) has no NaN/Inf + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + tmp_primal.data(), + tmp_primal.data() + tmp_primal.size(), + is_nan_or_inf{}), + "tmp_primal (A^T @ delta_y) contains NaN or Inf in compute_interaction_and_movement"); + + // Validate delta_primal and delta_dual inputs have no NaN/Inf + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + current_saddle_point_state.get_delta_primal().data(), + current_saddle_point_state.get_delta_primal().data() + + current_saddle_point_state.get_delta_primal().size(), + is_nan_or_inf{}), + "delta_primal contains NaN or Inf in compute_interaction_and_movement"); + cuopt_assert( + !thrust::any_of(handle_ptr_->get_thrust_policy(), + current_saddle_point_state.get_delta_dual().data(), + current_saddle_point_state.get_delta_dual().data() + + current_saddle_point_state.get_delta_dual().size(), + is_nan_or_inf{}), + "delta_dual contains NaN or Inf in compute_interaction_and_movement"); + if (!batch_mode_) { // compute interaction (x'-x) . 
(A(y'-y)) RAFT_CUBLAS_TRY( @@ -443,7 +490,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, interaction_.data(), - stream_view_)); + stream_view_.value())); // Compute movement // compute euclidean norm squared which is @@ -453,7 +500,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - deltas_are_done_.stream_wait(stream_pool_.get_stream(0)); + // All dot products run on stream_view_ to avoid concurrent cuBLAS workspace access + // (cuBLAS uses a single internal workspace shared across all streams for the same handle) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -462,10 +510,8 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_primal().data(), primal_stride, norm_squared_delta_primal_.data(), - stream_pool_.get_stream(0))); - dot_delta_X_.record(stream_pool_.get_stream(0)); + stream_view_.value())); - deltas_are_done_.stream_wait(stream_pool_.get_stream(1)); RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_dual_size(), @@ -474,12 +520,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( current_saddle_point_state.get_delta_dual().data(), dual_stride, norm_squared_delta_dual_.data(), - stream_pool_.get_stream(1))); - dot_delta_Y_.record(stream_pool_.get_stream(1)); - - // Wait on main stream for both dot to be done before launching the next kernel - dot_delta_X_.stream_wait(stream_view_); - dot_delta_Y_.stream_wait(stream_view_); + stream_view_.value())); } else { // TODO later batch mode: remove this once you want to do per climber restart cub::DeviceSegmentedReduce::Sum( @@ -492,7 +533,7 @@ void 
adaptive_step_size_strategy_t::compute_interaction_and_movement( interaction_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -502,7 +543,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_primal_.data(), climber_strategies_.size(), primal_size_, - stream_view_); + stream_view_.value()); cub::DeviceSegmentedReduce::Sum( dot_product_storage.data(), @@ -512,7 +553,14 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( norm_squared_delta_dual_.data(), climber_strategies_.size(), dual_size_, - stream_view_); + stream_view_.value()); + + validate_interaction_and_movement_outputs + <<<1, climber_strategies_.size(), 0, stream_view_>>>( + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_), + make_span(interaction_)); + RAFT_CUDA_TRY(cudaPeekAtLastError()); } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 16e2d64957..14180b5bfd 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -21,7 +21,7 @@ template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(true) { } diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 19db315349..4ec4a9aaf2 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -207,6 +207,44 @@ def set_pdlp_warm_start_data(self, pdlp_warm_start_data): """ self.pdlp_warm_start_data = pdlp_warm_start_data + def 
set_mip_batch_pdlp_strong_branching(self, enable): + """ + Note: Only supported for MILP + + Toggle batch PDLP strong branching in the MIP solver. + + Parameters + ---------- + enable : bool + If True, enable batch PDLP strong branching (value 1). + If False, disable it (value 0). + + Examples + -------- + >>> settings.set_mip_batch_pdlp_strong_branching(True) + """ + self.set_parameter( + "mip_batch_pdlp_strong_branching", 1 if enable else 0 + ) + + def get_mip_batch_pdlp_strong_branching(self): + """ + Note: Only supported for MILP + + Get the current value of the batch PDLP strong branching setting. + + Returns + ------- + bool + True if batch PDLP strong branching is enabled, False otherwise. + + Examples + -------- + >>> settings.get_mip_batch_pdlp_strong_branching() + False + """ + return bool(self.get_parameter("mip_batch_pdlp_strong_branching")) + def set_mip_callback(self, callback, user_data): """ Note: Only supported for MILP diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 8412c745b5..59ea62089d 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -450,6 +450,11 @@ class SolverConfig(BaseModel): description="Set True to run heuristics only, False to run " "heuristics and branch and bound for MILP", ) + mip_batch_pdlp_strong_branching: Optional[int] = Field( + default=0, + description="Set 1 to enable batch PDLP strong branching " + "in the MIP solver, 0 to disable.", + ) num_cpu_threads: Optional[int] = Field( default=None, description="Set the number of CPU threads to use for branch and bound.", # noqa diff --git a/run_multiple.sh b/run_multiple.sh new file mode 100755 index 0000000000..183b25b46e --- /dev/null +++ b/run_multiple.sh @@ -0,0 +1,3 @@ +for i in {1..5}; do + python test.py +done \ No newline at 
end of file diff --git a/test.py b/test.py new file mode 100755 index 0000000000..6cb236dae2 --- /dev/null +++ b/test.py @@ -0,0 +1,12 @@ +import cuopt_mps_parser +from cuopt.linear_programming import Solve, SolverSettings + +data_model = cuopt_mps_parser.ParseMps("batch_instances/neos8.mps") + +settings = SolverSettings() +settings.set_mip_batch_pdlp_strong_branching(True) + +solution = Solve(data_model, settings) + +print(solution.get_termination_reason()) +print(solution.get_primal_objective()) From 1614bc14836050bbf30308c7c7edf140f352770e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 17 Feb 2026 17:42:10 +0100 Subject: [PATCH 04/43] fix --- cpp/src/pdlp/pdlp.cu | 89 +++++++------------ .../restart_strategy/pdlp_restart_strategy.cu | 1 - .../convergence_information.cu | 66 ++++++++------ cpp/src/pdlp/utils.cuh | 20 +++-- 4 files changed, 82 insertions(+), 94 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index e1ab866b5b..082299902d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1188,24 +1188,42 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) { + auto min_nonzero = [] __host__ __device__(f_t x) -> f_t { return x == 0 ? 
std::numeric_limits::max() : abs(x); }; - smallest = thrust::transform_reduce(rmm::exec_policy(vec.stream()), - vec.begin(), - vec.end(), - min_nonzero, - std::numeric_limits::max(), - thrust::minimum()); - - largest = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::maximum()); - - f_t sum = thrust::transform_reduce( - rmm::exec_policy(vec.stream()), vec.begin(), vec.end(), abs_op, 0.0f, thrust::plus()); - - avg = sum / vec.size(); + auto stream = vec.stream(); + auto n = static_cast(vec.size()); + + rmm::device_scalar d_smallest(stream); + rmm::device_scalar d_largest(stream); + rmm::device_scalar d_sum(stream); + + auto min_nz_iter = thrust::make_transform_iterator(vec.cbegin(), min_nonzero); + auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); + + void* d_temp = nullptr; + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 1; + cub::DeviceReduce::Reduce( + d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); + cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); + cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + + size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); + rmm::device_buffer temp_buf(max_bytes, stream); + + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); + cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + + smallest = d_smallest.value(stream); + largest = d_largest.value(stream); + avg = d_sum.value(stream) / vec.size(); }; template @@ -1895,47 +1913,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& 
has_restarte stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); - // Host-side diagnostic: copy small device arrays and verify movement + interaction >= 0 - { - const auto bs = climber_strategies_.size(); - std::vector h_nsq_dp(bs), h_nsq_dd(bs), h_pw(bs), h_ss(bs), h_inter(bs); - RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dp.data(), - step_size_strategy_.get_norm_squared_delta_primal().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(h_nsq_dd.data(), - step_size_strategy_.get_norm_squared_delta_dual().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy( - h_pw.data(), primal_weight_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY( - cudaMemcpy(h_ss.data(), step_size_.data(), bs * sizeof(f_t), cudaMemcpyDeviceToHost)); - RAFT_CUDA_TRY(cudaMemcpy(h_inter.data(), - step_size_strategy_.get_interaction().data(), - bs * sizeof(f_t), - cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < bs; ++i) { - const f_t movement = h_nsq_dp[i] * h_pw[i] + h_nsq_dd[i] / h_pw[i]; - const f_t comp_inter = f_t(2.0) * h_inter[i] * h_ss[i]; - if (movement + comp_inter < f_t(0.0)) { - fprintf(stderr, - "DIAGNOSTIC [%zu]: movement=%.17e comp_inter=%.17e sum=%.17e " - "norm_sq_dx=%.17e norm_sq_dy=%.17e pw=%.17e ss=%.17e interaction=%.17e\n", - i, - (double)movement, - (double)comp_inter, - (double)(movement + comp_inter), - (double)h_nsq_dp[i], - (double)h_nsq_dd[i], - (double)h_pw[i], - (double)h_ss[i], - (double)h_inter[i]); - } - cuopt_assert(movement + comp_inter >= f_t(0.0), - "Host check: movement + computed_interaction must be >= 0"); - } - } #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index a6304a8568..b2ed166a2d 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ 
b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -38,7 +38,6 @@ #include #include #include -#include #include diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 97e0e9c0e9..eec078f8d7 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -395,28 +395,28 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * b_i) - thrust::device_ptr result_ptr(linf_primal_residual_.data()); - const f_t neutral = f_t(0.0); - if (settings.save_best_primal_so_far) { const i_t zero_int = 0; nb_violated_constraints_.set_value_async(zero_int, handle_ptr_->get_stream()); - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); - } else { - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), - thrust::make_zip_iterator(primal_residual_.cend(), combined_bounds.cend()), - relative_residual_t{settings.tolerances.relative_primal_tolerance}, - neutral, - thrust::maximum()); } + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(primal_residual_.cbegin(), combined_bounds.cbegin()), + relative_residual_t{settings.tolerances.relative_primal_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_); + 
rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_); } compute_dual_residual(op_problem_cusparse_view_, @@ -458,16 +458,26 @@ void convergence_information_t::compute_convergence_information( "Batch mode not supported for per_constraint_residual"); // Compute the linf of (residual_i - rel * c_i) - thrust::device_ptr result_ptr(linf_dual_residual_.data()); - const f_t neutral = f_t(0.0); - - *result_ptr = thrust::transform_reduce( - handle_ptr_->get_thrust_policy(), - thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), - thrust::make_zip_iterator(dual_residual_.cend(), objective_coefficients.cend()), - relative_residual_t{settings.tolerances.relative_dual_tolerance}, - neutral, - thrust::maximum()); + { + auto transform_iter = thrust::make_transform_iterator( + thrust::make_zip_iterator(dual_residual_.cbegin(), objective_coefficients.cbegin()), + relative_residual_t{settings.tolerances.relative_dual_tolerance}); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + cub::DeviceReduce::Max(d_temp_storage, + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); + cub::DeviceReduce::Max(temp_buf.data(), + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); + } } const auto [grid_size, block_size] = kernel_config_from_batch_size(climber_strategies_.size()); diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index d48ae21c1a..9150ab8c51 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -604,15 +604,17 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, f_t* result, raft::handle_t const* handle_ptr) { - const f_t neutral = f_t(0.0); - 
thrust::device_ptr result_ptr(result); - - *result_ptr = thrust::transform_reduce(handle_ptr->get_thrust_policy(), - input_vector.data(), - input_vector.data() + input_vector.size(), - abs_t{}, - neutral, - thrust::maximum()); + auto stream = handle_ptr->get_stream(); + auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); + auto n = static_cast(input_vector.size()); + + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceReduce::Max( + d_temp, temp_bytes, abs_iter, result, n, stream); + rmm::device_buffer temp_buf(temp_bytes, stream); + cub::DeviceReduce::Max( + temp_buf.data(), temp_bytes, abs_iter, result, n, stream); } template From 4f3353191f6178ce02ffb3449daa9891ee8eb38e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 13:18:41 +0100 Subject: [PATCH 05/43] fix --- .../adaptive_step_size_strategy.cu | 25 ---- cpp/src/pdlp/utilities/ping_pong_graph.cu | 123 ++++++++++++++++++ cpp/src/pdlp/utilities/ping_pong_graph.cuh | 87 ++----------- 3 files changed, 137 insertions(+), 98 deletions(-) create mode 100644 cpp/src/pdlp/utilities/ping_pong_graph.cu diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 47ba16a297..32e21cfbf6 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -455,31 +455,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( cuda::std::minus<>{}, stream_view_.value()); - // Validate tmp_primal (A^T @ delta_y) has no NaN/Inf - RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - tmp_primal.data(), - tmp_primal.data() + tmp_primal.size(), - is_nan_or_inf{}), - "tmp_primal (A^T @ delta_y) contains NaN or Inf in compute_interaction_and_movement"); - - // Validate delta_primal and delta_dual inputs have no NaN/Inf - 
cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - current_saddle_point_state.get_delta_primal().data(), - current_saddle_point_state.get_delta_primal().data() + - current_saddle_point_state.get_delta_primal().size(), - is_nan_or_inf{}), - "delta_primal contains NaN or Inf in compute_interaction_and_movement"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - current_saddle_point_state.get_delta_dual().data(), - current_saddle_point_state.get_delta_dual().data() + - current_saddle_point_state.get_delta_dual().size(), - is_nan_or_inf{}), - "delta_dual contains NaN or Inf in compute_interaction_and_movement"); - if (!batch_mode_) { // compute interaction (x'-x) . (A(y'-y)) RAFT_CUBLAS_TRY( diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu new file mode 100644 index 0000000000..08045b47a1 --- /dev/null +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -0,0 +1,123 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#include + +#include + +#include + +namespace cuopt::linear_programming::detail { + +template +ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, + bool is_legacy_batch_mode) + : stream_view_(stream_view), is_legacy_batch_mode_(is_legacy_batch_mode) +{ +} + +template +void ping_pong_graph_t::cancel_active_capture() +{ + CUOPT_LOG_ERROR( + "Canceling active capture in ping_pong_graph_t"); + if (capture_even_active_) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + capture_even_active_ = false; + } + if (capture_odd_active_) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + capture_odd_active_ = false; + } +} + +template +ping_pong_graph_t::~ping_pong_graph_t() +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + // This should not happen, but in case a graph was capturing while destroying the object + if (capture_even_active_ || capture_odd_active_) { + cancel_active_capture(); + } + if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } + if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } + } +#endif +} + +template +void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY( + cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_even_active_ = true; + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY( + cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + capture_odd_active_ = true; + } + } +#endif +} + +template +void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) +{ +#ifndef 
CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && !even_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + capture_even_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + even_initialized = true; + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + capture_odd_active_ = false; + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + odd_initialized = true; + RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + } + } +#endif +} + +template +void ping_pong_graph_t::launch(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + if (total_pdlp_iterations % 2 == 0 && even_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + } + } +#endif +} + +template +bool ping_pong_graph_t::is_initialized(i_t total_pdlp_iterations) +{ +#ifndef CUPDLP_DEBUG_MODE + if (!is_legacy_batch_mode_) { + return (total_pdlp_iterations % 2 == 0 && even_initialized) || + (total_pdlp_iterations % 2 == 1 && odd_initialized); + } +#endif + return false; +} + +template class ping_pong_graph_t; + +} // namespace cuopt::linear_programming::detail diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 14180b5bfd..9d6ead8cf7 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -9,6 +9,8 @@ #include +#include + #include namespace cuopt::linear_programming::detail { @@ -17,83 +19,20 @@ namespace cuopt::linear_programming::detail { // No additional checks for safe usage (calling launch() before initializing the graph) use with // 
caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a // valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph -template + template class ping_pong_graph_t { public: - ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false) - : stream_view_(stream_view), is_legacy_batch_mode_(true) - { - } - - ~ping_pong_graph_t() - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } - if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } - } -#endif - } - - void start_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); - } - } -#endif - } - - void end_capture(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); - even_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); - } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); - odd_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); - } - } -#endif - } + ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); + ~ping_pong_graph_t(); - void launch(i_t total_pdlp_iterations) 
- { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); - } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); - } - } -#endif - } - - bool is_initialized(i_t total_pdlp_iterations) - { -#ifndef CUPDLP_DEBUG_MODE - if (!is_legacy_batch_mode_) { - return (total_pdlp_iterations % 2 == 0 && even_initialized) || - (total_pdlp_iterations % 2 == 1 && odd_initialized); - } -#endif - return false; - } + void start_capture(i_t total_pdlp_iterations); + void end_capture(i_t total_pdlp_iterations); + void launch(i_t total_pdlp_iterations); + bool is_initialized(i_t total_pdlp_iterations); private: + void cancel_active_capture(); + cudaGraph_t even_graph; cudaGraph_t odd_graph; cudaGraphExec_t even_instance; @@ -101,7 +40,9 @@ class ping_pong_graph_t { rmm::cuda_stream_view stream_view_; bool even_initialized{false}; bool odd_initialized{false}; - // Temporary fix to disable cuda graph in legacy batch mode + bool capture_even_active_{false}; + bool capture_odd_active_{false}; bool is_legacy_batch_mode_{false}; }; + } // namespace cuopt::linear_programming::detail From e0a530ed449f7c904d651701aa066dd90a5edd54 Mon Sep 17 00:00:00 2001 From: Trevor McKay Date: Tue, 17 Feb 2026 13:49:33 -0500 Subject: [PATCH 06/43] workaround for thrust reverse iterator build error --- .../restart_strategy/pdlp_restart_strategy.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index b2ed166a2d..e42a05e1e6 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -1990,18 +1991,18 @@ 
void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( threshold_.end(), std::numeric_limits::infinity()); // Easier / Cleaner than to do reverse iterator arithmetic - f_t* start = threshold_.data(); - f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; - auto highest_negInf_primal = - thrust::find(handle_ptr_->get_thrust_policy(), - thrust::device_ptr(end), - thrust::device_ptr(start), - -std::numeric_limits::infinity()); + f_t* start = threshold_.data(); + f_t* end = threshold_.data() + primal_size_h_ + dual_size_h_; + using rev_iter_t = thrust::reverse_iterator>; + auto highest_negInf_primal = thrust::find(handle_ptr_->get_thrust_policy(), + rev_iter_t(thrust::device_ptr(end)), + rev_iter_t(thrust::device_ptr(start)), + -std::numeric_limits::infinity()); // Set ranges accordingly i_t index_start_primal = 0; i_t index_end_primal = primal_size_h_ + dual_size_h_; - if (highest_negInf_primal != thrust::device_ptr(start)) { + if (highest_negInf_primal != rev_iter_t(thrust::device_ptr(start))) { cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*highest_negInf_primal)) == -std::numeric_limits::infinity(), "Incorrect primal reverse iterator"); From e330718a2f40799de9a22615d1e99954bd821922 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 13:55:20 +0100 Subject: [PATCH 07/43] remove compile file --- compile.sh | 2 -- 1 file changed, 2 deletions(-) delete mode 100755 compile.sh diff --git a/compile.sh b/compile.sh deleted file mode 100755 index bedf3a7506..0000000000 --- a/compile.sh +++ /dev/null @@ -1,2 +0,0 @@ -./build.sh libcuopt libmps_parser --cache-tool=ccache --skip-tests-build -a -l=OFF -./build.sh cuopt cuopt_mps_parser \ No newline at end of file From dce6d4fec3d5a2ae6d3313e4f1af2db4ce55ccea Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 15:56:09 +0100 Subject: [PATCH 08/43] fix --- .../linear_programming/cuopt/run_pdlp.cu | 20 ++++++--- cpp/src/branch_and_bound/pseudo_costs.cpp | 3 +- 
cpp/src/pdlp/CMakeLists.txt | 1 + cpp/src/pdlp/pdlp.cu | 3 -- .../adaptive_step_size_strategy.cu | 44 +------------------ .../adaptive_step_size_strategy.hpp | 9 ---- cpp/src/pdlp/utilities/ping_pong_graph.cu | 16 +++---- .../solver_settings/solver_settings.py | 38 ---------------- 8 files changed, 26 insertions(+), 108 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index c3d6ad42f4..64897264c9 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -70,15 +70,23 @@ static void parse_arguments(argparse::ArgumentParser& program) "Path to PDLP hyper-params file to configure PDLP solver. Has priority over PDLP solver " "modes."); - program.add_argument("--presolve") - .help("enable/disable presolve (default: true for MIP problems, false for LP problems)") - .default_value(0) - .scan<'i', int>() - .choices(0, 1); + program.add_argument("--presolver") + .help("Presolver to use. 
Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } +static cuopt::linear_programming::presolver_t string_to_presolver(const std::string& presolver) +{ + if (presolver == "None") return cuopt::linear_programming::presolver_t::None; + if (presolver == "Papilo") return cuopt::linear_programming::presolver_t::Papilo; + if (presolver == "PSLP") return cuopt::linear_programming::presolver_t::PSLP; + if (presolver == "Default") return cuopt::linear_programming::presolver_t::Default; + return cuopt::linear_programming::presolver_t::Default; +} + static cuopt::linear_programming::pdlp_solver_mode_t string_to_pdlp_solver_mode( const std::string& mode) { @@ -107,7 +115,7 @@ static cuopt::linear_programming::pdlp_solver_settings_t create_sol string_to_pdlp_solver_mode(program.get("--pdlp-solver-mode")); settings.method = static_cast(program.get("--method")); settings.crossover = program.get("--crossover"); - //settings.presolve = program.get("--presolve"); + settings.presolver = string_to_presolver(program.get("--presolver")); return settings; } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 2ddc672750..e06f497bdc 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -335,8 +335,9 @@ void strong_branching(const user_problem_t& original_problem, } const auto mps_model = simplex_problem_to_mps_data_model(original_problem); + const raft::handle_t batch_pdlp_handle; const auto solutions = - batch_pdlp_solve(original_problem.handle_ptr, mps_model, fractional, fraction_values); + batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch diff --git 
a/cpp/src/pdlp/CMakeLists.txt b/cpp/src/pdlp/CMakeLists.txt index ced9da8edc..2071bdfdef 100644 --- a/cpp/src/pdlp/CMakeLists.txt +++ b/cpp/src/pdlp/CMakeLists.txt @@ -24,6 +24,7 @@ set(LP_CORE_FILES ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/infeasibility_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/termination_strategy/convergence_information.cu ${CMAKE_CURRENT_SOURCE_DIR}/optimal_batch_size_handler/optimal_batch_size_handler.cu + ${CMAKE_CURRENT_SOURCE_DIR}/utilities/ping_pong_graph.cu ) # C and Python adapter files diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 082299902d..eaafd1293e 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1440,9 +1440,6 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - //printf("movement %lf\n", movement); - //printf("computed_interaction %lf\n", computed_interaction); - cuopt_assert( movement + computed_interaction >= f_t(0.0), "Movement + computed interaction must be >= 0"); diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index 32e21cfbf6..d491106aaf 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -28,14 +28,10 @@ #include -#include - #include namespace cuopt::linear_programming::detail { -constexpr int parallel_stream_computation = 2; - template adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( raft::handle_t const* handle_ptr, @@ -47,10 +43,6 @@ adaptive_step_size_strategy_t::adaptive_step_size_strategy_t( const std::vector& climber_strategies, const pdlp_hyper_params::pdlp_hyper_params_t& hyper_params) : batch_mode_(climber_strategies.size() > 1), - stream_pool_(parallel_stream_computation), - 
dot_delta_X_(cudaEventDisableTiming), - dot_delta_Y_(cudaEventDisableTiming), - deltas_are_done_(cudaEventDisableTiming), handle_ptr_(handle_ptr), stream_view_(handle_ptr_->get_stream()), primal_size_(primal_size), @@ -350,26 +342,6 @@ void adaptive_step_size_strategy_t::compute_step_sizes( RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_.value())); } -template -__global__ void validate_interaction_and_movement_outputs( - raft::device_span norm_squared_delta_primal, - raft::device_span norm_squared_delta_dual, - raft::device_span interaction) -{ - const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= norm_squared_delta_primal.size()) { return; } - cuopt_assert(!isnan(norm_squared_delta_primal[idx]), - "norm_squared_delta_primal is NaN after reduction"); - cuopt_assert(!isnan(norm_squared_delta_dual[idx]), - "norm_squared_delta_dual is NaN after reduction"); - cuopt_assert(!isnan(interaction[idx]), - "interaction is NaN after reduction"); - cuopt_assert(norm_squared_delta_primal[idx] >= f_t(0.0), - "norm_squared_delta_primal must be >= 0 after reduction"); - cuopt_assert(norm_squared_delta_dual[idx] >= f_t(0.0), - "norm_squared_delta_dual must be >= 0 after reduction"); -} - template void adaptive_step_size_strategy_t::compute_interaction_and_movement( rmm::device_uvector& tmp_primal, @@ -393,7 +365,7 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( Deltas x & y were computed during pdhg step - We will compute in parallel (parallel cuda graph): + We will compute: ||(x' - x)|| ||(y' - y)|| (y' - y)_t . 
A @ (x' - x) @@ -401,11 +373,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( And finally merge the results */ - // We need to make sure both dot products happens after previous operations (next_primal/dual) - // Thus, we add another node in the main stream before starting the SpMVs - - if (!batch_mode_) deltas_are_done_.record(stream_view_.value()); - // primal_dual_interaction computation => we purposly diverge from the paper (delta_y . (A @ x' - // A@x)) to save one SpMV // Instead we do: delta_x . (A_t @ y' - A_t @ y) @@ -475,8 +442,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( // 2 + (0.5 / // solver_state.primal_weight) * // norm(delta_dual) ^ 2; - // All dot products run on stream_view_ to avoid concurrent cuBLAS workspace access - // (cuBLAS uses a single internal workspace shared across all streams for the same handle) RAFT_CUBLAS_TRY( raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), current_saddle_point_state.get_primal_size(), @@ -529,13 +494,6 @@ void adaptive_step_size_strategy_t::compute_interaction_and_movement( climber_strategies_.size(), dual_size_, stream_view_.value()); - - validate_interaction_and_movement_outputs - <<<1, climber_strategies_.size(), 0, stream_view_>>>( - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_), - make_span(interaction_)); - RAFT_CUDA_TRY(cudaPeekAtLastError()); } } diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp index 8e7e048b18..1e969150e7 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp +++ b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.hpp @@ -91,15 +91,6 @@ class adaptive_step_size_strategy_t { private: const bool batch_mode_; - // Stream pool to run different step size computation in parallel - // Because we already have the main stream, we just need 2 extra streams from this - 
rmm::cuda_stream_pool stream_pool_; - - // Events to record when dot product of both delta_x and y are done and when to start them - event_handler_t deltas_are_done_; - event_handler_t dot_delta_X_; - event_handler_t dot_delta_Y_; - raft::handle_t const* handle_ptr_{nullptr}; rmm::cuda_stream_view stream_view_; diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 08045b47a1..647672e535 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -58,11 +58,11 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( + RAFT_CUDA_TRY_NO_THROW( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( + RAFT_CUDA_TRY_NO_THROW( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } @@ -76,15 +76,15 @@ void ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &even_graph)); capture_even_active_ = false; - RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); capture_odd_active_ = false; - 
RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); + RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); } @@ -98,9 +98,9 @@ void ping_pong_graph_t::launch(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); + RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(even_instance, stream_view_.value())); } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); + RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(odd_instance, stream_view_.value())); } } #endif diff --git a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py index 4ec4a9aaf2..19db315349 100644 --- a/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py +++ b/python/cuopt/cuopt/linear_programming/solver_settings/solver_settings.py @@ -207,44 +207,6 @@ def set_pdlp_warm_start_data(self, pdlp_warm_start_data): """ self.pdlp_warm_start_data = pdlp_warm_start_data - def set_mip_batch_pdlp_strong_branching(self, enable): - """ - Note: Only supported for MILP - - Toggle batch PDLP strong branching in the MIP solver. - - Parameters - ---------- - enable : bool - If True, enable batch PDLP strong branching (value 1). - If False, disable it (value 0). - - Examples - -------- - >>> settings.set_mip_batch_pdlp_strong_branching(True) - """ - self.set_parameter( - "mip_batch_pdlp_strong_branching", 1 if enable else 0 - ) - - def get_mip_batch_pdlp_strong_branching(self): - """ - Note: Only supported for MILP - - Get the current value of the batch PDLP strong branching setting. - - Returns - ------- - bool - True if batch PDLP strong branching is enabled, False otherwise. 
- - Examples - -------- - >>> settings.get_mip_batch_pdlp_strong_branching() - False - """ - return bool(self.get_parameter("mip_batch_pdlp_strong_branching")) - def set_mip_callback(self, callback, user_data): """ Note: Only supported for MILP From 9c03faf0f2e5e5d44670507c8ac348ca3f142659 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 17:28:11 +0100 Subject: [PATCH 09/43] final cleanup --- cpp/src/pdlp/pdlp.cu | 60 ------------------------------------------- cpp/src/pdlp/solve.cu | 3 --- run_multiple.sh | 3 --- test.py | 12 --------- 4 files changed, 78 deletions(-) delete mode 100755 run_multiple.sh delete mode 100755 test.py diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index eaafd1293e..4b4eed1f32 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1825,66 +1825,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Validate reflected solutions have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_reflected_primal().data(), - pdhg_solver_.get_reflected_primal().data() + - pdhg_solver_.get_reflected_primal().size(), - is_nan_or_inf{}), - "reflected_primal contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_reflected_dual().data(), - pdhg_solver_.get_reflected_dual().data() + - pdhg_solver_.get_reflected_dual().size(), - is_nan_or_inf{}), - "reflected_dual contains NaN or Inf in compute_fixed_error"); - - // Validate primal/dual solutions have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_primal_solution().data(), - pdhg_solver_.get_primal_solution().data() + - pdhg_solver_.get_primal_solution().size(), - is_nan_or_inf{}), - "primal_solution contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - 
!thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_dual_solution().data(), - pdhg_solver_.get_dual_solution().data() + - pdhg_solver_.get_dual_solution().size(), - is_nan_or_inf{}), - "dual_solution contains NaN or Inf in compute_fixed_error"); - - // Validate deltas have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data(), - pdhg_solver_.get_saddle_point_state().get_delta_primal().data() + - pdhg_solver_.get_saddle_point_state().get_delta_primal().size(), - is_nan_or_inf{}), - "delta_primal contains NaN or Inf in compute_fixed_error"); - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data(), - pdhg_solver_.get_saddle_point_state().get_delta_dual().data() + - pdhg_solver_.get_saddle_point_state().get_delta_dual().size(), - is_nan_or_inf{}), - "delta_dual contains NaN or Inf in compute_fixed_error"); - - // Validate primal_weight and step_size have no NaN/Inf - cuopt_assert( - !thrust::any_of(handle_ptr_->get_thrust_policy(), - primal_weight_.data(), - primal_weight_.data() + primal_weight_.size(), - is_nan_or_inf{}), - "primal_weight_ contains NaN or Inf in compute_fixed_error"); - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - step_size_.data(), - step_size_.data() + step_size_.size(), - is_nan_or_inf{}), - "step_size_ contains NaN or Inf in compute_fixed_error"); // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 374c9ff513..a2766be98a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -793,7 +793,6 @@ optimization_problem_solution_t run_batch_pdlp( // If need warm start, solve the LP alone if (primal_dual_init || primal_weight_init) { - std::cout << "Solving 
LP for warm start" << std::endl; pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -842,8 +841,6 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } - std::cout << "Solving batch PDLP" << std::endl; - for (int i = 0; i < max_batch_size; i += optimal_batch_size) { const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) diff --git a/run_multiple.sh b/run_multiple.sh deleted file mode 100755 index 183b25b46e..0000000000 --- a/run_multiple.sh +++ /dev/null @@ -1,3 +0,0 @@ -for i in {1..5}; do - python test.py -done \ No newline at end of file diff --git a/test.py b/test.py deleted file mode 100755 index 6cb236dae2..0000000000 --- a/test.py +++ /dev/null @@ -1,12 +0,0 @@ -import cuopt_mps_parser -from cuopt.linear_programming import Solve, SolverSettings - -data_model = cuopt_mps_parser.ParseMps("batch_instances/neos8.mps") - -settings = SolverSettings() -settings.set_mip_batch_pdlp_strong_branching(True) - -solution = Solve(data_model, settings) - -print(solution.get_termination_reason()) -print(solution.get_primal_objective()) From 6c2fe356a9ffc0994f0e7b177233504aa73c1c7d Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Feb 2026 17:39:07 +0100 Subject: [PATCH 10/43] additional cleanup --- cpp/src/pdlp/pdhg.cu | 4 ++++ cpp/src/pdlp/utils.cuh | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/pdhg.cu b/cpp/src/pdlp/pdhg.cu index c5efdcd722..286d6de5b5 100644 --- a/cpp/src/pdlp/pdhg.cu +++ b/cpp/src/pdlp/pdhg.cu @@ -652,6 +652,8 @@ struct primal_reflected_projection_bulk_op { cuopt_assert(!isnan(step_size), "primal_step_size is NaN in primal_reflected_projection"); cuopt_assert(!isnan(primal_val), "primal_solution is NaN in
primal_reflected_projection"); cuopt_assert(!isnan(aty_val), "current_AtY is NaN in primal_reflected_projection"); + cuopt_assert(!isinf(step_size), "primal_step_size is Inf in primal_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "primal_step_size must be > 0"); f_t reflected = primal_val - step_size * (obj_coef - aty_val); @@ -688,6 +690,8 @@ struct dual_reflected_projection_bulk_op { cuopt_assert(!isnan(step_size), "dual_step_size is NaN in dual_reflected_projection"); cuopt_assert(!isnan(current_dual), "dual_solution is NaN in dual_reflected_projection"); cuopt_assert(!isnan(dual_gradient[idx]), "dual_gradient is NaN in dual_reflected_projection"); + cuopt_assert(!isinf(step_size), "dual_step_size is Inf in dual_reflected_projection"); + cuopt_assert(step_size > f_t(0.0), "dual_step_size must be > 0"); const f_t tmp = current_dual / step_size - dual_gradient[idx]; const f_t tmp_proj = diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 9150ab8c51..0f2ed44c42 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -606,7 +606,7 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, { auto stream = handle_ptr->get_stream(); auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); - auto n = static_cast(input_vector.size()); + auto n = input_vector.size(); void* d_temp = nullptr; size_t temp_bytes = 0; From a43dc0c78b0d8bd833e8c0b05d8e6e46b0483703 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:43:06 +0100 Subject: [PATCH 11/43] address PR comments, add tests, update doc --- cpp/src/pdlp/pdlp.cu | 33 ++++++++------- .../convergence_information.cu | 8 ++-- cpp/src/pdlp/utilities/ping_pong_graph.cu | 41 +++++-------------- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 1 - docs/cuopt/source/lp-qp-milp-settings.rst | 10 +++++ .../linear_programming/test_python_API.py | 33 +++++++++++++++ 6 files changed, 75 insertions(+), 51 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu 
b/cpp/src/pdlp/pdlp.cu index 4b4eed1f32..eebfede7f9 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1192,8 +1192,10 @@ static void compute_stats(const rmm::device_uvector& vec, return x == 0 ? std::numeric_limits::max() : abs(x); }; + cuopt_assert(vec.size() > 0, "Vector must not be empty"); + auto stream = vec.stream(); - auto n = static_cast(vec.size()); + size_t n = vec.size(); rmm::device_scalar d_smallest(stream); rmm::device_scalar d_largest(stream); @@ -1203,23 +1205,23 @@ static void compute_stats(const rmm::device_uvector& vec, auto abs_iter = thrust::make_transform_iterator(vec.cbegin(), abs_op); void* d_temp = nullptr; - size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 1; - cub::DeviceReduce::Reduce( - d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); - cub::DeviceReduce::Reduce( - d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); - cub::DeviceReduce::Reduce( - d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream); + size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + d_temp, bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); rmm::device_buffer temp_buf(max_bytes, stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream); - cub::DeviceReduce::Reduce( - temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, 
cuda::std::plus<>{}, f_t(0), stream); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( + temp_buf.data(), bytes_3, abs_iter, d_sum.data(), n, cuda::std::plus<>{}, f_t(0), stream)); smallest = d_smallest.value(stream); largest = d_largest.value(stream); @@ -1444,7 +1446,8 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, movement + computed_interaction >= f_t(0.0), "Movement + computed interaction must be >= 0"); - *fixed_point_error = cuda::std::sqrt(movement + computed_interaction); + // Clamp to 0 to avoid NaN + *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); #ifdef CUPDLP_DEBUG_MODE printf("movement %lf\n", movement); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index eec078f8d7..269cb58f5d 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -404,19 +404,19 @@ void convergence_information_t::compute_convergence_information( relative_residual_t{settings.tolerances.relative_primal_tolerance}); void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; - cub::DeviceReduce::Max(d_temp_storage, + RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, temp_storage_bytes, transform_iter, linf_primal_residual_.data(), primal_residual_.size(), - stream_view_); + stream_view_)); rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); - cub::DeviceReduce::Max(temp_buf.data(), + RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), temp_storage_bytes, transform_iter, linf_primal_residual_.data(), primal_residual_.size(), - 
stream_view_); + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 647672e535..5effbcdc48 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -20,32 +20,11 @@ ping_pong_graph_t::ping_pong_graph_t(rmm::cuda_stream_view stream_view, { } -template -void ping_pong_graph_t::cancel_active_capture() -{ - CUOPT_LOG_ERROR( - "Canceling active capture in ping_pong_graph_t"); - if (capture_even_active_) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); - capture_even_active_ = false; - } - if (capture_odd_active_) { - RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); - capture_odd_active_ = false; - } -} - template ping_pong_graph_t::~ping_pong_graph_t() { #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { - // This should not happen, but in case a graph was capturing while destroying the object - if (capture_even_active_ || capture_odd_active_) { - cancel_active_capture(); - } if (even_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(even_instance)); } if (odd_initialized) { RAFT_CUDA_TRY_NO_THROW(cudaGraphExecDestroy(odd_instance)); } } @@ -58,11 +37,11 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY_NO_THROW( + RAFT_CUDA_TRY( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY_NO_THROW( + RAFT_CUDA_TRY( cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } @@ -76,17 +55,17 @@ void 
ping_pong_graph_t::end_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &even_graph)); + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &even_graph)); capture_even_active_ = false; - RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&even_instance, even_graph)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&even_instance, even_graph)); even_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(even_graph)); + RAFT_CUDA_TRY(cudaGraphDestroy(even_graph)); } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); + RAFT_CUDA_TRY(cudaStreamEndCapture(stream_view_.value(), &odd_graph)); capture_odd_active_ = false; - RAFT_CUDA_TRY_NO_THROW(cudaGraphInstantiate(&odd_instance, odd_graph)); + RAFT_CUDA_TRY(cudaGraphInstantiate(&odd_instance, odd_graph)); odd_initialized = true; - RAFT_CUDA_TRY_NO_THROW(cudaGraphDestroy(odd_graph)); + RAFT_CUDA_TRY(cudaGraphDestroy(odd_graph)); } } #endif @@ -98,9 +77,9 @@ void ping_pong_graph_t::launch(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && even_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(even_instance, stream_view_.value())); + RAFT_CUDA_TRY(cudaGraphLaunch(even_instance, stream_view_.value())); } else if (total_pdlp_iterations % 2 == 1 && odd_initialized) { - RAFT_CUDA_TRY_NO_THROW(cudaGraphLaunch(odd_instance, stream_view_.value())); + RAFT_CUDA_TRY(cudaGraphLaunch(odd_instance, stream_view_.value())); } } #endif diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 9d6ead8cf7..5113f804d6 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -31,7 +31,6 @@ class ping_pong_graph_t { 
bool is_initialized(i_t total_pdlp_iterations); private: - void cancel_active_capture(); cudaGraph_t even_graph; cudaGraph_t odd_graph; diff --git a/docs/cuopt/source/lp-qp-milp-settings.rst b/docs/cuopt/source/lp-qp-milp-settings.rst index 51c6142c2b..bd1372f70e 100644 --- a/docs/cuopt/source/lp-qp-milp-settings.rst +++ b/docs/cuopt/source/lp-qp-milp-settings.rst @@ -513,3 +513,13 @@ Set this value to 0 to disable reliability branching. Set this value to k > 0, to enable reliability branching. A variable will be considered reliable if it has been branched on k times. .. note:: The default value is ``-1`` (automatic). + +Batch PDLP Strong Branching +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING`` controls whether to use batched PDLP over Dual Simplex during strong branching at the root. +When enabled, the solver evaluates multiple branching candidates simultaneously in a single batched PDLP solve rather than solving them in parallel using Dual Simplex. This can significantly reduce the time spent in strong branching if Dual Simplex is struggling. +Set this value to 0 to disable batched PDLP strong branching. +Set this value to 1 to enable batched PDLP strong branching. + +.. note:: The default value is ``0`` (disabled). This setting is ignored if the problem is not a MIP problem. 
diff --git a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py index dc470f3828..0eca50ba9b 100644 --- a/python/cuopt/cuopt/tests/linear_programming/test_python_API.py +++ b/python/cuopt/cuopt/tests/linear_programming/test_python_API.py @@ -30,6 +30,7 @@ CUOPT_ELIMINATE_DENSE_COLUMNS, CUOPT_FOLDING, CUOPT_INFEASIBILITY_DETECTION, + CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, CUOPT_MIP_CUT_PASSES, CUOPT_METHOD, CUOPT_ORDERING, @@ -997,3 +998,35 @@ def test_cuts(): assert problem.Status.name == "Optimal" assert problem.ObjValue == pytest.approx(-126, abs=1e-3) assert problem.SolutionStats.num_nodes == 0 + + +def test_batch_pdlp_strong_branching(): + # Minimize - 86*y1 - 4*y2 - 40*y3 + # subject to 774*y1 + 76*y2 + 42*y3 <= 875 + # 67*y1 + 27*y2 + 53*y3 <= 875 + # y1, y2, y3 in {0, 1} + + problem = Problem() + y1 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y1") + y2 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y2") + y3 = problem.addVariable(lb=0, ub=1, vtype=INTEGER, name="y3") + + problem.addConstraint(774 * y1 + 76 * y2 + 42 * y3 <= 875) + problem.addConstraint(67 * y1 + 27 * y2 + 53 * y3 <= 875) + + problem.setObjective(-86 * y1 - 4 * y2 - 40 * y3) + + settings = SolverSettings() + settings.set_parameter(CUOPT_PRESOLVE, 0) + settings.set_parameter(CUOPT_TIME_LIMIT, 10) + settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 0) + + problem.solve(settings) + assert problem.Status.name == "Optimal" + assert problem.ObjValue == pytest.approx(-126, abs=1e-3) + + settings.set_parameter(CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 1) + + problem.solve(settings) + assert problem.Status.name == "Optimal" + assert problem.ObjValue == pytest.approx(-126, abs=1e-3) From c8b8b74bc6cc5f65df94f868a0334464e360c374 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:44:54 +0100 Subject: [PATCH 12/43] format --- cpp/src/pdlp/pdlp.cu | 36 ++++++++++------- 
.../adaptive_step_size_strategy.cu | 17 ++++---- .../convergence_information.cu | 40 +++++++++---------- cpp/src/pdlp/utilities/ping_pong_graph.cu | 6 +-- cpp/src/pdlp/utilities/ping_pong_graph.cuh | 3 +- cpp/src/pdlp/utils.cuh | 10 ++--- 6 files changed, 58 insertions(+), 54 deletions(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index eebfede7f9..72aead03d0 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -34,8 +34,8 @@ #include #include -#include #include +#include #include #include @@ -1188,14 +1188,13 @@ static void compute_stats(const rmm::device_uvector& vec, f_t& avg) { auto abs_op = [] __host__ __device__(f_t x) { return abs(x); }; - auto min_nonzero = [] __host__ __device__(f_t x) -> f_t { - return x == 0 ? std::numeric_limits::max() : abs(x); - }; + auto min_nonzero = [] __host__ __device__(f_t x) + -> f_t { return x == 0 ? std::numeric_limits::max() : abs(x); }; cuopt_assert(vec.size() > 0, "Vector must not be empty"); auto stream = vec.stream(); - size_t n = vec.size(); + size_t n = vec.size(); rmm::device_scalar d_smallest(stream); rmm::device_scalar d_largest(stream); @@ -1206,8 +1205,14 @@ static void compute_stats(const rmm::device_uvector& vec, void* d_temp = nullptr; size_t bytes_1 = 0, bytes_2 = 0, bytes_3 = 0; - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( - d_temp, bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(d_temp, + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( d_temp, bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( @@ -1216,8 +1221,14 @@ static void compute_stats(const rmm::device_uvector& vec, size_t max_bytes = std::max({bytes_1, bytes_2, bytes_3}); rmm::device_buffer temp_buf(max_bytes, stream); - RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( - 
temp_buf.data(), bytes_1, min_nz_iter, d_smallest.data(), n, cuda::minimum<>{}, std::numeric_limits::max(), stream)); + RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(temp_buf.data(), + bytes_1, + min_nz_iter, + d_smallest.data(), + n, + cuda::minimum<>{}, + std::numeric_limits::max(), + stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( temp_buf.data(), bytes_2, abs_iter, d_largest.data(), n, cuda::maximum<>{}, f_t(0), stream)); RAFT_CUDA_TRY(cub::DeviceReduce::Reduce( @@ -1442,9 +1453,8 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - cuopt_assert( - movement + computed_interaction >= f_t(0.0), - "Movement + computed interaction must be >= 0"); + cuopt_assert(movement + computed_interaction >= f_t(0.0), + "Movement + computed interaction must be >= 0"); // Clamp to 0 to avoid NaN *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); @@ -1828,7 +1838,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte // potential_next_dual_solution RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); - // Make potential_next_dual_solution point towards reflected dual solution to reuse the code RAFT_CUSPARSE_TRY(cusparseDnVecSetValues(cusparse_view.potential_next_dual_solution, (void*)pdhg_solver_.get_reflected_dual().data())); @@ -1853,7 +1862,6 @@ void pdlp_solver_t::compute_fixed_error(std::vector& has_restarte stream_view_)); // To make sure all the data is written from device to host RAFT_CUDA_TRY(cudaPeekAtLastError()); - #ifdef CUPDLP_DEBUG_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); #endif diff --git a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu index d491106aaf..47e9a78a5e 100644 --- a/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu +++ 
b/cpp/src/pdlp/step_size_strategy/adaptive_step_size_strategy.cu @@ -137,11 +137,12 @@ void adaptive_step_size_strategy_t::swap_context( const auto [grid_size, block_size] = kernel_config_from_batch_size(static_cast(swap_pairs.size())); adaptive_step_size_swap_device_vectors_kernel - <<>>(thrust::raw_pointer_cast(swap_pairs.data()), - static_cast(swap_pairs.size()), - make_span(interaction_), - make_span(norm_squared_delta_primal_), - make_span(norm_squared_delta_dual_)); + <<>>( + thrust::raw_pointer_cast(swap_pairs.data()), + static_cast(swap_pairs.size()), + make_span(interaction_), + make_span(norm_squared_delta_primal_), + make_span(norm_squared_delta_dual_)); RAFT_CUDA_TRY(cudaPeekAtLastError()); } @@ -332,9 +333,9 @@ void adaptive_step_size_strategy_t::compute_step_sizes( // Compute n_lim, n_next and decide if step size is valid compute_step_sizes_from_movement_and_interaction <<<1, 1, 0, stream_view_.value()>>>(this->view(), - primal_step_size.data(), - dual_step_size.data(), - pdhg_solver.get_d_total_pdhg_iterations().data()); + primal_step_size.data(), + dual_step_size.data(), + pdhg_solver.get_d_total_pdhg_iterations().data()); graph.end_capture(total_pdlp_iterations); } graph.launch(total_pdlp_iterations); diff --git a/cpp/src/pdlp/termination_strategy/convergence_information.cu b/cpp/src/pdlp/termination_strategy/convergence_information.cu index 269cb58f5d..1e9a69d130 100644 --- a/cpp/src/pdlp/termination_strategy/convergence_information.cu +++ b/cpp/src/pdlp/termination_strategy/convergence_information.cu @@ -405,18 +405,18 @@ void convergence_information_t::compute_convergence_information( void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; RAFT_CUDA_TRY(cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); 
rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); RAFT_CUDA_TRY(cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_primal_residual_.data(), - primal_residual_.size(), - stream_view_)); + temp_storage_bytes, + transform_iter, + linf_primal_residual_.data(), + primal_residual_.size(), + stream_view_)); } compute_dual_residual(op_problem_cusparse_view_, @@ -465,18 +465,18 @@ void convergence_information_t::compute_convergence_information( void* d_temp_storage = nullptr; size_t temp_storage_bytes = 0; cub::DeviceReduce::Max(d_temp_storage, - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); rmm::device_buffer temp_buf(temp_storage_bytes, stream_view_); cub::DeviceReduce::Max(temp_buf.data(), - temp_storage_bytes, - transform_iter, - linf_dual_residual_.data(), - dual_residual_.size(), - stream_view_); + temp_storage_bytes, + transform_iter, + linf_dual_residual_.data(), + dual_residual_.size(), + stream_view_); } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 5effbcdc48..4ec5bff8c1 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -37,12 +37,10 @@ void ping_pong_graph_t::start_capture(i_t total_pdlp_iterations) #ifndef CUPDLP_DEBUG_MODE if (!is_legacy_batch_mode_) { if (total_pdlp_iterations % 2 == 0 && !even_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_even_active_ = true; } else if (total_pdlp_iterations % 2 == 1 && !odd_initialized) { - RAFT_CUDA_TRY( - cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); + 
RAFT_CUDA_TRY(cudaStreamBeginCapture(stream_view_.value(), cudaStreamCaptureModeThreadLocal)); capture_odd_active_ = true; } } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cuh b/cpp/src/pdlp/utilities/ping_pong_graph.cuh index 5113f804d6..dafecdd06e 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cuh +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cuh @@ -19,7 +19,7 @@ namespace cuopt::linear_programming::detail { // No additional checks for safe usage (calling launch() before initializing the graph) use with // caution Binary part is because in pdlp we swap pointers instead of copying vectors to accept a // valid pdhg step So every odd pdlp step it's one graph, every even step it's another graph - template +template class ping_pong_graph_t { public: ping_pong_graph_t(rmm::cuda_stream_view stream_view, bool is_legacy_batch_mode = false); @@ -31,7 +31,6 @@ class ping_pong_graph_t { bool is_initialized(i_t total_pdlp_iterations); private: - cudaGraph_t even_graph; cudaGraph_t odd_graph; cudaGraphExec_t even_instance; diff --git a/cpp/src/pdlp/utils.cuh b/cpp/src/pdlp/utils.cuh index 0f2ed44c42..33625f7680 100644 --- a/cpp/src/pdlp/utils.cuh +++ b/cpp/src/pdlp/utils.cuh @@ -608,13 +608,11 @@ void inline my_inf_norm(const rmm::device_uvector& input_vector, auto abs_iter = thrust::make_transform_iterator(input_vector.data(), abs_t{}); auto n = input_vector.size(); - void* d_temp = nullptr; - size_t temp_bytes = 0; - cub::DeviceReduce::Max( - d_temp, temp_bytes, abs_iter, result, n, stream); + void* d_temp = nullptr; + size_t temp_bytes = 0; + cub::DeviceReduce::Max(d_temp, temp_bytes, abs_iter, result, n, stream); rmm::device_buffer temp_buf(temp_bytes, stream); - cub::DeviceReduce::Max( - temp_buf.data(), temp_bytes, abs_iter, result, n, stream); + cub::DeviceReduce::Max(temp_buf.data(), temp_bytes, abs_iter, result, n, stream); } template From b1be5bb2d0b729ddb685b722ad50c41f5a32aa58 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:45:01 
+0100 Subject: [PATCH 13/43] format --- benchmarks/linear_programming/cuopt/run_pdlp.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmarks/linear_programming/cuopt/run_pdlp.cu b/benchmarks/linear_programming/cuopt/run_pdlp.cu index 64897264c9..e9b4f8296c 100644 --- a/benchmarks/linear_programming/cuopt/run_pdlp.cu +++ b/benchmarks/linear_programming/cuopt/run_pdlp.cu @@ -71,9 +71,9 @@ static void parse_arguments(argparse::ArgumentParser& program) "modes."); program.add_argument("--presolver") - .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") - .default_value("Default") - .choices("None", "Papilo", "PSLP", "Default"); + .help("Presolver to use. Possible values: None, Papilo, PSLP, Default") + .default_value("Default") + .choices("None", "Papilo", "PSLP", "Default"); program.add_argument("--solution-path").help("Path where solution file will be generated"); } From d89af961b3d8af72d22f027114ac1921809c3ccc Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 13:53:59 +0100 Subject: [PATCH 14/43] style --- .../restart_strategy/pdlp_restart_strategy.cu | 827 +++++++++--------- 1 file changed, 415 insertions(+), 412 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index cf715e8a1d..0b1c109185 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -2008,462 +2008,465 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( "Incorrect primal reverse iterator"); index_start_primal = thrust::raw_pointer_cast(&*highest_negInf_primal) - threshold_.data() + 1; // + 1 to go after last negInf - if (lowest_inf != end) { + if (lowest_inf != end) { std::numeric_limits::infinity(), "Incorrect primal iterator"); - index_end_primal = - thrust::raw_pointer_cast(lowest_inf) - - threshold_.data(); // no - 1 to go before the first inf because 
end is not included - testing_range_high_.set_value_async(index_end_primal, stream_view_); - } else // No inf found, end is primal_size_h_ - testing_range_high_.set_value_async(index_end_primal, stream_view_); - cuopt_assert(index_start_primal <= index_end_primal, - "Start should be strictly smaller than end"); - - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - threshold_.data() + index_start_primal, - threshold_.data() + index_end_primal, - is_nan_or_inf()), - "Threshold vector should not contain inf or NaN values"); - - // Init parameters for live kernel - // Has to do this to pass lvalues (and not rvalue) to void* kernel_args - auto restart_view = this->view(); - auto op_view = problem_ptr->view(); - i_t* testing_range_low = testing_range_low_.data(); - i_t* testing_range_high = testing_range_high_.data(); - f_t* test_radius_squared = test_radius_squared_.data(); - f_t* low_radius_squared = low_radius_squared_.data(); - f_t* high_radius_squared = high_radius_squared_.data(); - f_t* distance_traveled = duality_gap.distance_traveled_.data(); - - void* kernel_args[] = { - &restart_view, - &op_view, - &testing_range_low, - &testing_range_high, - &test_radius_squared, - &low_radius_squared, - &high_radius_squared, - &distance_traveled, - }; - constexpr int numThreads = 128; - dim3 dimBlock(numThreads, 1, 1); - // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * - // numBlocksPerSm - dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); - // Compute the median for the join problem, while loop is inside the live kernel - RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( - (void*)solve_bound_constrained_trust_region_kernel, - dimGrid, - dimBlock, - kernel_args, - 0, - stream_view_)); - - // Find max threshold for the join problem - const f_t* max_threshold = - thrust::max_element(handle_ptr_->get_thrust_policy(), - threshold_.data(), - threshold_.data() + primal_size_h_ + dual_size_h_); - - // we have now determined the 
test_threshold that should minimize the objective value of the - // solution. - - // if no component got fixed by their upper bound we can pick the maximum threshold to be the - // target_threshold which was computed before the loop in the direction_and_threshold_kernel - // Otherwise use the test_threshold determined in the loop - // { - target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( - this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // } + index_end_primal = + thrust::raw_pointer_cast(lowest_inf) - + threshold_ + .data(); // no - 1 to go before the first inf because end is not included + testing_range_high_.set_value_async(index_end_primal, stream_view_); + } else // No inf found, end is primal_size_h_ + testing_range_high_.set_value_async(index_end_primal, stream_view_); + cuopt_assert(index_start_primal <= index_end_primal, + "Start should be strictly smaller than end"); + + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + threshold_.data() + index_start_primal, + threshold_.data() + index_end_primal, + is_nan_or_inf()), + "Threshold vector should not contain inf or NaN values"); + + // Init parameters for live kernel + // Has to do this to pass lvalues (and not rvalue) to void* kernel_args + auto restart_view = this->view(); + auto op_view = problem_ptr->view(); + i_t* testing_range_low = testing_range_low_.data(); + i_t* testing_range_high = testing_range_high_.data(); + f_t* test_radius_squared = test_radius_squared_.data(); + f_t* low_radius_squared = low_radius_squared_.data(); + f_t* high_radius_squared = high_radius_squared_.data(); + f_t* distance_traveled = duality_gap.distance_traveled_.data(); + + void* kernel_args[] = { + &restart_view, + &op_view, + &testing_range_low, + &testing_range_high, + &test_radius_squared, + &low_radius_squared, + &high_radius_squared, + &distance_traveled, + }; + constexpr int numThreads = 128; + dim3 
dimBlock(numThreads, 1, 1); + // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * + // numBlocksPerSm + dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); + // Compute the median for the join problem, while loop is inside the live kernel + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( + (void*)solve_bound_constrained_trust_region_kernel, + dimGrid, + dimBlock, + kernel_args, + 0, + stream_view_)); + + // Find max threshold for the join problem + const f_t* max_threshold = + thrust::max_element(handle_ptr_->get_thrust_policy(), + threshold_.data(), + threshold_.data() + primal_size_h_ + dual_size_h_); + + // we have now determined the test_threshold that should minimize the objective value of the + // solution. + + // if no component got fixed by their upper bound we can pick the maximum threshold to be the + // target_threshold which was computed before the loop in the direction_and_threshold_kernel + // Otherwise use the test_threshold determined in the loop + // { + target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( + this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // } + + // Compute x (the solution which is defined by moving each component test_threshold * + // direction[component]) clamp on upper and lower bounds. 
+ // Used unsorted_direction_full_ as the other one got sorted + // { + raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), + duality_gap.primal_solution_.data(), + unsorted_direction_full_.data(), + primal_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_.data(), + unsorted_direction_full_.data() + primal_size_h_, + dual_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part + using f_t2 = typename type_2::type; + cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), + problem_ptr->variable_bounds.data()), + duality_gap.primal_solution_tr_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + + // project by max(min(y[i], upperbound[i]),lowerbound[i]) + raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_tr_.data(), + transformed_constraint_lower_bounds_.data(), + transformed_constraint_upper_bounds_.data(), + dual_size_h_, + constraint_clamp(), + stream_view_); + // } + } - // Compute x (the solution which is defined by moving each component test_threshold * - // direction[component]) clamp on upper and lower bounds. 
- // Used unsorted_direction_full_ as the other one got sorted + // Compute the current lower bound for the objective value using the primal solution_tr and + // upper bound for the objective value using the dual solution_tr // { - raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), - duality_gap.primal_solution_.data(), - unsorted_direction_full_.data(), - primal_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_.data(), - unsorted_direction_full_.data() + primal_size_h_, - dual_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), - problem_ptr->variable_bounds.data()), - duality_gap.primal_solution_tr_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); - - // project by max(min(y[i], upperbound[i]),lowerbound[i]) - raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_tr_.data(), - transformed_constraint_lower_bounds_.data(), - transformed_constraint_upper_bounds_.data(), - dual_size_h_, - constraint_clamp(), - stream_view_); + // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, + // primal_gradient)) + compute_bound(duality_gap.primal_solution_tr_, + duality_gap.primal_solution_, + duality_gap.primal_gradient_, + duality_gap.lagrangian_value_, + primal_size_h_, + primal_stride, + tmp_primal, + duality_gap.lower_bound_value_); + + // compute 'upper bound' using dual + compute_bound(duality_gap.dual_solution_tr_, + duality_gap.dual_solution_, + duality_gap.dual_gradient_, + duality_gap.lagrangian_value_, + dual_size_h_, + dual_stride, + tmp_dual, + duality_gap.upper_bound_value_); + // } } - // Compute the current lower 
bound for the objective value using the primal solution_tr and - // upper bound for the objective value using the dual solution_tr - // { - // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, - // primal_gradient)) - compute_bound(duality_gap.primal_solution_tr_, - duality_gap.primal_solution_, - duality_gap.primal_gradient_, - duality_gap.lagrangian_value_, - primal_size_h_, - primal_stride, - tmp_primal, - duality_gap.lower_bound_value_); - - // compute 'upper bound' using dual - compute_bound(duality_gap.dual_solution_tr_, - duality_gap.dual_solution_, - duality_gap.dual_gradient_, - duality_gap.lagrangian_value_, - dual_size_h_, - dual_stride, - tmp_dual, - duality_gap.upper_bound_value_); - - // } -} - -template -void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( - localized_duality_gap_container_t& duality_gap, - rmm::device_uvector& primal_weight, - rmm::device_uvector& tmp_primal, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); - // norm( - // new_primal_solution - last_restart.primal_solution, - // )^2 - - // Julia / Paper use a weighted norm using primal weight for primal / dual distance - // We simply use L2 norm of diff - distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, - last_restart_duality_gap_.primal_solution_, - tmp_primal, - primal_size_h_, - primal_stride, - duality_gap.primal_distance_traveled_); - - // compute similarly for dual - distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, - last_restart_duality_gap_.dual_solution_, - tmp_dual, - dual_size_h_, - dual_stride, - duality_gap.dual_distance_traveled_); - - // distance_traveled = primal_distance * 0.5 * primal_weight - // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), 
duality_gap.distance_traveled_.data()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); -} + template + void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( + localized_duality_gap_container_t & duality_gap, + rmm::device_uvector & primal_weight, + rmm::device_uvector & tmp_primal, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); + // norm( + // new_primal_solution - last_restart.primal_solution, + // )^2 + + // Julia / Paper use a weighted norm using primal weight for primal / dual distance + // We simply use L2 norm of diff + distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, + last_restart_duality_gap_.primal_solution_, + tmp_primal, + primal_size_h_, + primal_stride, + duality_gap.primal_distance_traveled_); + + // compute similarly for dual + distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, + last_restart_duality_gap_.dual_solution_, + tmp_dual, + dual_size_h_, + dual_stride, + duality_gap.dual_distance_traveled_); + + // distance_traveled = primal_distance * 0.5 * primal_weight + // + dual_distance * 0.5 / primal_weight + compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( + duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + } -template -void pdlp_restart_strategy_t::compute_primal_gradient( - localized_duality_gap_container_t& duality_gap, - cusparse_view_t& cusparse_view) -{ - raft::common::nvtx::range fun_scope("compute_primal_gradient"); + template + void pdlp_restart_strategy_t::compute_primal_gradient( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view) + { + raft::common::nvtx::range fun_scope("compute_primal_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute primal gradient:" << std::endl; + std::cout << " Compute primal gradient:" << std::endl; #endif - // for QP add 
problem.objective_matrix * primal_solution as well - // c - A^T*y (copy c to primal_gradient for correct writing of result) - raft::copy(duality_gap.primal_gradient_.data(), - problem_ptr->objective_coefficients.data(), - primal_size_h_, - stream_view_); + // for QP add problem.objective_matrix * primal_solution as well + // c - A^T*y (copy c to primal_gradient for correct writing of result) + raft::copy(duality_gap.primal_gradient_.data(), + problem_ptr->objective_coefficients.data(), + primal_size_h_, + stream_view_); - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.primal_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); -} + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.primal_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); + } -template -__global__ void compute_subgradient_kernel( - const typename pdlp_restart_strategy_t::view_t restart_strategy_view, - const typename problem_t::view_t op_problem_view, - const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* subgradient) -{ - i_t id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= duality_gap_view.dual_size) { return; } - - f_t lower = op_problem_view.constraint_lower_bounds[id]; - f_t upper = op_problem_view.constraint_upper_bounds[id]; - f_t primal_product = duality_gap_view.dual_gradient[id]; - f_t dual_solution = duality_gap_view.dual_solution[id]; - - f_t subgradient_coefficient; - - if (dual_solution < 
f_t(0)) { - subgradient_coefficient = upper; - } else if (dual_solution > f_t(0)) { - subgradient_coefficient = lower; - } else if (!isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = f_t(0); - } else if (!isfinite(upper) && isfinite(lower)) { - subgradient_coefficient = lower; - } else if (isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = upper; - } else { - if (primal_product < lower) { + template + __global__ void compute_subgradient_kernel( + const typename pdlp_restart_strategy_t::view_t restart_strategy_view, + const typename problem_t::view_t op_problem_view, + const typename localized_duality_gap_container_t::view_t duality_gap_view, + f_t* subgradient) + { + i_t id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= duality_gap_view.dual_size) { return; } + + f_t lower = op_problem_view.constraint_lower_bounds[id]; + f_t upper = op_problem_view.constraint_upper_bounds[id]; + f_t primal_product = duality_gap_view.dual_gradient[id]; + f_t dual_solution = duality_gap_view.dual_solution[id]; + + f_t subgradient_coefficient; + + if (dual_solution < f_t(0)) { + subgradient_coefficient = upper; + } else if (dual_solution > f_t(0)) { subgradient_coefficient = lower; - } else if (primal_product > upper) { + } else if (!isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = f_t(0); + } else if (!isfinite(upper) && isfinite(lower)) { + subgradient_coefficient = lower; + } else if (isfinite(upper) && !isfinite(lower)) { subgradient_coefficient = upper; } else { - subgradient_coefficient = primal_product; + if (primal_product < lower) { + subgradient_coefficient = lower; + } else if (primal_product > upper) { + subgradient_coefficient = upper; + } else { + subgradient_coefficient = primal_product; + } } - } - subgradient[id] = subgradient_coefficient; -} + subgradient[id] = subgradient_coefficient; + } -template -void pdlp_restart_strategy_t::compute_dual_gradient( - localized_duality_gap_container_t& duality_gap, - 
cusparse_view_t& cusparse_view, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_dual_gradient"); + template + void pdlp_restart_strategy_t::compute_dual_gradient( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_dual_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute dual gradient:" << std::endl; + std::cout << " Compute dual gradient:" << std::endl; #endif - // b - A*x - // is changed with the introduction of constraint upper and lower bounds - - // gradient constains primal_product - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A, - cusparse_view.primal_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.dual_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), - stream_view_)); - - // tmp_dual will contain the subgradient - i_t number_of_blocks = dual_size_h_ / block_size; - if (dual_size_h_ % block_size) number_of_blocks++; - i_t number_of_threads = std::min(dual_size_h_, block_size); - compute_subgradient_kernel<<>>( - this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); - - // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) - raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), - tmp_dual.data(), - duality_gap.dual_gradient_.data(), - dual_size_h_, - stream_view_); -} + // b - A*x + // is changed with the introduction of constraint upper and lower bounds + + // gradient constains primal_product + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + 
cusparse_view.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + + // tmp_dual will contain the subgradient + i_t number_of_blocks = dual_size_h_ / block_size; + if (dual_size_h_ % block_size) number_of_blocks++; + i_t number_of_threads = std::min(dual_size_h_, block_size); + compute_subgradient_kernel<<>>( + this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); + + // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) + raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), + tmp_dual.data(), + duality_gap.dual_gradient_.data(), + dual_size_h_, + stream_view_); + } -template -void pdlp_restart_strategy_t::compute_lagrangian_value( - localized_duality_gap_container_t& duality_gap, - cusparse_view_t& cusparse_view, - rmm::device_uvector& tmp_primal, - rmm::device_uvector& tmp_dual) -{ - raft::common::nvtx::range fun_scope("compute_lagrangian_value"); + template + void pdlp_restart_strategy_t::compute_lagrangian_value( + localized_duality_gap_container_t & duality_gap, + cusparse_view_t & cusparse_view, + rmm::device_uvector & tmp_primal, + rmm::device_uvector & tmp_dual) + { + raft::common::nvtx::range fun_scope("compute_lagrangian_value"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute lagrangian value:" << std::endl; + std::cout << " Compute lagrangian value:" << std::endl; #endif - // if QP - // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + - // dot(primal_solution, problem.objective_vector) - - // dot(primal_solution, problem.constraint_matrix' * dual_solution) + - // dot(dual_solution, dual_gradient+primal_product) + - // problem.objective_constant + // if QP + // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + + // dot(primal_solution, problem.objective_vector) - + // dot(primal_solution, problem.constraint_matrix' * dual_solution) + + // dot(dual_solution, dual_gradient+primal_product) + + // 
problem.objective_constant - // when lp first term is irrelevant + // when lp first term is irrelevant - // second term - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - problem_ptr->objective_coefficients.data(), - primal_stride, - reusable_device_scalar_1_.data(), - stream_view_)); + // second term + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + reusable_device_scalar_1_.data(), + stream_view_)); - // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot - RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - tmp_primal.data(), - primal_stride, - reusable_device_scalar_2_.data(), - stream_view_)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + 
duality_gap.primal_solution_.data(), + primal_stride, + tmp_primal.data(), + primal_stride, + reusable_device_scalar_2_.data(), + stream_view_)); - // fourth term //tmp_dual still contains subgradient from the dual_gradient computation - reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - dual_size_h_, - duality_gap.dual_solution_.data(), - dual_stride, - tmp_dual.data(), - dual_stride, - reusable_device_scalar_3_.data(), - stream_view_)); + // fourth term //tmp_dual still contains subgradient from the dual_gradient computation + reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + dual_size_h_, + duality_gap.dual_solution_.data(), + dual_stride, + tmp_dual.data(), + dual_stride, + reusable_device_scalar_3_.data(), + stream_view_)); - // subtract third term from second up - raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_2_.data(), - 1, - stream_view_); - raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_3_.data(), - 1, - stream_view_); -} + // subtract third term from second up + raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_2_.data(), + 1, + stream_view_); + raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_3_.data(), + 1, + stream_view_); + } -template -void pdlp_restart_strategy_t::reset_internal() -{ - candidate_is_avg_.set_value_to_zero_async(stream_view_); - restart_triggered_.set_value_to_zero_async(stream_view_); -} + template + void pdlp_restart_strategy_t::reset_internal() + { + candidate_is_avg_.set_value_to_zero_async(stream_view_); + 
restart_triggered_.set_value_to_zero_async(stream_view_); + } -template -typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() -{ - pdlp_restart_strategy_t::view_t v{}; - v.primal_size = primal_size_h_; - v.dual_size = dual_size_h_; - v.transformed_constraint_lower_bounds = raft::device_span{ - transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; - v.transformed_constraint_upper_bounds = raft::device_span{ - transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; + template + typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() + { + pdlp_restart_strategy_t::view_t v{}; + v.primal_size = primal_size_h_; + v.dual_size = dual_size_h_; + v.transformed_constraint_lower_bounds = raft::device_span{ + transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; + v.transformed_constraint_upper_bounds = raft::device_span{ + transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; + v.last_restart_length = last_restart_length_; - v.weights = raft::device_span{weights_.data(), weights_.size()}; + v.weights = raft::device_span{weights_.data(), weights_.size()}; - v.candidate_is_avg = candidate_is_avg_.data(); - v.restart_triggered = restart_triggered_.data(); + v.candidate_is_avg = candidate_is_avg_.data(); + v.restart_triggered = restart_triggered_.data(); - v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); + v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); - v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; - v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; - v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; - v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; - v.lower_bound = 
raft::device_span{lower_bound_.data(), lower_bound_.size()}; - v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; - v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; + v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; + v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; + v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; + v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; + v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; + v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; + v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; - v.target_threshold = target_threshold_.data(); - v.low_radius_squared = low_radius_squared_.data(); - v.high_radius_squared = high_radius_squared_.data(); - v.test_radius_squared = test_radius_squared_.data(); + v.target_threshold = target_threshold_.data(); + v.low_radius_squared = low_radius_squared_.data(); + v.high_radius_squared = high_radius_squared_.data(); + v.test_radius_squared = test_radius_squared_.data(); - v.testing_range_low = testing_range_low_.data(); - v.testing_range_high = testing_range_high_.data(); + v.testing_range_low = testing_range_low_.data(); + v.testing_range_high = testing_range_high_.data(); - v.shared_live_kernel_accumulator = raft::device_span{shared_live_kernel_accumulator_.data(), - shared_live_kernel_accumulator_.size()}; + v.shared_live_kernel_accumulator = raft::device_span{ + shared_live_kernel_accumulator_.data(), shared_live_kernel_accumulator_.size()}; - v.hyper_params = hyper_params_; + v.hyper_params = hyper_params_; - return v; -} + return v; + } -template -typename pdlp_restart_strategy_t::cupdlpx_restart_view_t -pdlp_restart_strategy_t::make_cupdlpx_restart_view( - const rmm::device_uvector& primal_distance, - const rmm::device_uvector& 
dual_distance, - const convergence_information_t& current_convergence_information, - const rmm::device_uvector& step_size, - rmm::device_uvector& primal_weight, - rmm::device_uvector& best_primal_weight, - rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size) -{ - cupdlpx_restart_view_t v{}; - v.primal_distance = make_span(primal_distance); - v.dual_distance = make_span(dual_distance); - v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); - v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); - v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); - v.l2_norm_primal_right_hand_side = - current_convergence_information.get_relative_primal_tolerance_factor(); - v.step_size = make_span(step_size); - v.primal_weight = make_span(primal_weight); - v.primal_weight_error_sum = make_span(primal_weight_error_sum_); - v.primal_weight_last_error = make_span(primal_weight_last_error_); - v.best_primal_weight = make_span(best_primal_weight); - v.new_primal_step_size = make_span(primal_step_size); - v.new_dual_step_size = make_span(dual_step_size); - v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); - v.hyper_params = hyper_params_; - return v; -} + template + typename pdlp_restart_strategy_t::cupdlpx_restart_view_t + pdlp_restart_strategy_t::make_cupdlpx_restart_view( + const rmm::device_uvector& primal_distance, + const rmm::device_uvector& dual_distance, + const convergence_information_t& current_convergence_information, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_weight, + rmm::device_uvector& best_primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) + { + cupdlpx_restart_view_t v{}; + v.primal_distance = make_span(primal_distance); + v.dual_distance = make_span(dual_distance); + v.l2_dual_residual = 
make_span(current_convergence_information.get_l2_dual_residual()); + v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); + v.l2_norm_primal_linear_objective = + current_convergence_information.get_relative_dual_tolerance_factor(); + v.l2_norm_primal_right_hand_side = + current_convergence_information.get_relative_primal_tolerance_factor(); + v.step_size = make_span(step_size); + v.primal_weight = make_span(primal_weight); + v.primal_weight_error_sum = make_span(primal_weight_error_sum_); + v.primal_weight_last_error = make_span(primal_weight_last_error_); + v.best_primal_weight = make_span(best_primal_weight); + v.new_primal_step_size = make_span(primal_step_size); + v.new_dual_step_size = make_span(dual_step_size); + v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); + v.hyper_params = hyper_params_; + return v; + } -template -i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const -{ - return weighted_average_solution_.get_iterations_since_last_restart(); -} + template + i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const + { + return weighted_average_solution_.get_iterations_since_last_restart(); + } -template -void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) -{ - last_restart_was_average_ = value; -} + template + void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) + { + last_restart_was_average_ = value; + } -template -bool pdlp_restart_strategy_t::get_last_restart_was_average() const -{ - return last_restart_was_average_; -} + template + bool pdlp_restart_strategy_t::get_last_restart_was_average() const + { + return last_restart_was_average_; + } #define INSTANTIATE(F_TYPE) \ template class pdlp_restart_strategy_t; \ @@ -2520,11 +2523,11 @@ bool pdlp_restart_strategy_t::get_last_restart_was_average() const F_TYPE* primal_product); #if MIP_INSTANTIATE_FLOAT -INSTANTIATE(float) + INSTANTIATE(float) #endif #if 
MIP_INSTANTIATE_DOUBLE -INSTANTIATE(double) + INSTANTIATE(double) #endif } // namespace cuopt::linear_programming::detail From c7e3e222d063b98907efa2f5f81e31d84f231f10 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Feb 2026 15:03:34 +0100 Subject: [PATCH 15/43] put back changes in restart --- .../restart_strategy/pdlp_restart_strategy.cu | 831 +++++++++--------- 1 file changed, 416 insertions(+), 415 deletions(-) diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 0b1c109185..8eacd4d246 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -2008,465 +2008,466 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( "Incorrect primal reverse iterator"); index_start_primal = thrust::raw_pointer_cast(&*highest_negInf_primal) - threshold_.data() + 1; // + 1 to go after last negInf - if (lowest_inf != end) { + testing_range_low_.set_value_async(index_start_primal, stream_view_); + } else // No negInf found, start is 0 + testing_range_low_.set_value_async(index_start_primal, stream_view_); + if (lowest_inf != end) { + cuopt_assert(device_to_host_value(thrust::raw_pointer_cast(&*lowest_inf)) == std::numeric_limits::infinity(), "Incorrect primal iterator"); - index_end_primal = - thrust::raw_pointer_cast(lowest_inf) - - threshold_ - .data(); // no - 1 to go before the first inf because end is not included - testing_range_high_.set_value_async(index_end_primal, stream_view_); - } else // No inf found, end is primal_size_h_ - testing_range_high_.set_value_async(index_end_primal, stream_view_); - cuopt_assert(index_start_primal <= index_end_primal, - "Start should be strictly smaller than end"); - - cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), - threshold_.data() + index_start_primal, - threshold_.data() + index_end_primal, - is_nan_or_inf()), - "Threshold vector should not 
contain inf or NaN values"); - - // Init parameters for live kernel - // Has to do this to pass lvalues (and not rvalue) to void* kernel_args - auto restart_view = this->view(); - auto op_view = problem_ptr->view(); - i_t* testing_range_low = testing_range_low_.data(); - i_t* testing_range_high = testing_range_high_.data(); - f_t* test_radius_squared = test_radius_squared_.data(); - f_t* low_radius_squared = low_radius_squared_.data(); - f_t* high_radius_squared = high_radius_squared_.data(); - f_t* distance_traveled = duality_gap.distance_traveled_.data(); - - void* kernel_args[] = { - &restart_view, - &op_view, - &testing_range_low, - &testing_range_high, - &test_radius_squared, - &low_radius_squared, - &high_radius_squared, - &distance_traveled, - }; - constexpr int numThreads = 128; - dim3 dimBlock(numThreads, 1, 1); - // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * - // numBlocksPerSm - dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); - // Compute the median for the join problem, while loop is inside the live kernel - RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( - (void*)solve_bound_constrained_trust_region_kernel, - dimGrid, - dimBlock, - kernel_args, - 0, - stream_view_)); - - // Find max threshold for the join problem - const f_t* max_threshold = - thrust::max_element(handle_ptr_->get_thrust_policy(), - threshold_.data(), - threshold_.data() + primal_size_h_ + dual_size_h_); - - // we have now determined the test_threshold that should minimize the objective value of the - // solution. 
- - // if no component got fixed by their upper bound we can pick the maximum threshold to be the - // target_threshold which was computed before the loop in the direction_and_threshold_kernel - // Otherwise use the test_threshold determined in the loop - // { - target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( - this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - // } - - // Compute x (the solution which is defined by moving each component test_threshold * - // direction[component]) clamp on upper and lower bounds. - // Used unsorted_direction_full_ as the other one got sorted - // { - raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), - duality_gap.primal_solution_.data(), - unsorted_direction_full_.data(), - primal_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_.data(), - unsorted_direction_full_.data() + primal_size_h_, - dual_size_h_, - a_add_scalar_times_b(target_threshold_.data()), - stream_view_); - // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part - using f_t2 = typename type_2::type; - cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), - problem_ptr->variable_bounds.data()), - duality_gap.primal_solution_tr_.data(), - primal_size_h_, - clamp(), - stream_view_.value()); - - // project by max(min(y[i], upperbound[i]),lowerbound[i]) - raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), - duality_gap.dual_solution_tr_.data(), - transformed_constraint_lower_bounds_.data(), - transformed_constraint_upper_bounds_.data(), - dual_size_h_, - constraint_clamp(), - stream_view_); - // } - } - - // Compute the current lower bound for the objective value using the primal solution_tr and - // upper bound for the objective value using the dual solution_tr + index_end_primal = + 
thrust::raw_pointer_cast(lowest_inf) - + threshold_.data(); // no - 1 to go before the first inf because end is not included + testing_range_high_.set_value_async(index_end_primal, stream_view_); + } else // No inf found, end is primal_size_h_ + testing_range_high_.set_value_async(index_end_primal, stream_view_); + cuopt_assert(index_start_primal <= index_end_primal, + "Start should be strictly smaller than end"); + + cuopt_assert(!thrust::any_of(handle_ptr_->get_thrust_policy(), + threshold_.data() + index_start_primal, + threshold_.data() + index_end_primal, + is_nan_or_inf()), + "Threshold vector should not contain inf or NaN values"); + + // Init parameters for live kernel + // Has to do this to pass lvalues (and not rvalue) to void* kernel_args + auto restart_view = this->view(); + auto op_view = problem_ptr->view(); + i_t* testing_range_low = testing_range_low_.data(); + i_t* testing_range_high = testing_range_high_.data(); + f_t* test_radius_squared = test_radius_squared_.data(); + f_t* low_radius_squared = low_radius_squared_.data(); + f_t* high_radius_squared = high_radius_squared_.data(); + f_t* distance_traveled = duality_gap.distance_traveled_.data(); + + void* kernel_args[] = { + &restart_view, + &op_view, + &testing_range_low, + &testing_range_high, + &test_radius_squared, + &low_radius_squared, + &high_radius_squared, + &distance_traveled, + }; + constexpr int numThreads = 128; + dim3 dimBlock(numThreads, 1, 1); + // shared_live_kernel_accumulator_.size() contains deviceProp.multiProcessorCount * + // numBlocksPerSm + dim3 dimGrid(shared_live_kernel_accumulator_.size(), 1, 1); + // Compute the median for the join problem, while loop is inside the live kernel + RAFT_CUDA_TRY(cudaLaunchCooperativeKernel( + (void*)solve_bound_constrained_trust_region_kernel, + dimGrid, + dimBlock, + kernel_args, + 0, + stream_view_)); + + // Find max threshold for the join problem + const f_t* max_threshold = + thrust::max_element(handle_ptr_->get_thrust_policy(), + 
threshold_.data(), + threshold_.data() + primal_size_h_ + dual_size_h_); + + // we have now determined the test_threshold that should minimize the objective value of the + // solution. + + // if no component got fixed by their upper bound we can pick the maximum threshold to be the + // target_threshold which was computed before the loop in the direction_and_threshold_kernel + // Otherwise use the test_threshold determined in the loop // { - // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, - // primal_gradient)) - compute_bound(duality_gap.primal_solution_tr_, - duality_gap.primal_solution_, - duality_gap.primal_gradient_, - duality_gap.lagrangian_value_, - primal_size_h_, - primal_stride, - tmp_primal, - duality_gap.lower_bound_value_); - - // compute 'upper bound' using dual - compute_bound(duality_gap.dual_solution_tr_, - duality_gap.dual_solution_, - duality_gap.dual_gradient_, - duality_gap.lagrangian_value_, - dual_size_h_, - dual_stride, - tmp_dual, - duality_gap.upper_bound_value_); + target_threshold_determination_kernel<<<1, 1, 0, stream_view_>>>( + this->view(), duality_gap.distance_traveled_.data(), max_threshold, max_threshold); + RAFT_CUDA_TRY(cudaPeekAtLastError()); + // } + // Compute x (the solution which is defined by moving each component test_threshold * + // direction[component]) clamp on upper and lower bounds. 
+ // Used unsorted_direction_full_ as the other one got sorted + // { + raft::linalg::binaryOp(duality_gap.primal_solution_tr_.data(), + duality_gap.primal_solution_.data(), + unsorted_direction_full_.data(), + primal_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + raft::linalg::binaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_.data(), + unsorted_direction_full_.data() + primal_size_h_, + dual_size_h_, + a_add_scalar_times_b(target_threshold_.data()), + stream_view_); + // project by max(min(x[i], upperbound[i]),lowerbound[i]) for primal part + using f_t2 = typename type_2::type; + cub::DeviceTransform::Transform(cuda::std::make_tuple(duality_gap.primal_solution_tr_.data(), + problem_ptr->variable_bounds.data()), + duality_gap.primal_solution_tr_.data(), + primal_size_h_, + clamp(), + stream_view_.value()); + + // project by max(min(y[i], upperbound[i]),lowerbound[i]) + raft::linalg::ternaryOp(duality_gap.dual_solution_tr_.data(), + duality_gap.dual_solution_tr_.data(), + transformed_constraint_lower_bounds_.data(), + transformed_constraint_upper_bounds_.data(), + dual_size_h_, + constraint_clamp(), + stream_view_); // } } - template - void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( - localized_duality_gap_container_t & duality_gap, - rmm::device_uvector & primal_weight, - rmm::device_uvector & tmp_primal, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); - // norm( - // new_primal_solution - last_restart.primal_solution, - // )^2 - - // Julia / Paper use a weighted norm using primal weight for primal / dual distance - // We simply use L2 norm of diff - distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, - last_restart_duality_gap_.primal_solution_, - tmp_primal, - primal_size_h_, - primal_stride, - duality_gap.primal_distance_traveled_); - - // compute similarly for dual - 
distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, - last_restart_duality_gap_.dual_solution_, - tmp_dual, - dual_size_h_, - dual_stride, - duality_gap.dual_distance_traveled_); - - // distance_traveled = primal_distance * 0.5 * primal_weight - // + dual_distance * 0.5 / primal_weight - compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( - duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); - RAFT_CUDA_TRY(cudaPeekAtLastError()); - } + // Compute the current lower bound for the objective value using the primal solution_tr and + // upper bound for the objective value using the dual solution_tr + // { + // -> compute 'lower bound' for saddle point (langrangian + dot(primal_tr - primal_solution, + // primal_gradient)) + compute_bound(duality_gap.primal_solution_tr_, + duality_gap.primal_solution_, + duality_gap.primal_gradient_, + duality_gap.lagrangian_value_, + primal_size_h_, + primal_stride, + tmp_primal, + duality_gap.lower_bound_value_); + + // compute 'upper bound' using dual + compute_bound(duality_gap.dual_solution_tr_, + duality_gap.dual_solution_, + duality_gap.dual_gradient_, + duality_gap.lagrangian_value_, + dual_size_h_, + dual_stride, + tmp_dual, + duality_gap.upper_bound_value_); + + // } +} - template - void pdlp_restart_strategy_t::compute_primal_gradient( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view) - { - raft::common::nvtx::range fun_scope("compute_primal_gradient"); +template +void pdlp_restart_strategy_t::compute_distance_traveled_from_last_restart( + localized_duality_gap_container_t& duality_gap, + rmm::device_uvector& primal_weight, + rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_distance_traveled_from_last_restart"); + // norm( + // new_primal_solution - last_restart.primal_solution, + // )^2 + + // Julia / Paper use a weighted norm using primal weight for 
primal / dual distance + // We simply use L2 norm of diff + distance_squared_moved_from_last_restart_period(duality_gap.primal_solution_, + last_restart_duality_gap_.primal_solution_, + tmp_primal, + primal_size_h_, + primal_stride, + duality_gap.primal_distance_traveled_); + + // compute similarly for dual + distance_squared_moved_from_last_restart_period(duality_gap.dual_solution_, + last_restart_duality_gap_.dual_solution_, + tmp_dual, + dual_size_h_, + dual_stride, + duality_gap.dual_distance_traveled_); + + // distance_traveled = primal_distance * 0.5 * primal_weight + // + dual_distance * 0.5 / primal_weight + compute_distance_traveled_last_restart_kernel<<<1, 1, 0, stream_view_>>>( + duality_gap.view(), primal_weight.data(), duality_gap.distance_traveled_.data()); + RAFT_CUDA_TRY(cudaPeekAtLastError()); +} + +template +void pdlp_restart_strategy_t::compute_primal_gradient( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view) +{ + raft::common::nvtx::range fun_scope("compute_primal_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute primal gradient:" << std::endl; + std::cout << " Compute primal gradient:" << std::endl; #endif - // for QP add problem.objective_matrix * primal_solution as well - // c - A^T*y (copy c to primal_gradient for correct writing of result) - raft::copy(duality_gap.primal_gradient_.data(), - problem_ptr->objective_coefficients.data(), - primal_size_h_, - stream_view_); - - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_neg_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_1_.data(), - cusparse_view.primal_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); - } - - template - __global__ void compute_subgradient_kernel( - const typename pdlp_restart_strategy_t::view_t restart_strategy_view, - const 
typename problem_t::view_t op_problem_view, - const typename localized_duality_gap_container_t::view_t duality_gap_view, - f_t* subgradient) - { - i_t id = threadIdx.x + blockIdx.x * blockDim.x; - if (id >= duality_gap_view.dual_size) { return; } - - f_t lower = op_problem_view.constraint_lower_bounds[id]; - f_t upper = op_problem_view.constraint_upper_bounds[id]; - f_t primal_product = duality_gap_view.dual_gradient[id]; - f_t dual_solution = duality_gap_view.dual_solution[id]; + // for QP add problem.objective_matrix * primal_solution as well + // c - A^T*y (copy c to primal_gradient for correct writing of result) + raft::copy(duality_gap.primal_gradient_.data(), + problem_ptr->objective_coefficients.data(), + primal_size_h_, + stream_view_); - f_t subgradient_coefficient; + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_neg_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_1_.data(), + cusparse_view.primal_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); +} - if (dual_solution < f_t(0)) { - subgradient_coefficient = upper; - } else if (dual_solution > f_t(0)) { - subgradient_coefficient = lower; - } else if (!isfinite(upper) && !isfinite(lower)) { - subgradient_coefficient = f_t(0); - } else if (!isfinite(upper) && isfinite(lower)) { +template +__global__ void compute_subgradient_kernel( + const typename pdlp_restart_strategy_t::view_t restart_strategy_view, + const typename problem_t::view_t op_problem_view, + const typename localized_duality_gap_container_t::view_t duality_gap_view, + f_t* subgradient) +{ + i_t id = threadIdx.x + blockIdx.x * blockDim.x; + if (id >= duality_gap_view.dual_size) { return; } + + f_t lower = op_problem_view.constraint_lower_bounds[id]; + f_t upper = op_problem_view.constraint_upper_bounds[id]; + f_t primal_product = 
duality_gap_view.dual_gradient[id]; + f_t dual_solution = duality_gap_view.dual_solution[id]; + + f_t subgradient_coefficient; + + if (dual_solution < f_t(0)) { + subgradient_coefficient = upper; + } else if (dual_solution > f_t(0)) { + subgradient_coefficient = lower; + } else if (!isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = f_t(0); + } else if (!isfinite(upper) && isfinite(lower)) { + subgradient_coefficient = lower; + } else if (isfinite(upper) && !isfinite(lower)) { + subgradient_coefficient = upper; + } else { + if (primal_product < lower) { subgradient_coefficient = lower; - } else if (isfinite(upper) && !isfinite(lower)) { + } else if (primal_product > upper) { subgradient_coefficient = upper; } else { - if (primal_product < lower) { - subgradient_coefficient = lower; - } else if (primal_product > upper) { - subgradient_coefficient = upper; - } else { - subgradient_coefficient = primal_product; - } + subgradient_coefficient = primal_product; } - - subgradient[id] = subgradient_coefficient; } - template - void pdlp_restart_strategy_t::compute_dual_gradient( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_dual_gradient"); + subgradient[id] = subgradient_coefficient; +} + +template +void pdlp_restart_strategy_t::compute_dual_gradient( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_dual_gradient"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute dual gradient:" << std::endl; + std::cout << " Compute dual gradient:" << std::endl; #endif - // b - A*x - // is changed with the introduction of constraint upper and lower bounds - - // gradient constains primal_product - RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - 
reusable_device_scalar_value_1_.data(), - cusparse_view.A, - cusparse_view.primal_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.dual_gradient, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_non_transpose.data(), - stream_view_)); - - // tmp_dual will contain the subgradient - i_t number_of_blocks = dual_size_h_ / block_size; - if (dual_size_h_ % block_size) number_of_blocks++; - i_t number_of_threads = std::min(dual_size_h_, block_size); - compute_subgradient_kernel<<>>( - this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); - - // dual gradient = subgradient - primal_product (tmp_dual-dual_gradient) - raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), - tmp_dual.data(), - duality_gap.dual_gradient_.data(), - dual_size_h_, - stream_view_); - } + // b - A*x + // is changed with the introduction of constraint upper and lower bounds - template - void pdlp_restart_strategy_t::compute_lagrangian_value( - localized_duality_gap_container_t & duality_gap, - cusparse_view_t & cusparse_view, - rmm::device_uvector & tmp_primal, - rmm::device_uvector & tmp_dual) - { - raft::common::nvtx::range fun_scope("compute_lagrangian_value"); + // gradient constains primal_product + RAFT_CUSPARSE_TRY( + raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A, + cusparse_view.primal_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.dual_gradient, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_non_transpose.data(), + stream_view_)); + + // tmp_dual will contain the subgradient + i_t number_of_blocks = dual_size_h_ / block_size; + if (dual_size_h_ % block_size) number_of_blocks++; + i_t number_of_threads = std::min(dual_size_h_, block_size); + compute_subgradient_kernel<<>>( + this->view(), problem_ptr->view(), duality_gap.view(), tmp_dual.data()); + + // dual gradient = subgradient - 
primal_product (tmp_dual-dual_gradient) + raft::linalg::eltwiseSub(duality_gap.dual_gradient_.data(), + tmp_dual.data(), + duality_gap.dual_gradient_.data(), + dual_size_h_, + stream_view_); +} + +template +void pdlp_restart_strategy_t::compute_lagrangian_value( + localized_duality_gap_container_t& duality_gap, + cusparse_view_t& cusparse_view, + rmm::device_uvector& tmp_primal, + rmm::device_uvector& tmp_dual) +{ + raft::common::nvtx::range fun_scope("compute_lagrangian_value"); #ifdef PDLP_DEBUG_MODE - std::cout << " Compute lagrangian value:" << std::endl; + std::cout << " Compute lagrangian value:" << std::endl; #endif - // if QP - // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + - // dot(primal_solution, problem.objective_vector) - - // dot(primal_solution, problem.constraint_matrix' * dual_solution) + - // dot(dual_solution, dual_gradient+primal_product) + - // problem.objective_constant + // if QP + // 0.5 * dot(primal_solution, problem.objective_matrix * primal_solution) + + // dot(primal_solution, problem.objective_vector) - + // dot(primal_solution, problem.constraint_matrix' * dual_solution) + + // dot(dual_solution, dual_gradient+primal_product) + + // problem.objective_constant - // when lp first term is irrelevant + // when lp first term is irrelevant - // second term - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - problem_ptr->objective_coefficients.data(), - primal_stride, - reusable_device_scalar_1_.data(), - stream_view_)); + // second term + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + problem_ptr->objective_coefficients.data(), + primal_stride, + reusable_device_scalar_1_.data(), + stream_view_)); - // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot - 
RAFT_CUSPARSE_TRY( - raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, - reusable_device_scalar_value_1_.data(), - cusparse_view.A_T, - cusparse_view.dual_solution, - reusable_device_scalar_value_0_.data(), - cusparse_view.tmp_primal, - CUSPARSE_SPMV_CSR_ALG2, - (f_t*)cusparse_view.buffer_transpose.data(), - stream_view_)); + // third term, let beta be 0 to not add what is in tmp_primal, compute it and compute dot + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmv(handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + cusparse_view.A_T, + cusparse_view.dual_solution, + reusable_device_scalar_value_0_.data(), + cusparse_view.tmp_primal, + CUSPARSE_SPMV_CSR_ALG2, + (f_t*)cusparse_view.buffer_transpose.data(), + stream_view_)); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - primal_size_h_, - duality_gap.primal_solution_.data(), - primal_stride, - tmp_primal.data(), - primal_stride, - reusable_device_scalar_2_.data(), - stream_view_)); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + primal_size_h_, + duality_gap.primal_solution_.data(), + primal_stride, + tmp_primal.data(), + primal_stride, + reusable_device_scalar_2_.data(), + stream_view_)); - // fourth term //tmp_dual still contains subgradient from the dual_gradient computation - reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); - RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), - dual_size_h_, - duality_gap.dual_solution_.data(), - dual_stride, - tmp_dual.data(), - dual_stride, - reusable_device_scalar_3_.data(), - stream_view_)); + // fourth term //tmp_dual still contains subgradient from the dual_gradient computation + reusable_device_scalar_3_.set_value_to_zero_async(stream_view_); + RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(handle_ptr_->get_cublas_handle(), + 
dual_size_h_, + duality_gap.dual_solution_.data(), + dual_stride, + tmp_dual.data(), + dual_stride, + reusable_device_scalar_3_.data(), + stream_view_)); - // subtract third term from second up - raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_2_.data(), - 1, - stream_view_); - raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), - reusable_device_scalar_1_.data(), - reusable_device_scalar_3_.data(), - 1, - stream_view_); - } + // subtract third term from second up + raft::linalg::eltwiseSub(reusable_device_scalar_1_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_2_.data(), + 1, + stream_view_); + raft::linalg::eltwiseAdd(duality_gap.lagrangian_value_.data(), + reusable_device_scalar_1_.data(), + reusable_device_scalar_3_.data(), + 1, + stream_view_); +} - template - void pdlp_restart_strategy_t::reset_internal() - { - candidate_is_avg_.set_value_to_zero_async(stream_view_); - restart_triggered_.set_value_to_zero_async(stream_view_); - } +template +void pdlp_restart_strategy_t::reset_internal() +{ + candidate_is_avg_.set_value_to_zero_async(stream_view_); + restart_triggered_.set_value_to_zero_async(stream_view_); +} - template - typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() - { - pdlp_restart_strategy_t::view_t v{}; - v.primal_size = primal_size_h_; - v.dual_size = dual_size_h_; - v.transformed_constraint_lower_bounds = raft::device_span{ - transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; - v.transformed_constraint_upper_bounds = raft::device_span{ - transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; - v.last_restart_length = last_restart_length_; +template +typename pdlp_restart_strategy_t::view_t pdlp_restart_strategy_t::view() +{ + pdlp_restart_strategy_t::view_t v{}; + v.primal_size = primal_size_h_; + v.dual_size = dual_size_h_; + 
v.transformed_constraint_lower_bounds = raft::device_span{ + transformed_constraint_lower_bounds_.data(), transformed_constraint_lower_bounds_.size()}; + v.transformed_constraint_upper_bounds = raft::device_span{ + transformed_constraint_upper_bounds_.data(), transformed_constraint_upper_bounds_.size()}; + v.last_restart_length = last_restart_length_; - v.weights = raft::device_span{weights_.data(), weights_.size()}; + v.weights = raft::device_span{weights_.data(), weights_.size()}; - v.candidate_is_avg = candidate_is_avg_.data(); - v.restart_triggered = restart_triggered_.data(); + v.candidate_is_avg = candidate_is_avg_.data(); + v.restart_triggered = restart_triggered_.data(); - v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); + v.gap_reduction_ratio_last_trial = gap_reduction_ratio_last_trial_.data(); - v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; - v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; - v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; - v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; - v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; - v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; - v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; + v.center_point = raft::device_span{center_point_.data(), center_point_.size()}; + v.objective_vector = raft::device_span{objective_vector_.data(), objective_vector_.size()}; + v.direction_full = raft::device_span{direction_full_.data(), direction_full_.size()}; + v.threshold = raft::device_span{threshold_.data(), threshold_.size()}; + v.lower_bound = raft::device_span{lower_bound_.data(), lower_bound_.size()}; + v.upper_bound = raft::device_span{upper_bound_.data(), upper_bound_.size()}; + v.test_point = raft::device_span{test_point_.data(), test_point_.size()}; - 
v.target_threshold = target_threshold_.data(); - v.low_radius_squared = low_radius_squared_.data(); - v.high_radius_squared = high_radius_squared_.data(); - v.test_radius_squared = test_radius_squared_.data(); + v.target_threshold = target_threshold_.data(); + v.low_radius_squared = low_radius_squared_.data(); + v.high_radius_squared = high_radius_squared_.data(); + v.test_radius_squared = test_radius_squared_.data(); - v.testing_range_low = testing_range_low_.data(); - v.testing_range_high = testing_range_high_.data(); + v.testing_range_low = testing_range_low_.data(); + v.testing_range_high = testing_range_high_.data(); - v.shared_live_kernel_accumulator = raft::device_span{ - shared_live_kernel_accumulator_.data(), shared_live_kernel_accumulator_.size()}; + v.shared_live_kernel_accumulator = raft::device_span{shared_live_kernel_accumulator_.data(), + shared_live_kernel_accumulator_.size()}; - v.hyper_params = hyper_params_; + v.hyper_params = hyper_params_; - return v; - } + return v; +} - template - typename pdlp_restart_strategy_t::cupdlpx_restart_view_t - pdlp_restart_strategy_t::make_cupdlpx_restart_view( - const rmm::device_uvector& primal_distance, - const rmm::device_uvector& dual_distance, - const convergence_information_t& current_convergence_information, - const rmm::device_uvector& step_size, - rmm::device_uvector& primal_weight, - rmm::device_uvector& best_primal_weight, - rmm::device_uvector& primal_step_size, - rmm::device_uvector& dual_step_size) - { - cupdlpx_restart_view_t v{}; - v.primal_distance = make_span(primal_distance); - v.dual_distance = make_span(dual_distance); - v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); - v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); - v.l2_norm_primal_linear_objective = - current_convergence_information.get_relative_dual_tolerance_factor(); - v.l2_norm_primal_right_hand_side = - 
current_convergence_information.get_relative_primal_tolerance_factor(); - v.step_size = make_span(step_size); - v.primal_weight = make_span(primal_weight); - v.primal_weight_error_sum = make_span(primal_weight_error_sum_); - v.primal_weight_last_error = make_span(primal_weight_last_error_); - v.best_primal_weight = make_span(best_primal_weight); - v.new_primal_step_size = make_span(primal_step_size); - v.new_dual_step_size = make_span(dual_step_size); - v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); - v.hyper_params = hyper_params_; - return v; - } +template +typename pdlp_restart_strategy_t::cupdlpx_restart_view_t +pdlp_restart_strategy_t::make_cupdlpx_restart_view( + const rmm::device_uvector& primal_distance, + const rmm::device_uvector& dual_distance, + const convergence_information_t& current_convergence_information, + const rmm::device_uvector& step_size, + rmm::device_uvector& primal_weight, + rmm::device_uvector& best_primal_weight, + rmm::device_uvector& primal_step_size, + rmm::device_uvector& dual_step_size) +{ + cupdlpx_restart_view_t v{}; + v.primal_distance = make_span(primal_distance); + v.dual_distance = make_span(dual_distance); + v.l2_dual_residual = make_span(current_convergence_information.get_l2_dual_residual()); + v.l2_primal_residual = make_span(current_convergence_information.get_l2_primal_residual()); + v.l2_norm_primal_linear_objective = + current_convergence_information.get_relative_dual_tolerance_factor(); + v.l2_norm_primal_right_hand_side = + current_convergence_information.get_relative_primal_tolerance_factor(); + v.step_size = make_span(step_size); + v.primal_weight = make_span(primal_weight); + v.primal_weight_error_sum = make_span(primal_weight_error_sum_); + v.primal_weight_last_error = make_span(primal_weight_last_error_); + v.best_primal_weight = make_span(best_primal_weight); + v.new_primal_step_size = make_span(primal_step_size); + v.new_dual_step_size = make_span(dual_step_size); + 
v.best_primal_dual_residual_gap = make_span(best_primal_dual_residual_gap_); + v.hyper_params = hyper_params_; + return v; +} - template - i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const - { - return weighted_average_solution_.get_iterations_since_last_restart(); - } +template +i_t pdlp_restart_strategy_t::get_iterations_since_last_restart() const +{ + return weighted_average_solution_.get_iterations_since_last_restart(); +} - template - void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) - { - last_restart_was_average_ = value; - } +template +void pdlp_restart_strategy_t::set_last_restart_was_average(bool value) +{ + last_restart_was_average_ = value; +} - template - bool pdlp_restart_strategy_t::get_last_restart_was_average() const - { - return last_restart_was_average_; - } +template +bool pdlp_restart_strategy_t::get_last_restart_was_average() const +{ + return last_restart_was_average_; +} #define INSTANTIATE(F_TYPE) \ template class pdlp_restart_strategy_t; \ @@ -2523,11 +2524,11 @@ void pdlp_restart_strategy_t::solve_bound_constrained_trust_region( F_TYPE* primal_product); #if MIP_INSTANTIATE_FLOAT - INSTANTIATE(float) +INSTANTIATE(float) #endif #if MIP_INSTANTIATE_DOUBLE - INSTANTIATE(double) +INSTANTIATE(double) #endif } // namespace cuopt::linear_programming::detail From 73a52b1adb0fd44f598844eb0244de2d1bacefa9 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 20 Feb 2026 15:50:26 +0000 Subject: [PATCH 16/43] fix use overall time limit, reduce memory consumtion and add a bigger buffer --- cpp/src/branch_and_bound/pseudo_costs.cpp | 5 +++- cpp/src/pdlp/pdlp.cu | 6 ++--- cpp/src/pdlp/solve.cu | 33 +++++++---------------- cpp/src/pdlp/translate.hpp | 2 ++ 4 files changed, 19 insertions(+), 27 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 1a745865e8..926b25cd89 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ 
b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -402,10 +402,13 @@ void strong_branching(const user_problem_t& original_problem, fraction_values.push_back(original_root_soln_x[j]); } + f_t elapsed_time = toc(start_time); + pdlp_settings.time_limit = std::max(0.0, settings.time_limit - elapsed_time); + const auto mps_model = simplex_problem_to_mps_data_model(original_problem); const raft::handle_t batch_pdlp_handle; const auto solutions = - batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values); + batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Find max iteration on how many are done accross the batch diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index aab9ffdd5d..c3e1e7ab8f 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -446,7 +446,7 @@ std::optional> pdlp_solver_t } // Check for concurrent limit - if (settings_.concurrent_halt != nullptr && *settings_.concurrent_halt == 1) { + if (settings_.concurrent_halt != nullptr && settings_.concurrent_halt->load() == 1) { #ifdef PDLP_VERBOSE_MODE RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "Concurrent Limit reached, returning current solution" << std::endl; @@ -2295,8 +2295,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co if (is_major_iteration || artificial_restart_check_main_loop || error_occured || is_conditional_major) { if (verbose) { - std::cout << "-------------------------------" << std::endl; - std::cout << internal_solver_iterations_ << std::endl; + std::cout << "-------------------------------" << std::endl; + std::cout << internal_solver_iterations_ << std::endl; raft::print_device_vector("step_size", step_size_.data(), step_size_.size(), std::cout); raft::print_device_vector( "primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index fa0c79e391..3592798545 
100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -727,13 +727,13 @@ static size_t batch_pdlp_memory_estimator(const optimization_problem_t total_memory += trial_batch_size * problem.get_n_constraints() * sizeof(f_t); // Data for the solution - total_memory += problem.get_n_variables() * max_batch_size * sizeof(f_t); - total_memory += problem.get_n_constraints() * max_batch_size * sizeof(f_t); - total_memory += problem.get_n_variables() * max_batch_size * sizeof(f_t); + total_memory += problem.get_n_variables() * trial_batch_size * sizeof(f_t); + total_memory += problem.get_n_constraints() * trial_batch_size * sizeof(f_t); + total_memory += problem.get_n_variables() * trial_batch_size * sizeof(f_t); - // Add a 50% overhead to make sure we have enough memory considering other parts of the solver may - // allocate at the same time - total_memory *= 1.5; + // Add a 70% overhead to make sure we have enough memory considering other parts of the solver may + // need memory later while the batch PDLP is running + total_memory *= 1.7; // Data from saddle point state return total_memory; @@ -815,9 +815,10 @@ optimization_problem_solution_t run_batch_pdlp( } } - rmm::device_uvector full_primal_solution(problem.get_n_variables() * max_batch_size, stream); - rmm::device_uvector full_dual_solution(problem.get_n_constraints() * max_batch_size, stream); - rmm::device_uvector full_reduced_cost(problem.get_n_variables() * max_batch_size, stream); + // We don't use the solutions vectors for now + rmm::device_uvector full_primal_solution(0, stream); + rmm::device_uvector full_dual_solution(0, stream); + rmm::device_uvector full_reduced_cost(0, stream); std::vector< typename optimization_problem_solution_t::additional_termination_information_t> @@ -849,20 +850,6 @@ optimization_problem_solution_t run_batch_pdlp( auto sol = solve_lp(problem, batch_settings); - // Copy results - raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), - 
sol.get_primal_solution().data(), - problem.get_n_variables() * current_batch_size, - stream); - raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), - sol.get_dual_solution().data(), - problem.get_n_constraints() * current_batch_size, - stream); - raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), - sol.get_reduced_cost().data(), - problem.get_n_variables() * current_batch_size, - stream); - auto info = sol.get_additional_termination_informations(); full_info.insert(full_info.end(), info.begin(), info.end()); diff --git a/cpp/src/pdlp/translate.hpp b/cpp/src/pdlp/translate.hpp index aebe87b140..cbef54b97f 100644 --- a/cpp/src/pdlp/translate.hpp +++ b/cpp/src/pdlp/translate.hpp @@ -9,6 +9,8 @@ #include +#include + #include #include From dbc94fd35b9898a15a355ff670e999f21f58e817 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 24 Feb 2026 15:32:57 +0000 Subject: [PATCH 17/43] switch to double for memory estimator as size_t was hitting overflow + fail safe if batch pdlp fails --- cpp/src/branch_and_bound/pseudo_costs.cpp | 6 +++ cpp/src/pdlp/solve.cu | 50 +++++++++++++++-------- 2 files changed, 38 insertions(+), 18 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 926b25cd89..0c66053c50 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -411,6 +411,12 @@ void strong_branching(const user_problem_t& original_problem, batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); + // Fail safe in case the batch PDLP failed and produced no solutions + if (solutions.get_additional_termination_informations().size() != fractional.size() * 2) { + settings.log.printf("Batch PDLP failed and produced no solutions\n"); + return; + } + // Find max iteration on how many are done accross the batch i_t max_iterations = 0; i_t 
amount_done = 0; diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 3592798545..33d8895e2a 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -678,17 +678,17 @@ optimization_problem_solution_t run_pdlp(detail::problem_t& return sol; } +// Compute in double as some cases overflow when using size_t template -static size_t batch_pdlp_memory_estimator(const optimization_problem_t& problem, - int trial_batch_size, - int max_batch_size) +static double batch_pdlp_memory_estimator(const optimization_problem_t& problem, + double trial_batch_size) { - size_t total_memory = 0; + double total_memory = 0.0; // In PDLP we store the scaled version of the problem which contains all of those total_memory += problem.get_constraint_matrix_indices().size() * sizeof(i_t); total_memory += problem.get_constraint_matrix_offsets().size() * sizeof(i_t); total_memory += problem.get_constraint_matrix_values().size() * sizeof(f_t); - total_memory *= 2; // To account for the A_t matrix + total_memory *= 2.0; // To account for the A_t matrix total_memory += problem.get_objective_coefficients().size() * sizeof(f_t); total_memory += problem.get_constraint_bounds().size() * sizeof(f_t); total_memory += problem.get_variable_lower_bounds().size() * sizeof(f_t); @@ -759,32 +759,46 @@ optimization_problem_solution_t run_batch_pdlp( f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); cuopt_assert(settings.new_bounds.size() > 0, "Batch size should be greater than 0"); - const int max_batch_size = settings.new_bounds.size(); - int memory_max_batch_size = max_batch_size; + const size_t max_batch_size = settings.new_bounds.size(); + size_t memory_max_batch_size = max_batch_size; // Check if we don't hit the limit using max_batch_size - const size_t memory_estimate = - batch_pdlp_memory_estimator(problem, max_batch_size, max_batch_size); - size_t free_mem, total_mem; - RAFT_CUDA_TRY(cudaMemGetInfo(&free_mem, &total_mem)); + const double memory_estimate = 
batch_pdlp_memory_estimator(problem, max_batch_size); + size_t st_free_mem, st_total_mem; + RAFT_CUDA_TRY(cudaMemGetInfo(&st_free_mem, &st_total_mem)); + const double free_mem = static_cast(st_free_mem); + const double total_mem = static_cast(st_total_mem); + + #ifdef BATCH_VERBOSE_MODE + std::cout << "Memory estimate: " << memory_estimate << std::endl; + std::cout << "Free memory: " << free_mem << std::endl; + std::cout << "Total memory: " << total_mem << std::endl; + #endif if (memory_estimate > free_mem) { use_optimal_batch_size = true; // Decrement batch size iteratively until we find a batch size that fits while (memory_max_batch_size > 1) { - const size_t memory_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size, max_batch_size); + const double memory_estimate = + batch_pdlp_memory_estimator(problem, memory_max_batch_size); if (memory_estimate <= free_mem) { break; } + #ifdef BATCH_VERBOSE_MODE + std::cout << "Memory estimate: " << memory_estimate << std::endl; + std::cout << "Memory max batch size: " << memory_max_batch_size << std::endl; + std::cout << "Free memory: " << free_mem << std::endl; + std::cout << "Total memory: " << total_mem << std::endl; + std::cout << "--------------------------------" << std::endl; + #endif memory_max_batch_size--; } - const size_t min_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size, max_batch_size); + const double min_estimate = + batch_pdlp_memory_estimator(problem, memory_max_batch_size); cuopt_expects(min_estimate <= free_mem, error_type_t::OutOfMemoryError, "Insufficient GPU memory for batch PDLP (min batch size still too large)"); } - int optimal_batch_size = use_optimal_batch_size + size_t optimal_batch_size = use_optimal_batch_size ? 
detail::optimal_batch_size_handler(problem, memory_max_batch_size) : max_batch_size; cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, @@ -842,8 +856,8 @@ optimization_problem_solution_t run_batch_pdlp( } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } - for (int i = 0; i < max_batch_size; i += optimal_batch_size) { - const int current_batch_size = std::min(optimal_batch_size, max_batch_size - i); + for (size_t i = 0; i < max_batch_size; i += optimal_batch_size) { + const size_t current_batch_size = std::min(optimal_batch_size, max_batch_size - i); // Only take the new bounds from [i, i + current_batch_size) batch_settings.new_bounds = std::vector>( original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); From 8b1ec9373299d9a73e9e3dcddf0508804899007e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 20 Feb 2026 15:50:26 +0000 Subject: [PATCH 18/43] add support for dual simplex warm start --- cpp/src/branch_and_bound/branch_and_bound.cpp | 2 ++ cpp/src/branch_and_bound/pseudo_costs.cpp | 32 ++++++++++++++++--- cpp/src/branch_and_bound/pseudo_costs.hpp | 4 ++- cpp/src/pdlp/pdlp.cu | 21 ++++-------- .../restart_strategy/pdlp_restart_strategy.cu | 6 ++++ cpp/src/pdlp/solve.cu | 23 ++++++++----- 6 files changed, 60 insertions(+), 28 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index c46f09258c..ea2c160e1b 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2357,6 +2357,8 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut exploration_stats_.start_time, var_types_, root_relax_soln_.x, + root_relax_soln_.y, + root_relax_soln_.z, fractional, root_objective_, root_vstatus_, diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 0c66053c50..c3268427c3 100644 --- 
a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -14,7 +14,9 @@ #include -#include +#include + +#include #include @@ -276,9 +278,11 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data } else if (user_problem.row_sense[i] == 'G') { constraint_lower[i] = user_problem.rhs[i]; constraint_upper[i] = std::numeric_limits::infinity(); - } else { + } else if (user_problem.row_sense[i] == 'E') { constraint_lower[i] = user_problem.rhs[i]; constraint_upper[i] = user_problem.rhs[i]; + } else { + throw std::runtime_error("Invalid row sense: " + std::string(1, user_problem.row_sense[i])); } } @@ -354,7 +358,9 @@ void strong_branching(const user_problem_t& original_problem, const simplex_solver_settings_t& settings, f_t start_time, const std::vector& var_types, - const std::vector root_soln, + const std::vector& root_soln, + const std::vector& root_soln_y, + const std::vector& root_soln_z, const std::vector& fractional, f_t root_obj, const std::vector& root_vstatus, @@ -397,6 +403,10 @@ void strong_branching(const user_problem_t& original_problem, std::vector fraction_values; + std::vector original_root_soln_y, original_root_soln_z; + uncrush_dual_solution( + original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, original_root_soln_z); + for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; fraction_values.push_back(original_root_soln_x[j]); @@ -404,9 +414,19 @@ void strong_branching(const user_problem_t& original_problem, f_t elapsed_time = toc(start_time); pdlp_settings.time_limit = std::max(0.0, settings.time_limit - elapsed_time); - + const auto mps_model = simplex_problem_to_mps_data_model(original_problem); const raft::handle_t batch_pdlp_handle; + + + constexpr bool dual_simplex_primal_dual = false; + if (dual_simplex_primal_dual) { + pdlp_settings.set_initial_primal_solution( + original_root_soln_x.data(), original_root_soln_x.size(), 
batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + original_root_soln_y.data(), original_root_soln_y.size(), batch_pdlp_handle.get_stream()); + } + const auto solutions = batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); @@ -929,7 +949,9 @@ template void strong_branching(const user_problem_t& o const simplex_solver_settings_t& settings, double start_time, const std::vector& var_types, - const std::vector root_soln, + const std::vector& root_soln, + const std::vector& root_soln_y, + const std::vector& root_soln_z, const std::vector& fractional, double root_obj, const std::vector& root_vstatus, diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 6b6c6917b6..e8aea11428 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -522,7 +522,9 @@ void strong_branching(const user_problem_t& original_problem, const simplex_solver_settings_t& settings, f_t start_time, const std::vector& var_types, - const std::vector root_soln, + const std::vector& root_soln, + const std::vector& root_soln_y, + const std::vector& root_soln_z, const std::vector& fractional, f_t root_obj, const std::vector& root_vstatus, diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index c3e1e7ab8f..08d2ef3cd2 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2263,13 +2263,6 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co bool warm_start_was_given = settings_.get_pdlp_warm_start_data().last_restart_duality_gap_dual_solution_.size() != 0; - // In batch mode, before running the solver, we need to transpose the primal and dual solution to - // row format - if (batch_mode_) - transpose_primal_dual_to_row(pdhg_solver_.get_potential_next_primal_solution(), - pdhg_solver_.get_potential_next_dual_solution(), - pdhg_solver_.get_dual_slack()); - if 
(!inside_mip_) { CUOPT_LOG_INFO( " Iter Primal Obj. Dual Obj. Gap Primal Res. Dual Res. Time"); @@ -2332,13 +2325,6 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co } } -#ifdef CUPDLP_DEBUG_MODE - print("before scale slack", pdhg_solver_.get_dual_slack()); - print("before scale potential next primal", - pdhg_solver_.get_potential_next_primal_solution()); - print("before scale potential next dual", pdhg_solver_.get_potential_next_dual_solution()); -#endif - // In case of batch mode, primal and dual matrices are in row format // We need to transpose them to column format before doing any checks if (batch_mode_) { @@ -2354,6 +2340,13 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co pdhg_solver_.get_primal_solution(), pdhg_solver_.get_dual_solution(), dummy); } +#ifdef CUPDLP_DEBUG_MODE + print("before scale slack", pdhg_solver_.get_dual_slack()); + print("before scale potential next primal", + pdhg_solver_.get_potential_next_primal_solution()); + print("before scale potential next dual", pdhg_solver_.get_potential_next_dual_solution()); +#endif + // We go back to the unscaled problem here. 
It ensures that we do not terminate 'too early' // because of the error margin being evaluated on the scaled problem diff --git a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu index 8eacd4d246..5adcb74439 100644 --- a/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu +++ b/cpp/src/pdlp/restart_strategy/pdlp_restart_strategy.cu @@ -691,6 +691,12 @@ void pdlp_restart_strategy_t::should_cupdlpx_restart(i_t total_number_ { std::fill(should_restart.begin(), should_restart.end(), 0); + #ifdef CUPDLP_DEBUG_MODE + // Print the current stats of initial fixed point error and fixed point error + print("initial_fixed_point_error", initial_fixed_point_error_); + print("fixed_point_error", fixed_point_error_); + #endif + if (total_number_of_iterations == hyper_params_.major_iteration) { #ifdef CUPDLP_DEBUG_MODE printf("forced restart at first major\n"); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 33d8895e2a..3a277b4f85 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -744,12 +744,10 @@ optimization_problem_solution_t run_batch_pdlp( optimization_problem_t& problem, pdlp_solver_settings_t const& settings) { // Hyper parameter than can be changed, I have put what I believe to be the best - bool primal_dual_init = true; + bool pdlp_primal_dual_init = true; bool primal_weight_init = true; bool use_optimal_batch_size = false; constexpr int iteration_limit = 100000; - // Shouldn't we work on the unpresolved and/or unscaled problem for PDLP? - // Shouldn't we put an iteration limit? If yes what should we do with the partial solutions? 
rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); @@ -805,8 +803,15 @@ optimization_problem_solution_t run_batch_pdlp( "Optimal batch size should be between 1 and max batch size"); using f_t2 = typename type_2::type; - // If need warm start, solve the LP alone - if (primal_dual_init || primal_weight_init) { + // In case Dual Simplex already provided the initial primal and dual solution + if (settings.has_initial_primal_solution() && settings.has_initial_dual_solution()) { + initial_primal = rmm::device_uvector( + settings.get_initial_primal_solution(), settings.get_initial_primal_solution().stream()); + initial_dual = rmm::device_uvector( + settings.get_initial_dual_solution(), settings.get_initial_dual_solution().stream()); + } + + if (pdlp_primal_dual_init || primal_weight_init) { pdlp_solver_settings_t warm_start_settings = settings; warm_start_settings.new_bounds.clear(); warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; @@ -817,7 +822,7 @@ optimization_problem_solution_t run_batch_pdlp( warm_start_settings.inside_mip = true; optimization_problem_solution_t original_solution = solve_lp(problem, warm_start_settings); - if (primal_dual_init) { + if (pdlp_primal_dual_init) { initial_primal = rmm::device_uvector(original_solution.get_primal_solution(), original_solution.get_primal_solution().stream()); initial_dual = rmm::device_uvector(original_solution.get_dual_solution(), @@ -847,12 +852,14 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.detect_infeasibility = false; batch_settings.iteration_limit = iteration_limit; batch_settings.inside_mip = true; - if (primal_dual_init) { + if (initial_primal.size() > 0) { batch_settings.set_initial_primal_solution( initial_primal.data(), initial_primal.size(), initial_primal.stream()); batch_settings.set_initial_dual_solution( initial_dual.data(), initial_dual.size(), initial_dual.stream()); - batch_settings.set_initial_step_size(initial_step_size); + if 
(!std::isnan(initial_step_size)) { + batch_settings.set_initial_step_size(initial_step_size); + } } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } From 71e47ebbfeb41b3a65d76fd559b59175a53a0c4a Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 5 Mar 2026 15:30:54 +0100 Subject: [PATCH 19/43] handle batch pdlp being out of memory not as termination error --- cpp/src/pdlp/solve.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index e821e50f07..de538a1351 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -797,9 +797,10 @@ optimization_problem_solution_t run_batch_pdlp( } const double min_estimate = batch_pdlp_memory_estimator(problem, memory_max_batch_size); - cuopt_expects(min_estimate <= free_mem, - error_type_t::OutOfMemoryError, - "Insufficient GPU memory for batch PDLP (min batch size still too large)"); + if (min_estimate > free_mem) { + return optimization_problem_solution_t( + pdlp_termination_status_t::NumericalError, stream); + } } size_t optimal_batch_size = use_optimal_batch_size From d02544181b97fbd36ba915970c7362e57422f742 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 5 Mar 2026 15:34:00 +0100 Subject: [PATCH 20/43] add a basic batch pdlp race strategy in strong branching --- cpp/src/branch_and_bound/branch_and_bound.cpp | 3 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 196 +++++++++++++++++- 2 files changed, 195 insertions(+), 4 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 3d5cbcc64f..0d0ac23e92 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -806,7 +806,8 @@ branch_variable_t branch_and_bound_t::variable_selection( branch_and_bound_worker_t* worker) { logger_t log; - log.log = false; + // TODO put back false + log.log = true; i_t branch_var = -1; 
rounding_direction_t round_dir = rounding_direction_t::NONE; std::vector current_incumbent; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 2757f7f680..1adfb2355b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -315,6 +315,36 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data return mps_model; } +template +static cuopt::mps_parser::mps_data_model_t lp_problem_to_mps_data_model( + const lp_problem_t& lp_problem) +{ + cuopt::mps_parser::mps_data_model_t mps_model; + int m = lp_problem.num_rows; + int n = lp_problem.num_cols; + + csr_matrix_t csr_A(m, n, 0); + lp_problem.A.to_compressed_row(csr_A); + + int nz = csr_A.row_start[m]; + + mps_model.set_csr_constraint_matrix( + csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1); + + mps_model.set_objective_coefficients(lp_problem.objective.data(), n); + mps_model.set_objective_scaling_factor(lp_problem.obj_scale); + mps_model.set_objective_offset(lp_problem.obj_constant); + + mps_model.set_variable_lower_bounds(lp_problem.lower.data(), n); + mps_model.set_variable_upper_bounds(lp_problem.upper.data(), n); + + mps_model.set_constraint_lower_bounds(lp_problem.rhs.data(), m); + mps_model.set_constraint_upper_bounds(lp_problem.rhs.data(), m); + mps_model.set_maximize(lp_problem.obj_scale < 0); + + return mps_model; +} + // Merge a single strong branching result from Dual Simplex and PDLP. // Rules: // 1. If both found optimal -> keep DS (higher quality vertex solution) @@ -793,13 +823,97 @@ i_t pseudo_costs_t::reliable_variable_selection( // Shuffle the unreliable list so every variable has the same chance to be selected. 
if (unreliable_list.size() > max_num_candidates) { worker->rng.shuffle(unreliable_list); } + // Variables beyond num_candidates are solved by batch PDLP instead of Dual Simplex + std::vector pdlp_overflow_list; + bool use_pdlp = settings.mip_batch_pdlp_strong_branching == 1 && + static_cast(unreliable_list.size()) > num_candidates; + if (use_pdlp) { + pdlp_overflow_list.assign(unreliable_list.begin() + num_candidates, unreliable_list.end()); + } + + const i_t num_pdlp_vars = pdlp_overflow_list.size(); + std::vector pdlp_obj_down(num_pdlp_vars, std::numeric_limits::quiet_NaN()); + std::vector pdlp_obj_up(num_pdlp_vars, std::numeric_limits::quiet_NaN()); + + // DS can halt PDLP via concurrent_halt, but not the other way around + std::atomic concurrent_halt{0}; + std::thread pdlp_thread; + + if (use_pdlp) { + pdlp_thread = std::thread([&]() { + log.printf("RB batch PDLP: solving %d overflow unreliable variables\n", num_pdlp_vars); + + f_t start_batch = tic(); + + const auto mps_model = lp_problem_to_mps_data_model(worker->leaf_problem); + + std::vector fraction_values; + fraction_values.reserve(num_pdlp_vars); + for (i_t j : pdlp_overflow_list) { + fraction_values.push_back(solution[j]); + } + + const f_t batch_elapsed_time = toc(start_time); + const f_t batch_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (batch_remaining_time <= 0.0) { return; } + + pdlp_solver_settings_t pdlp_settings; + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.time_limit = batch_remaining_time; + + const raft::handle_t batch_pdlp_handle; + const auto solutions = batch_pdlp_solve( + &batch_pdlp_handle, mps_model, pdlp_overflow_list, fraction_values, pdlp_settings); + + f_t batch_pdlp_time = toc(start_batch); + + if (solutions.get_additional_termination_informations().size() != + static_cast(num_pdlp_vars) * 2) { + log.printf("RB batch PDLP failed and produced no solutions\n"); + return; + } + + i_t amount_done = 0; + for (i_t k 
= 0; k < num_pdlp_vars * 2; k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + amount_done++; + } + } + + log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", + batch_pdlp_time, + amount_done, + num_pdlp_vars * 2); + + for (i_t k = 0; k < num_pdlp_vars; k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + pdlp_obj_down[k] = solutions.get_dual_objective_value(k); + } + if (solutions.get_termination_status(k + num_pdlp_vars) == + pdlp_termination_status_t::Optimal) { + pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_pdlp_vars); + } + } + }); + } + if (toc(start_time) > settings.time_limit) { log.printf("Time limit reached"); + if (use_pdlp) { + concurrent_halt.store(1); + pdlp_thread.join(); + } return branch_var; } + omp_atomic_t ds_optimal{0}; + omp_atomic_t ds_infeasible{0}; + omp_atomic_t ds_failed{0}; + f_t ds_start_time = tic(); + #pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ - shared(score_mutex) + shared(score_mutex, ds_optimal, ds_infeasible, ds_failed) for (i_t i = 0; i < num_candidates; ++i) { const i_t j = unreliable_list[i]; @@ -826,7 +940,16 @@ i_t pseudo_costs_t::reliable_variable_selection( reliability_branching_settings.lower_max_lp_iter, strong_branching_lp_iter); - if (!std::isnan(obj)) { + if (std::isnan(obj)) { + ds_failed++; + } else if (std::isinf(obj)) { + ds_infeasible++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + } else { + ds_optimal++; f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); f_t change_in_x = solution[j] - std::floor(solution[j]); pseudo_cost_sum_down[j] += change_in_obj / change_in_x; @@ -857,7 +980,17 @@ i_t pseudo_costs_t::reliable_variable_selection( reliability_branching_settings.lower_max_lp_iter, 
strong_branching_lp_iter); - if (!std::isnan(obj)) { + if (std::isnan(obj)) { + ds_failed++; + } else if (std::isinf(obj)) { + // Is it ok to process infinity obj like this? + ds_infeasible++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + } else { + ds_optimal++; f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); f_t change_in_x = std::ceil(solution[j]) - solution[j]; pseudo_cost_sum_up[j] += change_in_obj / change_in_x; @@ -878,6 +1011,63 @@ i_t pseudo_costs_t::reliable_variable_selection( score_mutex.unlock(); } + f_t ds_elapsed = toc(ds_start_time); + log.printf( + "RB Dual Simplex: %d candidates, %d/%d optimal/dual-feasible, %d/%d infeasible, " + "%d/%d failed in %.2fs\n", + num_candidates, + ds_optimal.load(), + num_candidates * 2, + ds_infeasible.load(), + num_candidates * 2, + ds_failed.load(), + num_candidates * 2, + ds_elapsed); + + if (use_pdlp) { + // Dual Simplex is done on the main thread, telling Batch PDLP to stop + concurrent_halt.store(1); + pdlp_thread.join(); + + i_t pdlp_optimal = 0; + for (i_t k = 0; k < num_pdlp_vars; k++) { + const i_t j = pdlp_overflow_list[k]; + + pseudo_cost_mutex_down[j].lock(); + if (!std::isnan(pdlp_obj_down[k])) { + f_t change_in_obj = std::max(pdlp_obj_down[k] - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + pdlp_optimal++; + } + pseudo_cost_mutex_down[j].unlock(); + + pseudo_cost_mutex_up[j].lock(); + if (!std::isnan(pdlp_obj_up[k])) { + f_t change_in_obj = std::max(pdlp_obj_up[k] - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + pdlp_optimal++; + } + pseudo_cost_mutex_up[j].unlock(); + + f_t 
score = + calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); + if (score > max_score) { + max_score = score; + branch_var = j; + } + } + + log.printf( + "RB batch PDLP: %d candidates, %d/%d optimal\n", + num_pdlp_vars, + pdlp_optimal, + num_pdlp_vars * 2); + } + log.printf( "pc branching on %d. Value %e. Score %e\n", branch_var, solution[branch_var], max_score); From 3044887041e3624b35d0f16266ece7a9946f9227 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 6 Mar 2026 16:01:16 +0100 Subject: [PATCH 21/43] fix compilation issue --- cpp/src/branch_and_bound/pseudo_costs.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 1adfb2355b..1a0a1f260b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -16,7 +16,7 @@ #include -#include +#include #include From 0108de47c456d7c1168103b88929776b75bb7e97 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 9 Mar 2026 13:19:34 +0000 Subject: [PATCH 22/43] separate the two batch pdlp settings --- cpp/include/cuopt/linear_programming/constants.h | 1 + .../cuopt/linear_programming/mip/solver_settings.hpp | 1 + cpp/src/branch_and_bound/pseudo_costs.cpp | 7 ++++--- cpp/src/dual_simplex/simplex_solver_settings.hpp | 1 + cpp/src/math_optimization/solver_settings.cu | 1 + cpp/src/mip_heuristics/solver.cu | 2 ++ 6 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index 7eb0aa07d6..551d9e6319 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -69,6 +69,7 @@ #define CUOPT_MIP_CUT_CHANGE_THRESHOLD "mip_cut_change_threshold" #define CUOPT_MIP_CUT_MIN_ORTHOGONALITY "mip_cut_min_orthogonality" #define CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING 
"mip_batch_pdlp_strong_branching" +#define CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING "mip_batch_pdlp_reliability_branching" #define CUOPT_SOLUTION_FILE "solution_file" #define CUOPT_NUM_CPU_THREADS "num_cpu_threads" #define CUOPT_NUM_GPUS "num_gpus" diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 95b2dffc46..f9735e1994 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -98,6 +98,7 @@ class mip_solver_settings_t { f_t cut_change_threshold = 1e-3; f_t cut_min_orthogonality = 0.5; i_t mip_batch_pdlp_strong_branching = 1; + i_t mip_batch_pdlp_reliability_branching = 1; i_t num_gpus = 1; bool log_to_console = true; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 1a0a1f260b..2798058d55 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -825,7 +825,7 @@ i_t pseudo_costs_t::reliable_variable_selection( // Variables beyond num_candidates are solved by batch PDLP instead of Dual Simplex std::vector pdlp_overflow_list; - bool use_pdlp = settings.mip_batch_pdlp_strong_branching == 1 && + bool use_pdlp = settings.mip_batch_pdlp_reliability_branching == 1 && static_cast(unreliable_list.size()) > num_candidates; if (use_pdlp) { pdlp_overflow_list.assign(unreliable_list.begin() + num_candidates, unreliable_list.end()); @@ -881,10 +881,11 @@ i_t pseudo_costs_t::reliable_variable_selection( } } - log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", + log.printf("RB batch PDLP completed in %.2fs. 
Solved %d/%d in %.2fs\n", batch_pdlp_time, amount_done, - num_pdlp_vars * 2); + num_pdlp_vars * 2, + toc(start_batch)); for (i_t k = 0; k < num_pdlp_vars; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 815e229232..2e38117a75 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -186,6 +186,7 @@ struct simplex_solver_settings_t { f_t cut_min_orthogonality; // minimum orthogonality for cuts i_t mip_batch_pdlp_strong_branching{0}; // 0 if not using batch PDLP for strong branching, 1 if // using batch PDLP for strong branching + i_t mip_batch_pdlp_reliability_branching{0}; // 0 if not using batch PDLP for reliability branching, 1 if using it diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index f1350ca432..18e4d1b1e5 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -99,6 +99,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 1, 0}, + {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 1, 0}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, diff --git 
a/cpp/src/mip_heuristics/solver.cu b/cpp/src/mip_heuristics/solver.cu index e6f6d50b62..226d3c4b23 100644 --- a/cpp/src/mip_heuristics/solver.cu +++ b/cpp/src/mip_heuristics/solver.cu @@ -228,6 +228,8 @@ solution_t mip_solver_t::run_solver() branch_and_bound_settings.cut_min_orthogonality = context.settings.cut_min_orthogonality; branch_and_bound_settings.mip_batch_pdlp_strong_branching = context.settings.mip_batch_pdlp_strong_branching; + branch_and_bound_settings.mip_batch_pdlp_reliability_branching = + context.settings.mip_batch_pdlp_reliability_branching; if (context.settings.num_cpu_threads < 0) { branch_and_bound_settings.num_threads = std::max(1, omp_get_max_threads() - 1); From 721a56a65f2e5f12b53349bba1587f43b0a7815a Mon Sep 17 00:00:00 2001 From: Christopher Maes Date: Wed, 11 Mar 2026 09:59:38 -0700 Subject: [PATCH 23/43] Fix bug where batch PDLP for strong branching was running on problem without cuts --- cpp/src/branch_and_bound/branch_and_bound.cpp | 4 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 138 +++++++++++------- cpp/src/branch_and_bound/pseudo_costs.hpp | 4 +- 3 files changed, 88 insertions(+), 58 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 41d23bc0ff..3fc12705fd 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -2407,10 +2407,10 @@ mip_status_t branch_and_bound_t::solve(mip_solution_t& solut pc_.resize(original_lp_.num_cols); { raft::common::nvtx::range scope_sb("BB::strong_branching"); - strong_branching(original_problem_, - original_lp_, + strong_branching(original_lp_, settings_, exploration_stats_.start_time, + new_slacks_, var_types_, root_relax_soln_.x, fractional, diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index ee7e2f7803..3fd240a1e4 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp 
@@ -220,15 +220,46 @@ f_t trial_branching(const lp_problem_t& original_lp, template static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data_model( - const dual_simplex::user_problem_t& user_problem) + const dual_simplex::lp_problem_t& lp, + const std::vector& new_slacks, + const std::vector& root_soln, + std::vector& original_root_soln_x) { + + // Branch and bound has a problem of the form: + // minimize c^T x + // subject to A*x + Es = b + // l <= x <= u + // E_{jj} = sigma_j, where sigma_j is +1 or -1 + + // We need to convert this into a problem that is better for PDLP + // to solve. PDLP perfers inequality constraints. Thus, we want + // to convert the above into the problem: + // minimize c^T x + // subject to lb <= A*x <= ub + // l <= x <= u + + cuopt::mps_parser::mps_data_model_t mps_model; - int m = user_problem.num_rows; - int n = user_problem.num_cols; + int m = lp.num_rows; + int n = lp.num_cols - new_slacks.size(); + original_root_soln_x.resize(n); + + // Remove slacks from A + dual_simplex::csc_matrix_t A_no_slacks = lp.A; + std::vector cols_to_remove(lp.A.n, 0); + for (i_t j : new_slacks) { + cols_to_remove[j] = 1; + } + A_no_slacks.remove_columns(cols_to_remove); + + for (i_t j = 0; j < n; j++) { + original_root_soln_x[j] = root_soln[j]; + } // Convert CSC to CSR using built-in method dual_simplex::csr_matrix_t csr_A(m, n, 0); - user_problem.A.to_compressed_row(csr_A); + A_no_slacks.to_compressed_row(csr_A); int nz = csr_A.row_start[m]; @@ -237,70 +268,74 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1); // Set objective coefficients - mps_model.set_objective_coefficients(user_problem.objective.data(), n); + mps_model.set_objective_coefficients(lp.objective.data(), n); // Set objective scaling and offset - mps_model.set_objective_scaling_factor(user_problem.obj_scale); - mps_model.set_objective_offset(user_problem.obj_constant); + 
mps_model.set_objective_scaling_factor(lp.obj_scale); + mps_model.set_objective_offset(lp.obj_constant); // Set variable bounds - mps_model.set_variable_lower_bounds(user_problem.lower.data(), n); - mps_model.set_variable_upper_bounds(user_problem.upper.data(), n); + mps_model.set_variable_lower_bounds(lp.lower.data(), n); + mps_model.set_variable_upper_bounds(lp.upper.data(), n); // Convert row sense and RHS to constraint bounds std::vector constraint_lower(m); std::vector constraint_upper(m); - for (i_t i = 0; i < m; ++i) { - if (user_problem.row_sense[i] == 'L') { - constraint_lower[i] = -std::numeric_limits::infinity(); - constraint_upper[i] = user_problem.rhs[i]; - } else if (user_problem.row_sense[i] == 'G') { - constraint_lower[i] = user_problem.rhs[i]; - constraint_upper[i] = std::numeric_limits::infinity(); - } else { - constraint_lower[i] = user_problem.rhs[i]; - constraint_upper[i] = user_problem.rhs[i]; - } + std::vector slack_map(m, -1); + for (i_t j : new_slacks) { + const i_t col_start = lp.A.col_start[j]; + const i_t i = lp.A.i[col_start]; + slack_map[i] = j; } - for (i_t k = 0; k < user_problem.num_range_rows; ++k) { - i_t i = user_problem.range_rows[k]; - f_t r = user_problem.range_value[k]; - f_t b = user_problem.rhs[i]; - f_t h = -std::numeric_limits::infinity(); - f_t u = std::numeric_limits::infinity(); - if (user_problem.row_sense[i] == 'L') { - h = b - std::abs(r); - u = b; - } else if (user_problem.row_sense[i] == 'G') { - h = b; - u = b + std::abs(r); - } else if (user_problem.row_sense[i] == 'E') { - if (r > 0) { - h = b; - u = b + std::abs(r); - } else { - h = b - std::abs(r); - u = b; - } + for (i_t i = 0; i < m; ++i) { + // Each row is of the form a_i^T x + sigma * s_i = b_i + // with sigma = +1 or -1 + // and l_i <= s_i <= u_i + // We have that a_i^T x - b_i = -sigma * s_i + // If sigma = -1, then we have + // a_i^T x - b_i = s_i + // l_i <= a_i^T x - b_i <= u_i + // l_i + b_i <= a_i^T x <= u_i + b_i + // + // If sigma = +1, then we 
have + // a_i^T x - b_i = -s_i + // -a_i^T x + b_i = s_i + // l_i <= -a_i^T x + b_i <= u_i + // l_i - b_i <= -a_i^T x <= u_i - b_i + // -u_i + b_i <= a_i^T x <= -l_i + b_i + + const i_t slack = slack_map[i]; + assert(slack != -1); + const i_t col_start = lp.A.col_start[slack]; + const f_t sigma = lp.A.x[col_start]; + const f_t slack_lower = lp.lower[slack]; + const f_t slack_upper = lp.upper[slack]; + + if (sigma == -1) { + constraint_lower[i] = slack_lower + lp.rhs[i]; + constraint_upper[i] = slack_upper + lp.rhs[i]; + } else if (sigma == 1) { + constraint_lower[i] = -slack_upper + lp.rhs[i]; + constraint_upper[i] = -slack_lower + lp.rhs[i]; + } else { + assert(sigma == 1.0 || sigma == -1.0); } - constraint_lower[i] = h; - constraint_upper[i] = u; } mps_model.set_constraint_lower_bounds(constraint_lower.data(), m); mps_model.set_constraint_upper_bounds(constraint_upper.data(), m); - mps_model.set_maximize(user_problem.obj_scale < 0); + mps_model.set_maximize(lp.obj_scale < 0); return mps_model; } template -void strong_branching(const user_problem_t& original_problem, - const lp_problem_t& original_lp, +void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, f_t start_time, + const std::vector& new_slacks, const std::vector& var_types, const std::vector root_soln, const std::vector& fractional, @@ -321,14 +356,10 @@ void strong_branching(const user_problem_t& original_problem, settings.log.printf("Batch PDLP strong branching enabled\n"); f_t start_batch = tic(); + std::vector original_root_soln_x; - // Use original_problem to create the BatchLP problem - csr_matrix_t A_row(original_problem.A.m, original_problem.A.n, 0); - original_problem.A.to_compressed_row(A_row); + const auto mps_model = simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); - // Convert the root_soln to the original problem space - std::vector original_root_soln_x; - uncrush_primal_solution(original_problem, 
original_lp, root_soln, original_root_soln_x); std::vector fraction_values; @@ -337,7 +368,6 @@ void strong_branching(const user_problem_t& original_problem, fraction_values.push_back(original_root_soln_x[j]); } - const auto mps_model = simplex_problem_to_mps_data_model(original_problem); const f_t batch_elapsed_time = toc(start_time); const f_t batch_remaining_time = std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); @@ -776,10 +806,10 @@ void pseudo_costs_t::update_pseudo_costs_from_strong_branching( template class pseudo_costs_t; -template void strong_branching(const user_problem_t& original_problem, - const lp_problem_t& original_lp, +template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, double start_time, + const std::vector& new_slacks, const std::vector& var_types, const std::vector root_soln, const std::vector& fractional, diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 6b6c6917b6..3323f8bd6f 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -517,10 +517,10 @@ class pseudo_costs_t { }; template -void strong_branching(const user_problem_t& original_problem, - const lp_problem_t& original_lp, +void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, f_t start_time, + const std::vector& new_slacks, const std::vector& var_types, const std::vector root_soln, const std::vector& fractional, From ba1e4bd72744f023ff62690b471db092e8596afe Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Fri, 13 Mar 2026 11:45:44 +0100 Subject: [PATCH 24/43] pass slack and correct problem conversion also in reliability branching, correctly fill the ds_obj objective before merging results at the root, correctly clamp the PDLP objective, remove the unnecessary cuopt_assert regarding fixed point error --- cpp/src/branch_and_bound/branch_and_bound.cpp | 4 +- 
cpp/src/branch_and_bound/pseudo_costs.cpp | 55 +++++++------------ cpp/src/branch_and_bound/pseudo_costs.hpp | 4 +- cpp/src/pdlp/pdlp.cu | 3 - 4 files changed, 26 insertions(+), 40 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index 1dac28ae46..de448a18d5 100644 --- a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -829,7 +829,9 @@ branch_variable_t branch_and_bound_t::variable_selection( exploration_stats_, upper_bound_, worker_pool_.num_idle_workers(), - log); + log, + new_slacks_, + original_lp_); } else { branch_var = pc_.variable_selection(fractional, solution, log); } diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index f3f939c9d4..db28888c69 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -102,6 +102,7 @@ void strong_branch_helper(i_t start, if (branch == 0) { pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); + ds_obj_down[k] = std::max(obj - root_obj, 0.0); ds_status_down[k] = status; if (verbose) { settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n", @@ -114,6 +115,7 @@ void strong_branch_helper(i_t start, } } else { pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0); + ds_obj_up[k] = std::max(obj - root_obj, 0.0); ds_status_up[k] = status; if (verbose) { settings.log.printf( @@ -348,36 +350,6 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data return mps_model; } -template -static cuopt::mps_parser::mps_data_model_t lp_problem_to_mps_data_model( - const lp_problem_t& lp_problem) -{ - cuopt::mps_parser::mps_data_model_t mps_model; - int m = lp_problem.num_rows; - int n = lp_problem.num_cols; - - csr_matrix_t csr_A(m, n, 0); - lp_problem.A.to_compressed_row(csr_A); - - int nz = csr_A.row_start[m]; - - mps_model.set_csr_constraint_matrix( - 
csr_A.x.data(), nz, csr_A.j.data(), nz, csr_A.row_start.data(), m + 1); - - mps_model.set_objective_coefficients(lp_problem.objective.data(), n); - mps_model.set_objective_scaling_factor(lp_problem.obj_scale); - mps_model.set_objective_offset(lp_problem.obj_constant); - - mps_model.set_variable_lower_bounds(lp_problem.lower.data(), n); - mps_model.set_variable_upper_bounds(lp_problem.upper.data(), n); - - mps_model.set_constraint_lower_bounds(lp_problem.rhs.data(), m); - mps_model.set_constraint_upper_bounds(lp_problem.rhs.data(), m); - mps_model.set_maximize(lp_problem.obj_scale < 0); - - return mps_model; -} - // Merge a single strong branching result from Dual Simplex and PDLP. // Rules: // 1. If both found optimal -> keep DS (higher quality vertex solution) @@ -536,8 +508,8 @@ void strong_branching(const lp_problem_t& original_lp, ? solutions.get_dual_objective_value(k + fractional.size()) : std::numeric_limits::quiet_NaN(); - pdlp_obj_down[k] = obj_down - root_obj; - pdlp_obj_up[k] = obj_up - root_obj; + pdlp_obj_down[k] = std::max(obj_down - root_obj, f_t(0.0)); + pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); } // Batch PDLP finished – tell Dual Simplex to stop @@ -763,7 +735,9 @@ i_t pseudo_costs_t::reliable_variable_selection( const branch_and_bound_stats_t& bnb_stats, f_t upper_bound, int max_num_tasks, - logger_t& log) + logger_t& log, + const std::vector& new_slacks, + const lp_problem_t& original_lp) { constexpr f_t eps = 1e-6; f_t start_time = bnb_stats.start_time; @@ -873,12 +847,23 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t start_batch = tic(); - const auto mps_model = lp_problem_to_mps_data_model(worker->leaf_problem); + std::vector original_soln_x; + // Convert the original_lp that has cuts to a problem that is better for PDLP + auto mps_model = simplex_problem_to_mps_data_model( + original_lp, new_slacks, solution, original_soln_x); + // Apply the bounds of the current leaf problem + { + const i_t n_orig = 
original_lp.num_cols - new_slacks.size(); + for (i_t j = 0; j < n_orig; j++) { + mps_model.variable_lower_bounds_[j] = worker->leaf_problem.lower[j]; + mps_model.variable_upper_bounds_[j] = worker->leaf_problem.upper[j]; + } + } std::vector fraction_values; fraction_values.reserve(num_pdlp_vars); for (i_t j : pdlp_overflow_list) { - fraction_values.push_back(solution[j]); + fraction_values.push_back(original_soln_x[j]); } const f_t batch_elapsed_time = toc(start_time); diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 8a408c81e3..75cf660621 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -481,7 +481,9 @@ class pseudo_costs_t { const branch_and_bound_stats_t& bnb_stats, f_t upper_bound, int max_num_tasks, - logger_t& log); + logger_t& log, + const std::vector& new_slacks, + const lp_problem_t& original_lp); void update_pseudo_costs_from_strong_branching(const std::vector& fractional, const std::vector& root_soln); diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 7bdff6b4e7..dd1848e53a 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -1506,9 +1506,6 @@ HDI void fixed_error_computation(const f_t norm_squared_delta_primal, norm_squared_delta_primal * primal_weight + norm_squared_delta_dual / primal_weight; const f_t computed_interaction = f_t(2.0) * interaction * step_size; - cuopt_assert(movement + computed_interaction >= f_t(0.0), - "Movement + computed interaction must be >= 0"); - // Clamp to 0 to avoid NaN *fixed_point_error = cuda::std::sqrt(cuda::std::max(f_t(0.0), movement + computed_interaction)); From d513865b34b24dedb21daf6808dd2f9406e7d794 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 18 Mar 2026 11:08:54 +0100 Subject: [PATCH 25/43] add initial pdlp iteartions to the warm start data and on by default --- .../linear_programming/pdlp/solver_settings.hpp | 11 +++++++++++ cpp/src/pdlp/pdlp.cu | 13 +++++++++++++ 
cpp/src/pdlp/solve.cu | 8 ++++++++ cpp/src/pdlp/solver_settings.cu | 12 ++++++++++++ 4 files changed, 44 insertions(+) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index d3f59144cc..72be0943da 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -147,6 +147,12 @@ class pdlp_solver_settings_t { * @param[in] initial_primal_weight Initial primal weight. */ void set_initial_primal_weight(f_t initial_primal_weight); + /** + * @brief Set an initial pdlp iteration. + * + * @param[in] initial_pdlp_iteration Initial pdlp iteration. + */ + void set_initial_pdlp_iteration(i_t initial_pdlp_iteration); /** * @brief Set the pdlp warm start data. This allows to restart PDLP with a @@ -213,6 +219,8 @@ class pdlp_solver_settings_t { std::optional get_initial_step_size() const; // TODO batch mode: tmp std::optional get_initial_primal_weight() const; + // TODO batch mode: tmp + std::optional get_initial_pdlp_iteration() const; const rmm::device_uvector& get_initial_primal_solution() const; const rmm::device_uvector& get_initial_dual_solution() const; @@ -284,6 +292,9 @@ class pdlp_solver_settings_t { /** Initial primal weight */ // TODO batch mode: tmp std::optional initial_primal_weight_; + /** Initial pdlp iteration */ + // TODO batch mode: tmp + std::optional initial_pdlp_iteration_; /** GPU-backed warm start data (device_uvector), used by C++ API and local GPU solves */ pdlp_warm_start_data_t pdlp_warm_start_data_; /** Warm start data as spans over external memory, used by Cython/Python interface */ diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index dd1848e53a..7edbeaff15 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -2195,6 +2195,19 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co pdhg_solver_.total_pdhg_iterations_ = initial_k_.value(); 
pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(initial_k_.value(), stream_view_); } + if (settings_.get_initial_pdlp_iteration().has_value()) { + total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value(); + // This is meaningless in batch mode since pdhg step is never used, set it just to avoid assertions + pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, stream_view_); + pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_; + // Reset the fixed point error since at this pdlp iteration it is expected to already be initialized to some value + std::fill(restart_strategy_.initial_fixed_point_error_.begin(), + restart_strategy_.initial_fixed_point_error_.end(), + f_t(0.0)); + std::fill(restart_strategy_.fixed_point_error_.begin(), + restart_strategy_.fixed_point_error_.end(), + f_t(0.0)); + } // Only the primal_weight_ and step_size_ variables are initialized during the initial phase // The associated primal/dual step_size (computed using the two firstly mentionned) are not diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 90f2a03590..9676ef483f 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -906,6 +906,7 @@ optimization_problem_solution_t run_batch_pdlp( // Hyper parameter than can be changed, I have put what I believe to be the best bool pdlp_primal_dual_init = true; bool primal_weight_init = true; + bool use_initial_pdlp_iterations = true; bool use_optimal_batch_size = false; constexpr int iteration_limit = 100000; @@ -915,6 +916,7 @@ optimization_problem_solution_t run_batch_pdlp( rmm::device_uvector initial_dual(0, stream); f_t initial_step_size = std::numeric_limits::signaling_NaN(); f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); + i_t initial_pdlp_iteration = -1; cuopt_assert(settings.new_bounds.size() > 0, "Batch size should be greater than 0"); const size_t max_batch_size = settings.new_bounds.size(); @@ -993,6 +995,9 @@ 
optimization_problem_solution_t run_batch_pdlp( if (primal_weight_init) { initial_primal_weight = original_solution.get_pdlp_warm_start_data().initial_primal_weight_; } + if (use_initial_pdlp_iterations) { + initial_pdlp_iteration = original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + } } // We don't use the solutions vectors for now @@ -1021,6 +1026,9 @@ optimization_problem_solution_t run_batch_pdlp( if (!std::isnan(initial_step_size)) { batch_settings.set_initial_step_size(initial_step_size); } + if (use_initial_pdlp_iterations) { + batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); + } } if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } diff --git a/cpp/src/pdlp/solver_settings.cu b/cpp/src/pdlp/solver_settings.cu index 7acfc7481c..30d5ccaea5 100644 --- a/cpp/src/pdlp/solver_settings.cu +++ b/cpp/src/pdlp/solver_settings.cu @@ -348,6 +348,18 @@ std::optional pdlp_solver_settings_t::get_initial_primal_weight() return initial_primal_weight_; } +template +void pdlp_solver_settings_t::set_initial_pdlp_iteration(i_t initial_pdlp_iteration) +{ + initial_pdlp_iteration_ = std::make_optional(initial_pdlp_iteration); +} + +template +std::optional pdlp_solver_settings_t::get_initial_pdlp_iteration() const +{ + return initial_pdlp_iteration_; +} + template const pdlp_warm_start_data_t& pdlp_solver_settings_t::get_pdlp_warm_start_data() const noexcept From a3a458d65c834e50098c83d1d67dce174ddba9ae Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Mar 2026 12:39:34 +0000 Subject: [PATCH 26/43] put clique table in lp necessary file, add solver setting flag to generate solution in batch pdlp only for the test that needs it --- .../pdlp/solver_settings.hpp | 3 +++ cpp/src/mip_heuristics/CMakeLists.txt | 2 +- cpp/src/pdlp/solve.cu | 25 ++++++++++++++++--- cpp/tests/linear_programming/pdlp_test.cu | 9 ++++--- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git 
a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 72be0943da..91ca14e96c 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -280,6 +280,9 @@ class pdlp_solver_settings_t { // concurrently i.e. if new_bounds.size() == 2, then 2 versions of the problem with updated bounds // will be solved concurrently std::vector> new_bounds; + // By default to save memory and speed we don't store and copy each climber's primal and dual solutions + // We only retrieve termination statistics and the objective values + bool generate_batch_primal_dual_solution{false}; private: /** Initial primal solution */ diff --git a/cpp/src/mip_heuristics/CMakeLists.txt b/cpp/src/mip_heuristics/CMakeLists.txt index a200d4265b..5e3d19c8b0 100644 --- a/cpp/src/mip_heuristics/CMakeLists.txt +++ b/cpp/src/mip_heuristics/CMakeLists.txt @@ -14,6 +14,7 @@ set(MIP_LP_NECESSARY_FILES ${CMAKE_CURRENT_SOURCE_DIR}/presolve/third_party_presolve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/presolve/gf2_presolve.cpp ${CMAKE_CURRENT_SOURCE_DIR}/solution/solution.cu + ${CMAKE_CURRENT_SOURCE_DIR}/presolve/conflict_graph/clique_table.cu ) # Files that are MIP-specific and not needed for pure LP @@ -38,7 +39,6 @@ set(MIP_NON_LP_FILES ${CMAKE_CURRENT_SOURCE_DIR}/presolve/multi_probe.cu ${CMAKE_CURRENT_SOURCE_DIR}/presolve/probing_cache.cu ${CMAKE_CURRENT_SOURCE_DIR}/presolve/trivial_presolve.cu - ${CMAKE_CURRENT_SOURCE_DIR}/presolve/conflict_graph/clique_table.cu ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump.cu ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/feasibility_jump_kernels.cu ${CMAKE_CURRENT_SOURCE_DIR}/feasibility_jump/fj_cpu.cu) diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 9676ef483f..c59bf1bbdb 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1000,10 +1000,12 @@ optimization_problem_solution_t 
run_batch_pdlp( } } - // We don't use the solutions vectors for now - rmm::device_uvector full_primal_solution(0, stream); - rmm::device_uvector full_dual_solution(0, stream); - rmm::device_uvector full_reduced_cost(0, stream); + + const bool collect_solutions = settings.generate_batch_primal_dual_solution; + + rmm::device_uvector full_primal_solution((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); + rmm::device_uvector full_dual_solution((collect_solutions) ? problem.get_n_constraints() * max_batch_size : 0, stream); + rmm::device_uvector full_reduced_cost((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); std::vector< typename optimization_problem_solution_t::additional_termination_information_t> @@ -1045,6 +1047,21 @@ optimization_problem_solution_t run_batch_pdlp( auto status = sol.get_terminations_status(); full_status.insert(full_status.end(), status.begin(), status.end()); + + if (collect_solutions) { + raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), + sol.get_primal_solution().data(), + sol.get_primal_solution().size(), + stream); + raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), + sol.get_dual_solution().data(), + sol.get_dual_solution().size(), + stream); + raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), + sol.get_reduced_cost().data(), + sol.get_reduced_cost().size(), + stream); + } } return optimization_problem_solution_t(full_primal_solution, diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index d5a8d69008..9cbca2d86e 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -1677,10 +1677,11 @@ TEST(pdlp_class, strong_branching_test) const std::vector fractional = {1, 2, 4}; const std::vector root_soln_x = {0.891, 0.109, 0.636429}; - auto solver_settings = pdlp_solver_settings_t{}; - solver_settings.method = 
cuopt::linear_programming::method_t::PDLP; - solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.generate_batch_primal_dual_solution = true; const int n_fractional = fractional.size(); const int batch_size = n_fractional * 2; From 79d05e770d1c65bf0976b6b9c72d56edeb74c606 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Mar 2026 15:02:55 +0100 Subject: [PATCH 27/43] initial version of work stealing --- .../pdlp/solver_settings.hpp | 6 + cpp/src/branch_and_bound/pseudo_costs.cpp | 71 +++-- .../shared_strong_branching_context.hpp | 50 +++ cpp/src/pdlp/pdlp.cu | 40 ++- cpp/src/pdlp/solve.cu | 8 + .../termination_strategy.cu | 10 +- .../termination_strategy.hpp | 1 + cpp/tests/linear_programming/pdlp_test.cu | 298 ++++++++++++++++++ 8 files changed, 461 insertions(+), 23 deletions(-) create mode 100644 cpp/src/branch_and_bound/shared_strong_branching_context.hpp diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 91ca14e96c..f3521edc54 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -18,6 +18,8 @@ #include +#include + namespace cuopt::linear_programming { // Forward declare solver_settings_t for friend class @@ -272,6 +274,8 @@ class pdlp_solver_settings_t { bool inside_mip{false}; // For concurrent termination std::atomic* concurrent_halt{nullptr}; + // Shared strong branching context view for cooperative DS + PDLP + dual_simplex::shared_strong_branching_context_view_t shared_sb_view; static constexpr f_t 
minimal_absolute_tolerance = 1.0e-12; pdlp_hyper_params::pdlp_hyper_params_t hyper_params; // Holds the information of new variable lower and upper bounds for each climber in the format: @@ -283,6 +287,8 @@ class pdlp_solver_settings_t { // By default to save memory and speed we don't store and copy each climber's primal and dual solutions // We only retrieve termination statistics and the objective values bool generate_batch_primal_dual_solution{false}; + // Used to force batch PDLP to solve a subbatch of the problems at a time + i_t sub_batch_size{0}; private: /** Initial primal solution */ diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index db28888c69..52a7a0ac78 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -6,6 +6,7 @@ /* clang-format on */ #include +#include #include #include @@ -41,7 +42,8 @@ void strong_branch_helper(i_t start, std::vector& ds_obj_up, std::vector& ds_status_down, std::vector& ds_status_up, - std::atomic* concurrent_halt) + std::atomic* concurrent_halt, + shared_strong_branching_context_view_t& sb_view) { raft::common::nvtx::range scope("BB::strong_branch_helper"); lp_problem_t child_problem = original_lp; @@ -56,6 +58,15 @@ void strong_branch_helper(i_t start, for (i_t branch = 0; branch < 2; branch++) { // Do the down branch + const i_t shared_idx = (branch == 0) ? k : k + static_cast(fractional.size()); + // Batch PDLP has already solved this subproblem, skip it + if (sb_view.is_valid() && sb_view.is_solved(shared_idx)) { + settings.log.printf( + "[COOP SB] DS thread %d skipping variable %d branch %s (shared_idx %d): already solved by PDLP\n", + thread_id, j, branch == 0 ? 
"down" : "up", shared_idx); + continue; + } + if (branch == 0) { child_problem.lower[j] = original_lp.lower[j]; child_problem.upper[j] = std::floor(root_soln[j]); @@ -131,6 +142,13 @@ void strong_branch_helper(i_t start, toc(start_time)); } } + // Mark the subproblem as solved so that batch PDLP removes it from the batch + if (sb_view.is_valid()) { + sb_view.mark_solved(shared_idx); + settings.log.printf( + "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in shared context\n", + thread_id, j, branch == 0 ? "down" : "up", shared_idx); + } if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; } @@ -408,7 +426,10 @@ void strong_branching(const lp_problem_t& original_lp, settings.num_threads, fractional.size()); - // Race both batch PDLP and parallel Dual Simplex + // Cooperative DS + PDLP: shared context tracks which subproblems are solved + shared_strong_branching_context_t shared_ctx(2 * fractional.size()); + shared_strong_branching_context_view_t sb_view(std::span(shared_ctx.solved)); + std::atomic concurrent_halt{0}; std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); @@ -446,6 +467,7 @@ void strong_branching(const lp_problem_t& original_lp, pdlp_solver_settings_t pdlp_settings; pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_view = sb_view; pdlp_settings.time_limit = batch_remaining_time; const raft::handle_t batch_pdlp_handle; @@ -512,8 +534,6 @@ void strong_branching(const lp_problem_t& original_lp, pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); } - // Batch PDLP finished – tell Dual Simplex to stop - concurrent_halt.store(1); }); std::vector ds_status_down(fractional.size(), dual::status_t::UNSET); @@ -559,20 +579,12 @@ void strong_branching(const lp_problem_t& original_lp, ds_obj_up, ds_status_down, ds_status_up, - &concurrent_halt); + &concurrent_halt, + sb_view); } } - if (settings.mip_batch_pdlp_strong_branching == 1) { - if 
(concurrent_halt.load() == 1) { - settings.log.printf("Batch PDLP finished before Dual Simplex\n"); - } - else { - settings.log.printf("Dual Simplex finished before Batch PDLP\n"); - } - } - - // Dual Simplex finished all subproblems – tell Batch PDLP to stop + // DS done: signal PDLP to stop (time-limit or all work done) and wait concurrent_halt.store(1); pdlp_thread.join(); @@ -614,25 +626,46 @@ void strong_branching(const lp_problem_t& original_lp, i_t merged_from_ds = 0; i_t merged_from_pdlp = 0; i_t merged_nan = 0; + i_t solved_by_both_down = 0; + i_t solved_by_both_up = 0; for (i_t k = 0; k < fractional.size(); k++) { - const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], pdlp_obj_down[k], !std::isnan(pdlp_obj_down[k])); + bool ds_has_down = ds_status_down[k] != dual::status_t::UNSET; + bool pdlp_has_down = !std::isnan(pdlp_obj_down[k]); + const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], pdlp_obj_down[k], pdlp_has_down); pc.strong_branch_down[k] = value_down; if (source_down == 0) merged_from_ds++; else if (source_down == 1) merged_from_pdlp++; else merged_nan++; - const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], !std::isnan(pdlp_obj_up[k])); + if (ds_has_down && pdlp_has_down) { + solved_by_both_down++; + settings.log.printf( + "[COOP SB] Merge: variable %d DOWN solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", + fractional[k], ds_obj_down[k], pdlp_obj_down[k], source_down == 0 ? 
"DS" : "PDLP"); + } + + bool ds_has_up = ds_status_up[k] != dual::status_t::UNSET; + bool pdlp_has_up = !std::isnan(pdlp_obj_up[k]); + const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], pdlp_has_up); pc.strong_branch_up[k] = value_up; if (source_up == 0) merged_from_ds++; else if (source_up == 1) merged_from_pdlp++; else merged_nan++; + if (ds_has_up && pdlp_has_up) { + solved_by_both_up++; + settings.log.printf( + "[COOP SB] Merge: variable %d UP solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", + fractional[k], ds_obj_up[k], pdlp_obj_up[k], source_up == 0 ? "DS" : "PDLP"); + } } if (settings.mip_batch_pdlp_strong_branching == 1) { settings.log.printf( - "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN)\n", + "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both (down/up)\n", merged_from_ds, merged_from_pdlp, - merged_nan); + merged_nan, + solved_by_both_down, + solved_by_both_up); } pc.update_pseudo_costs_from_strong_branching(fractional, root_soln); diff --git a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp new file mode 100644 index 0000000000..6cbea737f5 --- /dev/null +++ b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp @@ -0,0 +1,50 @@ +/* clang-format off */ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+ * SPDX-License-Identifier: Apache-2.0 + */ +/* clang-format on */ + +#pragma once + +#include +#include +#include + +namespace cuopt::linear_programming::dual_simplex { + +template +struct shared_strong_branching_context_t { + std::vector> solved; + + explicit shared_strong_branching_context_t(size_t num_subproblems) : solved(num_subproblems) + { + for (auto& s : solved) + s.store(0); + } +}; + +template +struct shared_strong_branching_context_view_t { + std::span> solved; + + shared_strong_branching_context_view_t() = default; + + shared_strong_branching_context_view_t(std::span> s) : solved(s) {} + + bool is_valid() const { return !solved.empty(); } + + bool is_solved(i_t local_idx) const + { + return solved[local_idx].load() != 0; + } + + void mark_solved(i_t local_idx) const { solved[local_idx].store(1); } + + shared_strong_branching_context_view_t subview(i_t offset, i_t count) const + { + return {solved.subspan(offset, count)}; + } +}; + +} // namespace cuopt::linear_programming::dual_simplex diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 7edbeaff15..9d5715a936 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -776,7 +776,27 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) } #endif - // All are optimal or infeasible + // Sync external solved status into internal termination strategy before all_done() check + if (settings_.shared_sb_view.is_valid()) { + for (size_t i = 0; i < climber_strategies_.size(); ++i) { + // If PDLP has solved it to optimality we want to keep it and resolved both solvers having solved the problem later + if (current_termination_strategy_.is_done( + current_termination_strategy_.get_termination_status(i))) + continue; + const i_t local_idx = climber_strategies_[i].original_index; + if (settings_.shared_sb_view.is_solved(local_idx)) { + current_termination_strategy_.set_termination_status(i, + pdlp_termination_status_t::ConcurrentLimit); +#ifdef BATCH_VERBOSE_MODE + std::cout << "[COOP SB] DS 
already solved climber " << i << " (original_index " + << local_idx << "), synced to ConcurrentLimit at step " + << total_pdlp_iterations_ << std::endl; +#endif + } + } + } + + // All are optimal, infeasible, or externally solved if (current_termination_strategy_.all_done()) { const auto original_batch_size = settings_.new_bounds.size(); // Some climber got removed from the batch while the optimization was running @@ -823,6 +843,9 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) .get_additional_termination_informations()[climber_strategies_[i].original_index] .solved_by_pdlp = (current_termination_strategy_.get_termination_status(i) != pdlp_termination_status_t::ConcurrentLimit); + if (settings_.shared_sb_view.is_valid()) { + settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); + } } current_termination_strategy_.fill_gpu_terms_stats(total_pdlp_iterations_); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -839,6 +862,11 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) std::move(batch_solution_to_return_.get_additional_termination_informations()), std::move(batch_solution_to_return_.get_terminations_status())}; } + if (settings_.shared_sb_view.is_valid()) { + for (size_t i = 0; i < climber_strategies_.size(); ++i) { + settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); + } + } RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); return current_termination_strategy_.fill_return_problem_solution( internal_solver_iterations_, @@ -857,8 +885,11 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) current_termination_strategy_.get_termination_status(i))) { raft::common::nvtx::range fun_scope("remove_done_climber"); #ifdef BATCH_VERBOSE_MODE - std::cout << "Removing climber " << i << " because it is done. 
Its original index is " - << climber_strategies_[i].original_index << std::endl; + const bool externally_solved = (current_termination_strategy_.get_termination_status(i) == + pdlp_termination_status_t::ConcurrentLimit); + std::cout << "Removing climber " << i << " (original_index " + << climber_strategies_[i].original_index << ") because it is done" + << (externally_solved ? " [solved by DS]" : " [solved by PDLP]") << std::endl; #endif to_remove.emplace(i); // Copy current climber solution information @@ -891,6 +922,9 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) .get_additional_termination_informations()[climber_strategies_[i].original_index] .solved_by_pdlp = (current_termination_strategy_.get_termination_status(i) != pdlp_termination_status_t::ConcurrentLimit); + if (settings_.shared_sb_view.is_valid()) { + settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); + } } } if (to_remove.size() > 0) { diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index c59bf1bbdb..ced3844a9b 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -962,6 +962,9 @@ optimization_problem_solution_t run_batch_pdlp( size_t optimal_batch_size = use_optimal_batch_size ? 
detail::optimal_batch_size_handler(problem, memory_max_batch_size) : max_batch_size; + if (settings.sub_batch_size > 0) { + optimal_batch_size = settings.sub_batch_size; + } cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, "Optimal batch size should be between 1 and max batch size"); using f_t2 = typename type_2::type; @@ -1040,6 +1043,11 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.new_bounds = std::vector>( original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); + if (settings.shared_sb_view.is_valid()) { + batch_settings.shared_sb_view = + settings.shared_sb_view.subview(i, current_batch_size); + } + auto sol = solve_lp(problem, batch_settings); auto info = sol.get_additional_termination_informations(); diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu index 7179df6a49..563850dc0c 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu @@ -124,6 +124,13 @@ pdlp_termination_status_t pdlp_termination_strategy_t::get_termination return (pdlp_termination_status_t)termination_status_[id]; } +template +void pdlp_termination_strategy_t::set_termination_status( + i_t id, pdlp_termination_status_t status) +{ + termination_status_[id] = (i_t)status; +} + template std::vector pdlp_termination_strategy_t::get_terminations_status() @@ -389,7 +396,8 @@ __host__ __device__ bool pdlp_termination_strategy_t::is_done( { return termination_status == pdlp_termination_status_t::Optimal || termination_status == pdlp_termination_status_t::PrimalInfeasible || - termination_status == pdlp_termination_status_t::DualInfeasible; + termination_status == pdlp_termination_status_t::DualInfeasible || + termination_status == pdlp_termination_status_t::ConcurrentLimit; } template diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp 
b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp index 6fe118c488..efb7a41d7b 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.hpp +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.hpp @@ -140,6 +140,7 @@ class pdlp_termination_strategy_t { f_t get_relative_primal_tolerance_factor() const; pdlp_termination_status_t get_termination_status(i_t id) const; + void set_termination_status(i_t id, pdlp_termination_status_t status); std::vector get_terminations_status(); bool all_optimal_status() const; bool all_done() const; diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index 9cbca2d86e..be91e96015 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -43,8 +43,11 @@ #include #include #include +#include #include +#include + namespace cuopt::linear_programming::test { constexpr double afiro_primal_objective = -464.0; @@ -2044,6 +2047,301 @@ TEST(pdlp_class, precision_single_pslp_presolve) afiro_primal_objective, solution.get_additional_termination_information().primal_objective)); } +// --------------------------------------------------------------------------- +// Cooperative strong branching tests +// --------------------------------------------------------------------------- + +TEST(pdlp_class, shared_sb_context_unit) +{ + using namespace cuopt::linear_programming::dual_simplex; + + constexpr int N = 10; + shared_strong_branching_context_t ctx(N); + shared_strong_branching_context_view_t view(std::span(ctx.solved)); + + EXPECT_TRUE(view.is_valid()); + + shared_strong_branching_context_view_t empty_view; + EXPECT_FALSE(empty_view.is_valid()); + + for (int i = 0; i < N; ++i) { + EXPECT_FALSE(view.is_solved(i)); + } + + view.mark_solved(0); + view.mark_solved(3); + view.mark_solved(7); + + EXPECT_TRUE(view.is_solved(0)); + EXPECT_FALSE(view.is_solved(1)); + EXPECT_FALSE(view.is_solved(2)); + EXPECT_TRUE(view.is_solved(3)); + 
EXPECT_FALSE(view.is_solved(4)); + EXPECT_FALSE(view.is_solved(5)); + EXPECT_FALSE(view.is_solved(6)); + EXPECT_TRUE(view.is_solved(7)); + EXPECT_FALSE(view.is_solved(8)); + EXPECT_FALSE(view.is_solved(9)); + + // subview(2, 5) covers global indices [2..6] + auto sv = view.subview(2, 5); + EXPECT_TRUE(sv.is_valid()); + EXPECT_FALSE(sv.is_solved(0)); // global 2 + EXPECT_TRUE(sv.is_solved(1)); // global 3 + EXPECT_FALSE(sv.is_solved(2)); // global 4 + EXPECT_FALSE(sv.is_solved(3)); // global 5 + EXPECT_FALSE(sv.is_solved(4)); // global 6 + + // Mark through subview: local 4 -> global 6 + sv.mark_solved(4); + EXPECT_TRUE(view.is_solved(6)); + EXPECT_TRUE(sv.is_solved(4)); +} + +TEST(pdlp_class, shared_sb_view_batch_pre_solved) +{ + using namespace cuopt::linear_programming::dual_simplex; + + const raft::handle_t handle_{}; + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::mps_parser::mps_data_model_t op_problem = + cuopt::mps_parser::parse_mps(path, true); + + const std::vector fractional = {1, 2, 4}; + const std::vector root_soln_x = {0.891, 0.109, 0.636429}; + const int n_fractional = fractional.size(); + const int batch_size = n_fractional * 2; // 6 + + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + + // Build new_bounds: down branches [0..2], up branches [3..5] + for (int i = 0; i < n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[i], + op_problem.get_variable_lower_bounds()[fractional[i]], + std::floor(root_soln_x[i])}); + for (int i = 0; i < n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[i], + std::ceil(root_soln_x[i]), + op_problem.get_variable_upper_bounds()[fractional[i]]}); + + shared_strong_branching_context_t ctx(batch_size); + + // Pre-mark entries 1 and 4 as solved 
(simulating DS) + ctx.solved[1].store(1); + ctx.solved[4].store(1); + + solver_settings.shared_sb_view = + shared_strong_branching_context_view_t(std::span(ctx.solved)); + + auto solution = solve_lp(&handle_, op_problem, solver_settings); + + ASSERT_EQ(solution.get_terminations_status().size(), batch_size); + + // Pre-solved entries should have ConcurrentLimit + EXPECT_EQ(solution.get_termination_status(1), pdlp_termination_status_t::ConcurrentLimit); + EXPECT_EQ(solution.get_termination_status(4), pdlp_termination_status_t::ConcurrentLimit); + + // Others should be Optimal + EXPECT_EQ(solution.get_termination_status(0), pdlp_termination_status_t::Optimal); + EXPECT_EQ(solution.get_termination_status(2), pdlp_termination_status_t::Optimal); + EXPECT_EQ(solution.get_termination_status(3), pdlp_termination_status_t::Optimal); + EXPECT_EQ(solution.get_termination_status(5), pdlp_termination_status_t::Optimal); + + // All entries should now be marked solved in the shared context + for (int i = 0; i < batch_size; ++i) { + EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + } +} + +TEST(pdlp_class, shared_sb_view_subbatch) +{ + using namespace cuopt::linear_programming::dual_simplex; + + const raft::handle_t handle_{}; + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::mps_parser::mps_data_model_t op_problem = + cuopt::mps_parser::parse_mps(path, true); + + const std::vector fractional = {1, 2, 4}; + const std::vector root_soln_x = {0.891, 0.109, 0.636429}; + const int n_fractional = fractional.size(); + const int batch_size = n_fractional * 2; + + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.sub_batch_size = 2; + + shared_strong_branching_context_t ctx(batch_size); + + // 
Pre-mark one entry in each sub-batch of size 2: indices 1, 4 + ctx.solved[1].store(1); + ctx.solved[4].store(1); + + solver_settings.shared_sb_view = + shared_strong_branching_context_view_t(std::span(ctx.solved)); + + auto solution = batch_pdlp_solve(&handle_, op_problem, fractional, root_soln_x, solver_settings); + + ASSERT_EQ(solution.get_terminations_status().size(), batch_size); + + // Pre-solved entries should have ConcurrentLimit + EXPECT_EQ(solution.get_termination_status(1), pdlp_termination_status_t::ConcurrentLimit); + EXPECT_EQ(solution.get_termination_status(4), pdlp_termination_status_t::ConcurrentLimit); + + // Others should be Optimal + for (int i = 0; i < batch_size; ++i) { + if (i == 1 || i == 4) continue; + EXPECT_EQ(solution.get_termination_status(i), pdlp_termination_status_t::Optimal) + << "Entry " << i << " should be Optimal"; + } + + // All should be marked solved + for (int i = 0; i < batch_size; ++i) { + EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + } +} + +TEST(pdlp_class, shared_sb_view_concurrent_mark) +{ + using namespace cuopt::linear_programming::dual_simplex; + + const raft::handle_t handle_{}; + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::mps_parser::mps_data_model_t op_problem = + cuopt::mps_parser::parse_mps(path, true); + + const std::vector fractional = {1, 2, 4}; + const std::vector root_soln_x = {0.891, 0.109, 0.636429}; + const int n_fractional = fractional.size(); + const int batch_size = n_fractional * 2; + + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.iteration_limit = 1000000; + + for (int i = 0; i < n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[0], + -5, + -5}); + + for (int i = 0; i < 
n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[i], + std::ceil(root_soln_x[i]), + op_problem.get_variable_upper_bounds()[fractional[i]]}); + + shared_strong_branching_context_t ctx(batch_size); + + solver_settings.shared_sb_view = + shared_strong_branching_context_view_t(std::span(ctx.solved)); + + optimization_problem_solution_t* result_ptr = nullptr; + + auto pdlp_thread = std::thread([&]() { + auto sol = new optimization_problem_solution_t( + solve_lp(&handle_, op_problem, solver_settings)); + result_ptr = sol; + }); + + // Wait a bit then mark entries 0, 2, 4 as solved (simulating DS) + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + for (int i = 0; i < n_fractional; ++i) + ctx.solved[i].store(1); + + pdlp_thread.join(); + + ASSERT_NE(result_ptr, nullptr); + auto& solution = *result_ptr; + + ASSERT_EQ(solution.get_terminations_status().size(), batch_size); + + for (int i = 0; i < batch_size; ++i) { + auto status = solution.get_termination_status(i); + // Each entry should be either Optimal (PDLP solved it first) or ConcurrentLimit (DS marked it) + EXPECT_TRUE(status == pdlp_termination_status_t::Optimal || + status == pdlp_termination_status_t::ConcurrentLimit) + << "Entry " << i << " has unexpected status " << cuopt::linear_programming::optimization_problem_solution_t::get_termination_status_string(status); + } + + // All entries should end up marked solved + for (int i = 0; i < batch_size; ++i) { + EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + } + + delete result_ptr; +} + +TEST(pdlp_class, shared_sb_view_all_infeasible) +{ + using namespace cuopt::linear_programming::dual_simplex; + + const raft::handle_t handle_{}; + auto path = make_path_absolute("linear_programming/afiro_original.mps"); + cuopt::mps_parser::mps_data_model_t op_problem = + cuopt::mps_parser::parse_mps(path, true); + + const std::vector fractional = {1, 2, 4}; + const std::vector root_soln_x = {0.891, 0.109, 
0.636429}; + const int n_fractional = fractional.size(); + const int batch_size = n_fractional; + + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.iteration_limit = 1000000; + + for (int i = 0; i < n_fractional; ++i) + solver_settings.new_bounds.push_back({fractional[0], + -5, + -5}); + + shared_strong_branching_context_t ctx(batch_size); + + solver_settings.shared_sb_view = + shared_strong_branching_context_view_t(std::span(ctx.solved)); + + optimization_problem_solution_t* result_ptr = nullptr; + + auto pdlp_thread = std::thread([&]() { + auto sol = new optimization_problem_solution_t( + solve_lp(&handle_, op_problem, solver_settings)); + result_ptr = sol; + }); + + // Wait a bit then mark entries 0, 2, 4 as solved (simulating DS) + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + for (int i = 0; i < n_fractional; ++i) + ctx.solved[i].store(1); + + pdlp_thread.join(); + + ASSERT_NE(result_ptr, nullptr); + auto& solution = *result_ptr; + + ASSERT_EQ(solution.get_terminations_status().size(), batch_size); + + for (int i = 0; i < batch_size; ++i) { + auto status = solution.get_termination_status(i); + // Each entry should be either Optimal (PDLP solved it first) or ConcurrentLimit (DS marked it) + EXPECT_TRUE(status == pdlp_termination_status_t::ConcurrentLimit) + << "Entry " << i << " has unexpected status " << cuopt::linear_programming::optimization_problem_solution_t::get_termination_status_string(status); + } + + // All entries should end up marked solved + for (int i = 0; i < batch_size; ++i) { + EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + } + + delete result_ptr; +} + } // namespace cuopt::linear_programming::test CUOPT_TEST_PROGRAM_MAIN() From 
2c8bbfd56ea23c18a016601cb0719d5122597ae7 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Thu, 19 Mar 2026 17:11:21 +0100 Subject: [PATCH 28/43] add option to use either dual simplex, bpdlp, or both with work stealing --- cpp/src/branch_and_bound/pseudo_costs.cpp | 57 ++++++++++++------- .../dual_simplex/simplex_solver_settings.hpp | 3 +- cpp/src/math_optimization/solver_settings.cu | 2 +- .../linear_programming/data_definition.py | 5 +- 4 files changed, 43 insertions(+), 24 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 52a7a0ac78..503d958a83 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -440,7 +440,9 @@ void strong_branching(const lp_problem_t& original_lp, if (settings.mip_batch_pdlp_strong_branching == 0) return; - settings.log.printf("Racing batch PDLP and Dual Simplex for strong branching\n"); + settings.log.printf(settings.mip_batch_pdlp_strong_branching == 2 + ? 
"Batch PDLP only for strong branching\n" + : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); f_t start_batch = tic(); std::vector original_root_soln_x; @@ -466,8 +468,10 @@ void strong_branching(const lp_problem_t& original_lp, if (batch_remaining_time <= 0.0) { return; } pdlp_solver_settings_t pdlp_settings; - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.shared_sb_view = sb_view; + if (settings.mip_batch_pdlp_strong_branching == 1) { + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_view = sb_view; + } pdlp_settings.time_limit = batch_remaining_time; const raft::handle_t batch_pdlp_handle; @@ -542,6 +546,7 @@ void strong_branching(const lp_problem_t& original_lp, std::vector ds_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); f_t dual_simplex_strong_branching_time = tic(); + if (settings.mip_batch_pdlp_strong_branching != 2) { #pragma omp parallel num_threads(settings.num_threads) { i_t n = std::min(4 * settings.num_threads, fractional.size()); @@ -586,6 +591,7 @@ void strong_branching(const lp_problem_t& original_lp, // DS done: signal PDLP to stop (time-limit or all work done) and wait concurrent_halt.store(1); + } pdlp_thread.join(); @@ -593,24 +599,37 @@ void strong_branching(const lp_problem_t& original_lp, // Collect Dual Simplex statistics - i_t ds_optimal_count = 0; - i_t ds_dual_feasible_only_count = 0; + i_t ds_optimal = 0, ds_infeasible = 0, ds_iter_limit = 0; + i_t ds_numerical = 0, ds_cutoff = 0, ds_time_limit = 0; + i_t ds_concurrent = 0, ds_work_limit = 0, ds_unset = 0; + const i_t total_subproblems = fractional.size() * 2; for (i_t k = 0; k < fractional.size(); k++) { - if (ds_status_down[k] == dual::status_t::OPTIMAL) ds_optimal_count++; - if (ds_status_up[k] == dual::status_t::OPTIMAL) ds_optimal_count++; - if (ds_status_down[k] == dual::status_t::ITERATION_LIMIT) ds_dual_feasible_only_count++; - if (ds_status_up[k] == dual::status_t::ITERATION_LIMIT) 
ds_dual_feasible_only_count++; + for (auto st : {ds_status_down[k], ds_status_up[k]}) { + switch (st) { + case dual::status_t::OPTIMAL: ds_optimal++; break; + case dual::status_t::DUAL_UNBOUNDED: ds_infeasible++; break; + case dual::status_t::ITERATION_LIMIT: ds_iter_limit++; break; + case dual::status_t::NUMERICAL: ds_numerical++; break; + case dual::status_t::CUTOFF: ds_cutoff++; break; + case dual::status_t::TIME_LIMIT: ds_time_limit++; break; + case dual::status_t::CONCURRENT_LIMIT: ds_concurrent++; break; + case dual::status_t::WORK_LIMIT: ds_work_limit++; break; + case dual::status_t::UNSET: ds_unset++; break; + } + } } - settings.log.printf( - "Dual Simplex found %d/%d optimal solutions and %d/%d dual feasible only solutions\n", - ds_optimal_count, - fractional.size() * 2, - ds_dual_feasible_only_count, - fractional.size() * 2); - - if (settings.mip_batch_pdlp_strong_branching == 1) { - // Collect Batch PDLP statistics + settings.log.printf("Dual Simplex: %d/%d optimal, %d infeasible, %d iter-limit", + ds_optimal, total_subproblems, ds_infeasible, ds_iter_limit); + if (ds_cutoff) settings.log.printf(", %d cutoff", ds_cutoff); + if (ds_time_limit) settings.log.printf(", %d time-limit", ds_time_limit); + if (ds_numerical) settings.log.printf(", %d numerical", ds_numerical); + if (ds_concurrent) settings.log.printf(", %d concurrent-halt", ds_concurrent); + if (ds_work_limit) settings.log.printf(", %d work-limit", ds_work_limit); + if (ds_unset) settings.log.printf(", %d unset/skipped", ds_unset); + settings.log.printf("\n"); + + if (settings.mip_batch_pdlp_strong_branching != 0) { i_t pdlp_optimal_count = 0; for (i_t k = 0; k < fractional.size(); k++) { if (!std::isnan(pdlp_obj_down[k])) pdlp_optimal_count++; @@ -658,7 +677,7 @@ void strong_branching(const lp_problem_t& original_lp, } } - if (settings.mip_batch_pdlp_strong_branching == 1) { + if (settings.mip_batch_pdlp_strong_branching != 0) { settings.log.printf( "Merged results: %d from DS, %d from PDLP, %d 
unresolved (NaN), %d/%d solved by both (down/up)\n", merged_from_ds, diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 8de5302978..29c7d7a80f 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -187,8 +187,7 @@ struct simplex_solver_settings_t { // strengthening f_t cut_change_threshold; // threshold for cut change f_t cut_min_orthogonality; // minimum orthogonality for cuts - i_t mip_batch_pdlp_strong_branching{0}; // 0 if not using batch PDLP for strong branching, 1 if - // using batch PDLP for strong branching + i_t mip_batch_pdlp_strong_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only i_t mip_batch_pdlp_reliability_branching{0}; // 0 if not using batch PDLP for reliability branching, 1 if diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index aa3741852f..cc2f09d58d 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -99,7 +99,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, - {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 1, 0}, + {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 1, 0}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), 
CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 59ea62089d..9ea5cf4e1b 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -452,8 +452,9 @@ class SolverConfig(BaseModel): ) mip_batch_pdlp_strong_branching: Optional[int] = Field( default=0, - description="Set 1 to enable batch PDLP strong branching " - "in the MIP solver, 0 to disable.", + description="Strong branching mode: 0 = Dual Simplex only, " + "1 = cooperative work-stealing (DS + batch PDLP), " + "2 = batch PDLP only.", ) num_cpu_threads: Optional[int] = Field( default=None, From 0968167a551448a20ec80e4865a037e4274cb91b Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 23 Mar 2026 13:04:45 +0000 Subject: [PATCH 29/43] fix: resize the buffers to handle the case where we go to a single column which internally makes the spmm switch to spmv which need a new buffer --- cpp/src/pdlp/pdlp.cu | 86 ++++++++++++++++++++++++++++++++++++++++++- cpp/src/pdlp/solve.cu | 8 ++++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 9d5715a936..bd53b1d93b 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -790,7 +790,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) #ifdef BATCH_VERBOSE_MODE std::cout << "[COOP SB] DS already solved climber " << i << " (original_index " << local_idx << "), synced to ConcurrentLimit at step " - << total_pdlp_iterations_ << std::endl; + << internal_solver_iterations_ << std::endl; #endif } } @@ -1798,6 +1798,90 @@ void pdlp_solver_t::resize_and_swap_all_context_loop( pdhg_solver_.get_primal_tmp_resource().data(), CUSPARSE_ORDER_COL); + // Recalculate SpMM buffer sizes for the new batch 
dimensions. + // cuSparse may require different buffer sizes when the number of columns changes + // (e.g. SpMM with 1 column may internally fall back to SpMV with larger buffer needs). + { + size_t new_buf_size = 0; + + // PDHG row-row: A_T * batch_dual_solutions -> batch_current_AtYs + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + pdhg_cusparse_view.A_T, + pdhg_cusparse_view.batch_dual_solutions, + reusable_device_scalar_value_0_.data(), + pdhg_cusparse_view.batch_current_AtYs, + (deterministic_batch_pdlp) ? CUSPARSE_SPMM_CSR_ALG3 : CUSPARSE_SPMM_CSR_ALG2, + &new_buf_size, + stream_view_)); + pdhg_cusparse_view.buffer_transpose_batch_row_row_.resize(new_buf_size, stream_view_); + + // PDHG row-row: A * batch_reflected_primal_solutions -> batch_dual_gradients + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + pdhg_cusparse_view.A, + pdhg_cusparse_view.batch_reflected_primal_solutions, + reusable_device_scalar_value_0_.data(), + pdhg_cusparse_view.batch_dual_gradients, + (deterministic_batch_pdlp) ? 
CUSPARSE_SPMM_CSR_ALG3 : CUSPARSE_SPMM_CSR_ALG2, + &new_buf_size, + stream_view_)); + pdhg_cusparse_view.buffer_non_transpose_batch_row_row_.resize(new_buf_size, stream_view_); + + // Adaptive step size: A_T * batch_potential_next_dual_solution -> batch_next_AtYs + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + pdhg_cusparse_view.A_T, + pdhg_cusparse_view.batch_potential_next_dual_solution, + reusable_device_scalar_value_0_.data(), + pdhg_cusparse_view.batch_next_AtYs, + CUSPARSE_SPMM_CSR_ALG3, + &new_buf_size, + stream_view_)); + pdhg_cusparse_view.buffer_transpose_batch.resize(new_buf_size, stream_view_); + + // Convergence info: A_T * batch_dual_solutions -> batch_tmp_primals + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + current_op_problem_evaluation_cusparse_view_.A_T, + current_op_problem_evaluation_cusparse_view_.batch_dual_solutions, + reusable_device_scalar_value_0_.data(), + current_op_problem_evaluation_cusparse_view_.batch_tmp_primals, + CUSPARSE_SPMM_CSR_ALG3, + &new_buf_size, + stream_view_)); + current_op_problem_evaluation_cusparse_view_.buffer_transpose_batch.resize(new_buf_size, + stream_view_); + + // Convergence info: A * batch_primal_solutions -> batch_tmp_duals + RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( + handle_ptr_->get_cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, + reusable_device_scalar_value_1_.data(), + current_op_problem_evaluation_cusparse_view_.A, + current_op_problem_evaluation_cusparse_view_.batch_primal_solutions, + reusable_device_scalar_value_0_.data(), + current_op_problem_evaluation_cusparse_view_.batch_tmp_duals, + 
CUSPARSE_SPMM_CSR_ALG3, + &new_buf_size, + stream_view_)); + current_op_problem_evaluation_cusparse_view_.buffer_non_transpose_batch.resize(new_buf_size, + stream_view_); + } + // Rerun preprocess // PDHG SpMM preprocess diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index ced3844a9b..6bb2456c31 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -986,8 +986,16 @@ optimization_problem_solution_t run_batch_pdlp( warm_start_settings.detect_infeasibility = false; warm_start_settings.iteration_limit = iteration_limit; warm_start_settings.inside_mip = true; + #ifdef BATCH_VERBOSE_MODE + auto start_time = std::chrono::high_resolution_clock::now(); + #endif optimization_problem_solution_t original_solution = solve_lp(problem, warm_start_settings); + #ifdef BATCH_VERBOSE_MODE + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time).count(); + std::cout << "Original problem solved in " << duration << " milliseconds" << " and iterations: " << original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; + #endif if (pdlp_primal_dual_init) { initial_primal = rmm::device_uvector(original_solution.get_primal_solution(), original_solution.get_primal_solution().stream()); From 7642ded0d45ffade79097a54a77e0f5dec2e3b82 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 23 Mar 2026 18:36:11 +0100 Subject: [PATCH 30/43] general batch pdlp improvements and support work stealing in RB --- .../mip/solver_settings.hpp | 4 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 488 ++++++++++++------ cpp/src/branch_and_bound/pseudo_costs.hpp | 16 + .../dual_simplex/simplex_solver_settings.hpp | 2 +- cpp/src/math_optimization/solver_settings.cu | 2 +- cpp/src/pdlp/pdlp.cu | 2 +- cpp/src/pdlp/solve.cu | 91 ++-- .../linear_programming/data_definition.py | 6 + 8 files changed, 388 insertions(+), 223 deletions(-) diff --git 
a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 07a28a7748..62e88d5eb0 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -98,8 +98,8 @@ class mip_solver_settings_t { i_t reduced_cost_strengthening = -1; f_t cut_change_threshold = -1.0; f_t cut_min_orthogonality = 0.5; - i_t mip_batch_pdlp_strong_branching = 1; - i_t mip_batch_pdlp_reliability_branching = 1; + i_t mip_batch_pdlp_strong_branching{1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t mip_batch_pdlp_reliability_branching{1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only i_t num_gpus = 1; bool log_to_console = true; diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 503d958a83..204d28c386 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -177,7 +177,7 @@ void strong_branch_helper(i_t start, } template -f_t trial_branching(const lp_problem_t& original_lp, +std::pair trial_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, const std::vector& var_types, const std::vector& vstatus, @@ -244,12 +244,12 @@ f_t trial_branching(const lp_problem_t& original_lp, if (status == dual::status_t::DUAL_UNBOUNDED) { // LP was infeasible - return std::numeric_limits::infinity(); + return {std::numeric_limits::infinity(), dual::status_t::DUAL_UNBOUNDED}; } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF) { - return compute_objective(child_problem, solution.x); + return {compute_objective(child_problem, solution.x), status}; } else { - return std::numeric_limits::quiet_NaN(); + return {std::numeric_limits::quiet_NaN(), dual::status_t::NUMERICAL}; } } @@ -394,8 +394,8 @@ static std::pair 
merge_sb_result(f_t ds_val, if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {ds_val, 0}; } if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, 1}; } - // Rule 4: Dual Simplex hit iteration limit -> keep DS - if (ds_status == dual::status_t::ITERATION_LIMIT) { return {ds_val, 0}; } + // Rule 4: Dual Simplex hit iteration limit or work limit or cutoff -> keep DS + if (ds_status == dual::status_t::ITERATION_LIMIT || ds_status == dual::status_t::WORK_LIMIT || ds_status == dual::status_t::CUTOFF) { return {ds_val, 0}; } // Rule 5: None converged -> NaN return {std::numeric_limits::quiet_NaN(), 2}; @@ -447,6 +447,8 @@ void strong_branching(const lp_problem_t& original_lp, f_t start_batch = tic(); std::vector original_root_soln_x; + if (concurrent_halt.load() == 1) { return; } + const auto mps_model = simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); @@ -462,6 +464,8 @@ void strong_branching(const lp_problem_t& original_lp, fraction_values.push_back(original_root_soln_x[j]); } + if (concurrent_halt.load() == 1) { return; } + const f_t batch_elapsed_time = toc(start_time); const f_t batch_remaining_time = std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); @@ -474,16 +478,80 @@ void strong_branching(const lp_problem_t& original_lp, } pdlp_settings.time_limit = batch_remaining_time; - const raft::handle_t batch_pdlp_handle; - constexpr bool dual_simplex_primal_dual = false; - if (dual_simplex_primal_dual) { + + if (!pc.pdlp_warm_cache.populated) { + pdlp_solver_settings_t ws_settings; + ws_settings.method = method_t::PDLP; + ws_settings.presolver = presolver_t::None; + ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + ws_settings.detect_infeasibility = false; + // Since the warm start will be used over and over again we want to maximize the chance of convergeance + // Batch PDLP is very compute intensive so we want to minimize the number of iterations 
+ constexpr int warm_start_iteration_limit = 500000; + ws_settings.iteration_limit = warm_start_iteration_limit; + constexpr f_t pdlp_tolerance = 1e-6; + ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; + ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + ws_settings.inside_mip = true; + if (settings.mip_batch_pdlp_strong_branching == 1) { + ws_settings.concurrent_halt = &concurrent_halt; + } + + #ifdef BATCH_VERBOSE_MODE + auto start_time = std::chrono::high_resolution_clock::now(); + #endif + + auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); + + #ifdef BATCH_VERBOSE_MODE + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = std::chrono::duration_cast(end_time - start_time).count(); + std::cout << "Original problem solved in " << duration << " milliseconds" << " and iterations: " << original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; + #endif + + if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { + auto& cache = pc.pdlp_warm_cache; + const auto& ws_primal = ws_solution.get_primal_solution(); + const auto& ws_dual = ws_solution.get_dual_solution(); + // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm start + cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + 
cache.populated = true; + + settings.log.printf("Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", + cache.initial_primal.size(), cache.initial_dual.size(), + cache.step_size, cache.primal_weight, cache.pdlp_iteration); + } else { + settings.log.printf("PDLP warm start solve did not reach optimality (%s), skipping cache and batch PDLP\n", + ws_solution.get_termination_status_string().c_str()); + return; + } + } + + if (concurrent_halt.load() == 1) { return; } + + if (pc.pdlp_warm_cache.populated) { + auto& cache = pc.pdlp_warm_cache; pdlp_settings.set_initial_primal_solution( - original_root_soln_x.data(), original_root_soln_x.size(), batch_pdlp_handle.get_stream()); + cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_dual_solution( - original_root_soln_y.data(), original_root_soln_y.size(), batch_pdlp_handle.get_stream()); + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); } + + if (concurrent_halt.load() == 1) { return; } + const auto solutions = - batch_pdlp_solve(&batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); + batch_pdlp_solve(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Fail safe in case the batch PDLP failed and produced no solutions @@ -856,9 +924,25 @@ i_t pseudo_costs_t::reliable_variable_selection( return branch_var; } + const i_t rb_mode = settings.mip_batch_pdlp_reliability_branching; + // We don't use batch PDLP in reliability branching if the PDLP warm start data was not filled + // This indicates that PDLP alone (not batched) couldn't even run at the root node + // So it will 
most likely perform poorly compared to DS + // Also, if the number of candidate is very small we don't use batch PDLP + constexpr i_t min_num_candidates_for_pdlp = 5; + const bool use_pdlp = (rb_mode != 0) && (pdlp_warm_cache.populated) && unreliable_list.size() > min_num_candidates_for_pdlp; + + if (rb_mode != 0 && !pdlp_warm_cache.populated) { + log.printf("PDLP warm start data not populated, using DS only\n"); + } + if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { + log.printf("Not enough candidates to use batch PDLP, using DS only\n"); + } + const int num_tasks = std::max(max_num_tasks, 1); const int task_priority = reliability_branching_settings.task_priority; - const i_t max_num_candidates = reliability_branching_settings.max_num_candidates; + // If both batch PDLP and DS are used we double the max number of candidates + const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates : reliability_branching_settings.max_num_candidates; const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); assert(task_priority > 0); @@ -877,33 +961,36 @@ i_t pseudo_costs_t::reliable_variable_selection( // Shuffle the unreliable list so every variable has the same chance to be selected. 
if (unreliable_list.size() > max_num_candidates) { worker->rng.shuffle(unreliable_list); } - // Variables beyond num_candidates are solved by batch PDLP instead of Dual Simplex - std::vector pdlp_overflow_list; - bool use_pdlp = settings.mip_batch_pdlp_reliability_branching == 1 && - static_cast(unreliable_list.size()) > num_candidates; - if (use_pdlp) { - pdlp_overflow_list.assign(unreliable_list.begin() + num_candidates, unreliable_list.end()); - } + // Both DS and PDLP work on the same candidate set + std::vector candidate_vars(unreliable_list.begin(), + unreliable_list.begin() + num_candidates); + + // Shared context for cooperative work-stealing (mode 1) + // [0..num_candidates) = down, [num_candidates..2*num_candidates) = up + shared_strong_branching_context_t shared_ctx(2 * num_candidates); + shared_strong_branching_context_view_t sb_view(std::span(shared_ctx.solved)); - const i_t num_pdlp_vars = pdlp_overflow_list.size(); - std::vector pdlp_obj_down(num_pdlp_vars, std::numeric_limits::quiet_NaN()); - std::vector pdlp_obj_up(num_pdlp_vars, std::numeric_limits::quiet_NaN()); + std::vector pdlp_obj_down(num_candidates, std::numeric_limits::quiet_NaN()); + std::vector pdlp_obj_up(num_candidates, std::numeric_limits::quiet_NaN()); - // DS can halt PDLP via concurrent_halt, but not the other way around std::atomic concurrent_halt{0}; std::thread pdlp_thread; if (use_pdlp) { pdlp_thread = std::thread([&]() { - log.printf("RB batch PDLP: solving %d overflow unreliable variables\n", num_pdlp_vars); + log.printf(rb_mode == 2 + ? 
"RB batch PDLP only for %d candidates\n" + : "RB cooperative batch PDLP and DS for %d candidates\n", + num_candidates); f_t start_batch = tic(); std::vector original_soln_x; - // Convert the original_lp that has cuts to a problem that is better for PDLP + + if (concurrent_halt.load() == 1) { return; } + auto mps_model = simplex_problem_to_mps_data_model( original_lp, new_slacks, solution, original_soln_x); - // Apply the bounds of the current leaf problem { const i_t n_orig = original_lp.num_cols - new_slacks.size(); for (i_t j = 0; j < n_orig; j++) { @@ -913,59 +1000,74 @@ i_t pseudo_costs_t::reliable_variable_selection( } std::vector fraction_values; - fraction_values.reserve(num_pdlp_vars); - for (i_t j : pdlp_overflow_list) { + fraction_values.reserve(num_candidates); + for (i_t j : candidate_vars) { fraction_values.push_back(original_soln_x[j]); } - const f_t batch_elapsed_time = toc(start_time); + if (concurrent_halt.load() == 1) { return; } + + const f_t batch_elapsed_time = toc(start_time); const f_t batch_remaining_time = std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (batch_remaining_time <= 0.0) { return; } pdlp_solver_settings_t pdlp_settings; - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.time_limit = batch_remaining_time; + if (rb_mode == 1) { + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_view = sb_view; + } + pdlp_settings.time_limit = batch_remaining_time; + + + if (pdlp_warm_cache.populated) { + auto& cache = pdlp_warm_cache; + pdlp_settings.set_initial_primal_solution( + cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + 
pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + } + + if (concurrent_halt.load() == 1) { return; } - const raft::handle_t batch_pdlp_handle; const auto solutions = batch_pdlp_solve( - &batch_pdlp_handle, mps_model, pdlp_overflow_list, fraction_values, pdlp_settings); + &pdlp_warm_cache.batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); f_t batch_pdlp_time = toc(start_batch); if (solutions.get_additional_termination_informations().size() != - static_cast(num_pdlp_vars) * 2) { + static_cast(num_candidates) * 2) { log.printf("RB batch PDLP failed and produced no solutions\n"); return; } i_t amount_done = 0; - for (i_t k = 0; k < num_pdlp_vars * 2; k++) { + for (i_t k = 0; k < num_candidates * 2; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { amount_done++; } } - log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d in %.2fs\n", - batch_pdlp_time, - amount_done, - num_pdlp_vars * 2, - toc(start_batch)); + log.printf("RB batch PDLP completed in %.2fs. 
Solved %d/%d\n", + batch_pdlp_time, amount_done, num_candidates * 2); - for (i_t k = 0; k < num_pdlp_vars; k++) { + for (i_t k = 0; k < num_candidates; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { pdlp_obj_down[k] = solutions.get_dual_objective_value(k); } - if (solutions.get_termination_status(k + num_pdlp_vars) == + if (solutions.get_termination_status(k + num_candidates) == pdlp_termination_status_t::Optimal) { - pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_pdlp_vars); + pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_candidates); } } }); } if (toc(start_time) > settings.time_limit) { - log.printf("Time limit reached"); + log.printf("Time limit reached\n"); if (use_pdlp) { concurrent_halt.store(1); pdlp_thread.join(); @@ -973,165 +1075,211 @@ i_t pseudo_costs_t::reliable_variable_selection( return branch_var; } + std::vector ds_obj_down(num_candidates, std::numeric_limits::quiet_NaN()); + std::vector ds_obj_up(num_candidates, std::numeric_limits::quiet_NaN()); + std::vector ds_status_down(num_candidates, dual::status_t::UNSET); + std::vector ds_status_up(num_candidates, dual::status_t::UNSET); + omp_atomic_t ds_optimal{0}; omp_atomic_t ds_infeasible{0}; omp_atomic_t ds_failed{0}; + omp_atomic_t ds_skipped{0}; f_t ds_start_time = tic(); + if (rb_mode != 2) { #pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ - shared(score_mutex, ds_optimal, ds_infeasible, ds_failed) - for (i_t i = 0; i < num_candidates; ++i) { - const i_t j = unreliable_list[i]; - - if (toc(start_time) > settings.time_limit) { continue; } - - pseudo_cost_mutex_down[j].lock(); - if (pseudo_cost_num_down[j] < reliable_threshold) { - // Do trial branching on the down branch - f_t obj = trial_branching(worker->leaf_problem, - settings, - var_types, - node_ptr->vstatus, - worker->leaf_edge_norms, - worker->basis_factors, - worker->basic_list, - worker->nonbasic_list, - j, - 
worker->leaf_problem.lower[j], - std::floor(solution[j]), - upper_bound, - branch_and_bound_lp_iter_per_node, - start_time, - reliability_branching_settings.upper_max_lp_iter, - reliability_branching_settings.lower_max_lp_iter, - strong_branching_lp_iter); - - if (std::isnan(obj)) { - ds_failed++; - } else if (std::isinf(obj)) { - ds_infeasible++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = solution[j] - std::floor(solution[j]); - pseudo_cost_sum_down[j] += change_in_obj / change_in_x; - pseudo_cost_num_down[j]++; + shared(score_mutex, ds_optimal, ds_infeasible, ds_failed, ds_skipped, ds_obj_down, ds_obj_up, ds_status_down, ds_status_up, sb_view) + for (i_t i = 0; i < num_candidates; ++i) { + const i_t j = unreliable_list[i]; + + if (toc(start_time) > settings.time_limit) { continue; } + + if (rb_mode == 1 && sb_view.is_solved(i)) { + ds_skipped++; } else { - ds_optimal++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = solution[j] - std::floor(solution[j]); - pseudo_cost_sum_down[j] += change_in_obj / change_in_x; - pseudo_cost_num_down[j]++; + pseudo_cost_mutex_down[j].lock(); + if (pseudo_cost_num_down[j] < reliable_threshold) { + // Do trial branching on the down branch + const auto [obj, status] = trial_branching(worker->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker->leaf_edge_norms, + worker->basis_factors, + worker->basic_list, + worker->nonbasic_list, + j, + worker->leaf_problem.lower[j], + std::floor(solution[j]), + upper_bound, + branch_and_bound_lp_iter_per_node, + start_time, + reliability_branching_settings.upper_max_lp_iter, + reliability_branching_settings.lower_max_lp_iter, + strong_branching_lp_iter); + + ds_obj_down[i] = obj; + ds_status_down[i] = status; + if (std::isnan(obj)) { + ds_failed++; + } else if (std::isinf(obj)) { + ds_infeasible++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - 
std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + } else { + ds_optimal++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + } + if (rb_mode == 1) { sb_view.mark_solved(i); } + } + pseudo_cost_mutex_down[j].unlock(); } - } - pseudo_cost_mutex_down[j].unlock(); - - if (toc(start_time) > settings.time_limit) { continue; } - - pseudo_cost_mutex_up[j].lock(); - if (pseudo_cost_num_up[j] < reliable_threshold) { - f_t obj = trial_branching(worker->leaf_problem, - settings, - var_types, - node_ptr->vstatus, - worker->leaf_edge_norms, - worker->basis_factors, - worker->basic_list, - worker->nonbasic_list, - j, - std::ceil(solution[j]), - worker->leaf_problem.upper[j], - upper_bound, - branch_and_bound_lp_iter_per_node, - start_time, - reliability_branching_settings.upper_max_lp_iter, - reliability_branching_settings.lower_max_lp_iter, - strong_branching_lp_iter); - - if (std::isnan(obj)) { - ds_failed++; - } else if (std::isinf(obj)) { - // Is it ok to process infinity obj like this? 
- ds_infeasible++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(solution[j]) - solution[j]; - pseudo_cost_sum_up[j] += change_in_obj / change_in_x; - pseudo_cost_num_up[j]++; + + if (toc(start_time) > settings.time_limit) { continue; } + + const i_t shared_idx = i + num_candidates; + if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { + ds_skipped++; } else { - ds_optimal++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(solution[j]) - solution[j]; - pseudo_cost_sum_up[j] += change_in_obj / change_in_x; - pseudo_cost_num_up[j]++; + pseudo_cost_mutex_up[j].lock(); + if (pseudo_cost_num_up[j] < reliable_threshold) { + const auto [obj, status] = trial_branching(worker->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker->leaf_edge_norms, + worker->basis_factors, + worker->basic_list, + worker->nonbasic_list, + j, + std::ceil(solution[j]), + worker->leaf_problem.upper[j], + upper_bound, + branch_and_bound_lp_iter_per_node, + start_time, + reliability_branching_settings.upper_max_lp_iter, + reliability_branching_settings.lower_max_lp_iter, + strong_branching_lp_iter); + + ds_obj_up[i] = obj; + ds_status_up[i] = status; + if (std::isnan(obj)) { + ds_failed++; + } else if (std::isinf(obj)) { + ds_infeasible++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + } else { + ds_optimal++; + f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + } + if (rb_mode == 1) { sb_view.mark_solved(shared_idx); } + } + pseudo_cost_mutex_up[j].unlock(); } - } - pseudo_cost_mutex_up[j].unlock(); - if (toc(start_time) > settings.time_limit) { continue; } + if 
(toc(start_time) > settings.time_limit) { continue; } - f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); - score_mutex.lock(); - if (score > max_score) { - max_score = score; - branch_var = j; + score_mutex.lock(); + if (score > max_score) { + max_score = score; + branch_var = j; + } + score_mutex.unlock(); } - score_mutex.unlock(); + + concurrent_halt.store(1); } f_t ds_elapsed = toc(ds_start_time); - log.printf( - "RB Dual Simplex: %d candidates, %d/%d optimal/dual-feasible, %d/%d infeasible, " - "%d/%d failed in %.2fs\n", - num_candidates, - ds_optimal.load(), - num_candidates * 2, - ds_infeasible.load(), - num_candidates * 2, - ds_failed.load(), - num_candidates * 2, - ds_elapsed); + + if (rb_mode != 2) { + if (rb_mode == 1) { + log.printf( + "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped (PDLP) in %.2fs\n", + num_candidates, + ds_optimal.load(), num_candidates * 2, + ds_infeasible.load(), num_candidates * 2, + ds_failed.load(), num_candidates * 2, + ds_skipped.load(), ds_elapsed); + } else { + log.printf( + "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in %.2fs\n", + num_candidates, + ds_optimal.load(), num_candidates * 2, + ds_infeasible.load(), num_candidates * 2, + ds_failed.load(), num_candidates * 2, + ds_elapsed); + } + } if (use_pdlp) { - // Dual Simplex is done on the main thread, telling Batch PDLP to stop - concurrent_halt.store(1); pdlp_thread.join(); - i_t pdlp_optimal = 0; - for (i_t k = 0; k < num_pdlp_vars; k++) { - const i_t j = pdlp_overflow_list[k]; + i_t pdlp_applied = 0; + i_t pdlp_optimal = 0; + for (i_t i = 0; i < num_candidates; i++) { + const i_t j = candidate_vars[i]; - pseudo_cost_mutex_down[j].lock(); - if (!std::isnan(pdlp_obj_down[k])) { - f_t change_in_obj = std::max(pdlp_obj_down[k] - node_ptr->lower_bound, eps); 
- f_t change_in_x = solution[j] - std::floor(solution[j]); - pseudo_cost_sum_down[j] += change_in_obj / change_in_x; - pseudo_cost_num_down[j]++; + // Down: check if PDLP should override DS + if (!std::isnan(pdlp_obj_down[i])) { pdlp_optimal++; + const auto [merged_obj, source] = + merge_sb_result(ds_obj_down[i], ds_status_down[i], pdlp_obj_down[i], true); + // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent calls may have made it reliable) + if (source == 1) { + pseudo_cost_mutex_down[j].lock(); + if (pseudo_cost_num_down[j] < reliable_threshold) { + f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); + f_t change_in_x = solution[j] - std::floor(solution[j]); + pseudo_cost_sum_down[j] += change_in_obj / change_in_x; + pseudo_cost_num_down[j]++; + pdlp_applied++; + } + pseudo_cost_mutex_down[j].unlock(); + } } - pseudo_cost_mutex_down[j].unlock(); - pseudo_cost_mutex_up[j].lock(); - if (!std::isnan(pdlp_obj_up[k])) { - f_t change_in_obj = std::max(pdlp_obj_up[k] - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(solution[j]) - solution[j]; - pseudo_cost_sum_up[j] += change_in_obj / change_in_x; - pseudo_cost_num_up[j]++; + // Up: check if PDLP should override DS + if (!std::isnan(pdlp_obj_up[i])) { pdlp_optimal++; + const auto [merged_obj, source] = + merge_sb_result(ds_obj_up[i], ds_status_up[i], pdlp_obj_up[i], true); + // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent calls may have made it reliable) + if (source == 1) { + pseudo_cost_mutex_up[j].lock(); + if (pseudo_cost_num_up[j] < reliable_threshold) { + f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); + f_t change_in_x = std::ceil(solution[j]) - solution[j]; + pseudo_cost_sum_up[j] += change_in_obj / change_in_x; + pseudo_cost_num_up[j]++; + pdlp_applied++; + } + pseudo_cost_mutex_up[j].unlock(); + } } - pseudo_cost_mutex_up[j].unlock(); - f_t score = - 
calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); + f_t score = calculate_pseudocost_score(j, solution, pseudo_cost_up_avg, pseudo_cost_down_avg); if (score > max_score) { max_score = score; branch_var = j; } } - log.printf( - "RB batch PDLP: %d candidates, %d/%d optimal\n", - num_pdlp_vars, - pdlp_optimal, - num_pdlp_vars * 2); + log.printf("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", + num_candidates, pdlp_optimal, num_candidates * 2, pdlp_applied); } log.printf( diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index 75cf660621..c48ed908d7 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -20,7 +20,10 @@ #include #include +#include + #include +#include namespace cuopt::linear_programming::dual_simplex { @@ -405,6 +408,17 @@ struct reliability_branching_settings_t { i_t min_reliable_threshold = 1; }; +template +struct batch_pdlp_warm_cache_t { + const raft::handle_t batch_pdlp_handle{}; + rmm::device_uvector initial_primal{0, batch_pdlp_handle.get_stream()}; + rmm::device_uvector initial_dual{0, batch_pdlp_handle.get_stream()}; + f_t step_size{std::numeric_limits::signaling_NaN()}; + f_t primal_weight{std::numeric_limits::signaling_NaN()}; + i_t pdlp_iteration{-1}; + bool populated{false}; +}; + template class pseudo_costs_t { public: @@ -516,6 +530,8 @@ class pseudo_costs_t { std::vector pseudo_cost_mutex_down; omp_atomic_t num_strong_branches_completed = 0; omp_atomic_t strong_branching_lp_iter = 0; + + batch_pdlp_warm_cache_t pdlp_warm_cache; }; template diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index 29c7d7a80f..c097baf561 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -188,7 +188,7 @@ struct simplex_solver_settings_t { f_t cut_change_threshold; // 
threshold for cut change f_t cut_min_orthogonality; // minimum orthogonality for cuts i_t mip_batch_pdlp_strong_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only - i_t mip_batch_pdlp_reliability_branching{0}; // 0 if not using batch PDLP for reliability branching, 1 if + i_t mip_batch_pdlp_reliability_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index cc2f09d58d..749d89a35c 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -100,7 +100,7 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, - {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 1, 0}, + {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index bd53b1d93b..37e9e1a31f 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -770,7 +770,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) if (current_termination_strategy_.is_done(term)) { std::cout << "[BATCH MODE]: Climber " << i << " is done with " << 
optimization_problem_solution_t::get_termination_status_string(term) - << " at step " << total_pdlp_iterations_ << ". It's original index is " + << " at step " << internal_solver_iterations_ << ". It's original index is " << climber_strategies_[i].original_index << std::endl; } } diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 6bb2456c31..b9cdb8c9c6 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -904,11 +904,12 @@ optimization_problem_solution_t run_batch_pdlp( optimization_problem_t& problem, pdlp_solver_settings_t const& settings) { // Hyper parameter than can be changed, I have put what I believe to be the best - bool pdlp_primal_dual_init = true; - bool primal_weight_init = true; - bool use_initial_pdlp_iterations = true; + constexpr bool pdlp_primal_dual_init = true; + constexpr bool primal_weight_init = true; + constexpr bool use_initial_pdlp_iterations = true; bool use_optimal_batch_size = false; - constexpr int iteration_limit = 100000; + constexpr int batch_iteration_limit = 100000; + constexpr f_t pdlp_tolerance = 1e-6; rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); @@ -967,47 +968,31 @@ optimization_problem_solution_t run_batch_pdlp( } cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, "Optimal batch size should be between 1 and max batch size"); - using f_t2 = typename type_2::type; - - // In case Dual Simplex already provided the initial primal and dual solution - if (settings.has_initial_primal_solution() && settings.has_initial_dual_solution()) { - initial_primal = rmm::device_uvector( - settings.get_initial_primal_solution(), settings.get_initial_primal_solution().stream()); - initial_dual = rmm::device_uvector( - settings.get_initial_dual_solution(), settings.get_initial_dual_solution().stream()); - } - if (pdlp_primal_dual_init || primal_weight_init) { - pdlp_solver_settings_t warm_start_settings = settings; - warm_start_settings.new_bounds.clear(); - 
warm_start_settings.method = cuopt::linear_programming::method_t::PDLP; - warm_start_settings.presolver = cuopt::linear_programming::presolver_t::None; - warm_start_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - warm_start_settings.detect_infeasibility = false; - warm_start_settings.iteration_limit = iteration_limit; - warm_start_settings.inside_mip = true; - #ifdef BATCH_VERBOSE_MODE - auto start_time = std::chrono::high_resolution_clock::now(); - #endif - optimization_problem_solution_t original_solution = - solve_lp(problem, warm_start_settings); + const bool warm_start_from_settings = + settings.has_initial_primal_solution() || settings.has_initial_dual_solution() || + settings.get_initial_step_size().has_value() || + settings.get_initial_primal_weight().has_value() || + settings.get_initial_pdlp_iteration().has_value(); + + if (warm_start_from_settings) { #ifdef BATCH_VERBOSE_MODE - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time).count(); - std::cout << "Original problem solved in " << duration << " milliseconds" << " and iterations: " << original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; + std::cout << "Using warm start from settings" << std::endl; #endif - if (pdlp_primal_dual_init) { - initial_primal = rmm::device_uvector(original_solution.get_primal_solution(), - original_solution.get_primal_solution().stream()); - initial_dual = rmm::device_uvector(original_solution.get_dual_solution(), - original_solution.get_dual_solution().stream()); - initial_step_size = original_solution.get_pdlp_warm_start_data().initial_step_size_; + if (settings.has_initial_primal_solution() && pdlp_primal_dual_init) { + initial_primal = rmm::device_uvector(settings.get_initial_primal_solution(), settings.get_initial_primal_solution().stream()); + } + if (settings.has_initial_dual_solution() && pdlp_primal_dual_init) { + initial_dual = 
rmm::device_uvector(settings.get_initial_dual_solution(), settings.get_initial_dual_solution().stream()); + } + if (settings.get_initial_step_size().has_value() && pdlp_primal_dual_init) { + initial_step_size = *settings.get_initial_step_size(); } - if (primal_weight_init) { - initial_primal_weight = original_solution.get_pdlp_warm_start_data().initial_primal_weight_; + if (settings.get_initial_primal_weight().has_value() && primal_weight_init) { + initial_primal_weight = *settings.get_initial_primal_weight(); } - if (use_initial_pdlp_iterations) { - initial_pdlp_iteration = original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + if (settings.get_initial_pdlp_iteration().has_value() && use_initial_pdlp_iterations) { + initial_pdlp_iteration = *settings.get_initial_pdlp_iteration(); } } @@ -1029,21 +1014,31 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.presolver = presolver_t::None; batch_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; batch_settings.detect_infeasibility = false; - batch_settings.iteration_limit = iteration_limit; + batch_settings.iteration_limit = batch_iteration_limit; batch_settings.inside_mip = true; + batch_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + batch_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; + batch_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; if (initial_primal.size() > 0) { batch_settings.set_initial_primal_solution( initial_primal.data(), initial_primal.size(), initial_primal.stream()); + } + if (initial_dual.size() > 0) { batch_settings.set_initial_dual_solution( initial_dual.data(), initial_dual.size(), initial_dual.stream()); - if (!std::isnan(initial_step_size)) { - batch_settings.set_initial_step_size(initial_step_size); - 
} - if (use_initial_pdlp_iterations) { - batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); - } } - if (primal_weight_init) { batch_settings.set_initial_primal_weight(initial_primal_weight); } + if (!std::isnan(initial_step_size)) { + batch_settings.set_initial_step_size(initial_step_size); + } + if (initial_pdlp_iteration != -1) { + batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); + } + if (!std::isnan(initial_primal_weight)) { + batch_settings.set_initial_primal_weight(initial_primal_weight); + } for (size_t i = 0; i < max_batch_size; i += optimal_batch_size) { const size_t current_batch_size = std::min(optimal_batch_size, max_batch_size - i); diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 9ea5cf4e1b..32cf860f28 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -456,6 +456,12 @@ class SolverConfig(BaseModel): "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", ) + mip_batch_pdlp_reliability_branching: Optional[int] = Field( + default=0, + description="Reliability branching mode: 0 = Dual Simplex only, " + "1 = cooperative work-stealing (DS + batch PDLP), " + "2 = batch PDLP only.", + ) num_cpu_threads: Optional[int] = Field( default=None, description="Set the number of CPU threads to use for branch and bound.", # noqa From 84dab81fb8c98631b8e5cb8efabefd2f73d3b3e5 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 24 Mar 2026 14:09:56 +0100 Subject: [PATCH 31/43] turn off logs --- cpp/src/branch_and_bound/branch_and_bound.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/branch_and_bound/branch_and_bound.cpp b/cpp/src/branch_and_bound/branch_and_bound.cpp index eccadfbb74..b940134fbb 100644 --- 
a/cpp/src/branch_and_bound/branch_and_bound.cpp +++ b/cpp/src/branch_and_bound/branch_and_bound.cpp @@ -827,8 +827,7 @@ branch_variable_t branch_and_bound_t::variable_selection( branch_and_bound_worker_t* worker) { logger_t log; - // TODO put back false - log.log = true; + log.log = false; i_t branch_var = -1; rounding_direction_t round_dir = rounding_direction_t::NONE; std::vector current_incumbent; From a97cca7ad29b0df0c7603bac55fe44f58f3428f4 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Tue, 24 Mar 2026 18:07:15 +0100 Subject: [PATCH 32/43] few improvements to BPDLP --- .../pdlp/solver_settings.hpp | 1 + cpp/src/branch_and_bound/pseudo_costs.cpp | 128 +++++++++--------- cpp/src/pdlp/solve.cu | 32 ++--- 3 files changed, 80 insertions(+), 81 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index f3521edc54..17fa7c548f 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -288,6 +288,7 @@ class pdlp_solver_settings_t { // We only retrieve termination statistics and the objective values bool generate_batch_primal_dual_solution{false}; // Used to force batch PDLP to solve a subbatch of the problems at a time + // The 0 default value will make the solver use its heuristic to determine the subbatch size i_t sub_batch_size{0}; private: diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 204d28c386..c9f96b3666 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -25,6 +25,11 @@ namespace cuopt::linear_programming::dual_simplex { namespace { +static bool ds_is_valid_done(dual::status_t status) +{ + return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; +} + 
template void strong_branch_helper(i_t start, i_t end, @@ -100,7 +105,7 @@ void strong_branch_helper(i_t start, if (status == dual::status_t::DUAL_UNBOUNDED) { // LP was infeasible obj = std::numeric_limits::infinity(); - } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT) { + } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF) { obj = compute_objective(child_problem, solution.x); } else { settings.log.debug("Thread id %2d remaining %d variable %d branch %d status %d\n", @@ -144,10 +149,14 @@ void strong_branch_helper(i_t start, } // Mark the subproblem as solved so that batch PDLP removes it from the batch if (sb_view.is_valid()) { - sb_view.mark_solved(shared_idx); - settings.log.printf( - "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in shared context\n", - thread_id, j, branch == 0 ? "down" : "up", shared_idx); + // We could not mark as solved nodes hitting iteartion limit in DS + if ((branch == 0 && ds_is_valid_done(ds_status_down[k])) || (branch == 1 && ds_is_valid_done(ds_status_up[k]))) + { + sb_view.mark_solved(shared_idx); + settings.log.printf( + "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in shared context\n", + thread_id, j, branch == 0 ? 
"down" : "up", shared_idx); + } } if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; @@ -422,6 +431,9 @@ void strong_branching(const lp_problem_t& original_lp, pc.strong_branch_up.assign(fractional.size(), 0); pc.num_strong_branches_completed = 0; + const f_t elapsed_time = toc(start_time); + if (elapsed_time > settings.time_limit) { return; } + settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", settings.num_threads, fractional.size()); @@ -466,18 +478,10 @@ void strong_branching(const lp_problem_t& original_lp, if (concurrent_halt.load() == 1) { return; } - const f_t batch_elapsed_time = toc(start_time); - const f_t batch_remaining_time = + f_t batch_elapsed_time = toc(start_time); + const f_t warm_start_remaining_time = std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); - if (batch_remaining_time <= 0.0) { return; } - - pdlp_solver_settings_t pdlp_settings; - if (settings.mip_batch_pdlp_strong_branching == 1) { - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.shared_sb_view = sb_view; - } - - pdlp_settings.time_limit = batch_remaining_time; + if (warm_start_remaining_time <= 0.0) { return; } if (!pc.pdlp_warm_cache.populated) { pdlp_solver_settings_t ws_settings; @@ -489,6 +493,7 @@ void strong_branching(const lp_problem_t& original_lp, // Batch PDLP is very compute intensive so we want to minimize the number of iterations constexpr int warm_start_iteration_limit = 500000; ws_settings.iteration_limit = warm_start_iteration_limit; + ws_settings.time_limit = warm_start_remaining_time; constexpr f_t pdlp_tolerance = 1e-6; ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; @@ -537,6 +542,18 @@ void strong_branching(const lp_problem_t& original_lp, if (concurrent_halt.load() == 1) { return; } + pdlp_solver_settings_t pdlp_settings; + if (settings.mip_batch_pdlp_strong_branching == 1) { + 
pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_view = sb_view; + } + + batch_elapsed_time = toc(start_time); + const f_t batch_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (batch_remaining_time <= 0.0) { return; } + pdlp_settings.time_limit = batch_remaining_time; + if (pc.pdlp_warm_cache.populated) { auto& cache = pc.pdlp_warm_cache; pdlp_settings.set_initial_primal_solution( @@ -928,7 +945,7 @@ i_t pseudo_costs_t::reliable_variable_selection( // We don't use batch PDLP in reliability branching if the PDLP warm start data was not filled // This indicates that PDLP alone (not batched) couldn't even run at the root node // So it will most likely perform poorly compared to DS - // Also, if the number of candidate is very small we don't use batch PDLP + // It is also off if the number of candidate is very small constexpr i_t min_num_candidates_for_pdlp = 5; const bool use_pdlp = (rb_mode != 0) && (pdlp_warm_cache.populated) && unreliable_list.size() > min_num_candidates_for_pdlp; @@ -1080,22 +1097,18 @@ i_t pseudo_costs_t::reliable_variable_selection( std::vector ds_status_down(num_candidates, dual::status_t::UNSET); std::vector ds_status_up(num_candidates, dual::status_t::UNSET); - omp_atomic_t ds_optimal{0}; - omp_atomic_t ds_infeasible{0}; - omp_atomic_t ds_failed{0}; - omp_atomic_t ds_skipped{0}; f_t ds_start_time = tic(); if (rb_mode != 2) { #pragma omp taskloop if (num_tasks > 1) priority(task_priority) num_tasks(num_tasks) \ - shared(score_mutex, ds_optimal, ds_infeasible, ds_failed, ds_skipped, ds_obj_down, ds_obj_up, ds_status_down, ds_status_up, sb_view) + shared(score_mutex, sb_view) for (i_t i = 0; i < num_candidates; ++i) { const i_t j = unreliable_list[i]; if (toc(start_time) > settings.time_limit) { continue; } if (rb_mode == 1 && sb_view.is_solved(i)) { - ds_skipped++; + log.printf("DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); } else 
{ pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { @@ -1120,22 +1133,14 @@ i_t pseudo_costs_t::reliable_variable_selection( ds_obj_down[i] = obj; ds_status_down[i] = status; - if (std::isnan(obj)) { - ds_failed++; - } else if (std::isinf(obj)) { - ds_infeasible++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = solution[j] - std::floor(solution[j]); - pseudo_cost_sum_down[j] += change_in_obj / change_in_x; - pseudo_cost_num_down[j]++; - } else { - ds_optimal++; + if (!std::isnan(obj)) { f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); f_t change_in_x = solution[j] - std::floor(solution[j]); pseudo_cost_sum_down[j] += change_in_obj / change_in_x; pseudo_cost_num_down[j]++; + // Should be valid if were are already here + if (rb_mode == 1 && ds_is_valid_done(status)) { sb_view.mark_solved(i); } } - if (rb_mode == 1) { sb_view.mark_solved(i); } } pseudo_cost_mutex_down[j].unlock(); } @@ -1144,7 +1149,7 @@ i_t pseudo_costs_t::reliable_variable_selection( const i_t shared_idx = i + num_candidates; if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { - ds_skipped++; + log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", j, shared_idx); } else { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { @@ -1168,22 +1173,14 @@ i_t pseudo_costs_t::reliable_variable_selection( ds_obj_up[i] = obj; ds_status_up[i] = status; - if (std::isnan(obj)) { - ds_failed++; - } else if (std::isinf(obj)) { - ds_infeasible++; - f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); - f_t change_in_x = std::ceil(solution[j]) - solution[j]; - pseudo_cost_sum_up[j] += change_in_obj / change_in_x; - pseudo_cost_num_up[j]++; - } else { - ds_optimal++; + if (!std::isnan(obj)) { f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); f_t change_in_x = std::ceil(solution[j]) - solution[j]; pseudo_cost_sum_up[j] += change_in_obj / 
change_in_x; pseudo_cost_num_up[j]++; + // Should be valid if were are already here + if (rb_mode == 1 && ds_is_valid_done(status)) { sb_view.mark_solved(shared_idx); } } - if (rb_mode == 1) { sb_view.mark_solved(shared_idx); } } pseudo_cost_mutex_up[j].unlock(); } @@ -1205,25 +1202,26 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t ds_elapsed = toc(ds_start_time); - if (rb_mode != 2) { - if (rb_mode == 1) { - log.printf( - "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped (PDLP) in %.2fs\n", - num_candidates, - ds_optimal.load(), num_candidates * 2, - ds_infeasible.load(), num_candidates * 2, - ds_failed.load(), num_candidates * 2, - ds_skipped.load(), ds_elapsed); - } else { - log.printf( - "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in %.2fs\n", - num_candidates, - ds_optimal.load(), num_candidates * 2, - ds_infeasible.load(), num_candidates * 2, - ds_failed.load(), num_candidates * 2, - ds_elapsed); - } - } + // TODO put back + //if (rb_mode != 2) { + // if (rb_mode == 1) { + // log.printf( + // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped (PDLP) in %.2fs\n", + // num_candidates, + // ds_optimal.load(), num_candidates * 2, + // ds_infeasible.load(), num_candidates * 2, + // ds_failed.load(), num_candidates * 2, + // ds_skipped.load(), ds_elapsed); + // } else { + // log.printf( + // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in %.2fs\n", + // num_candidates, + // ds_optimal.load(), num_candidates * 2, + // ds_infeasible.load(), num_candidates * 2, + // ds_failed.load(), num_candidates * 2, + // ds_elapsed); + // } + //} if (use_pdlp) { pdlp_thread.join(); diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index b9cdb8c9c6..a27ecd965c 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -996,7 +996,7 @@ optimization_problem_solution_t run_batch_pdlp( } } - + // Only used in 
tests const bool collect_solutions = settings.generate_batch_primal_dual_solution; rmm::device_uvector full_primal_solution((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); @@ -1053,26 +1053,26 @@ optimization_problem_solution_t run_batch_pdlp( auto sol = solve_lp(problem, batch_settings); + + if (collect_solutions) { + raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), + sol.get_primal_solution().data(), + sol.get_primal_solution().size(), + stream); + raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), + sol.get_dual_solution().data(), + sol.get_dual_solution().size(), + stream); + raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), + sol.get_reduced_cost().data(), + sol.get_reduced_cost().size(), + stream); + } auto info = sol.get_additional_termination_informations(); full_info.insert(full_info.end(), info.begin(), info.end()); auto status = sol.get_terminations_status(); full_status.insert(full_status.end(), status.begin(), status.end()); - - if (collect_solutions) { - raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), - sol.get_primal_solution().data(), - sol.get_primal_solution().size(), - stream); - raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), - sol.get_dual_solution().data(), - sol.get_dual_solution().size(), - stream); - raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), - sol.get_reduced_cost().data(), - sol.get_reduced_cost().size(), - stream); - } } return optimization_problem_solution_t(full_primal_solution, From 697908624d49fa51f1b41cb616f8a887c8d638ef Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 25 Mar 2026 15:05:31 +0100 Subject: [PATCH 33/43] reduce accuracy to 1e-5, no BPDLP if in sub mip, disable BPDLP in RB if root BPDLP couldn't solve more than 5% --- cpp/src/branch_and_bound/pseudo_costs.cpp | 35 ++++++++++++++++------- cpp/src/branch_and_bound/pseudo_costs.hpp | 1 + 
cpp/src/pdlp/solve.cu | 2 +- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index c9f96b3666..a9b2177a29 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -434,6 +434,8 @@ void strong_branching(const lp_problem_t& original_lp, const f_t elapsed_time = toc(start_time); if (elapsed_time > settings.time_limit) { return; } + const i_t effective_batch_pdlp = settings.sub_mip ? 0 : settings.mip_batch_pdlp_strong_branching; + settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", settings.num_threads, fractional.size()); @@ -449,10 +451,10 @@ void strong_branching(const lp_problem_t& original_lp, auto pdlp_thread = std::thread([&]() { - if (settings.mip_batch_pdlp_strong_branching == 0) + if (effective_batch_pdlp == 0) return; - settings.log.printf(settings.mip_batch_pdlp_strong_branching == 2 + settings.log.printf(effective_batch_pdlp == 2 ? 
"Batch PDLP only for strong branching\n" : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); @@ -494,7 +496,7 @@ void strong_branching(const lp_problem_t& original_lp, constexpr int warm_start_iteration_limit = 500000; ws_settings.iteration_limit = warm_start_iteration_limit; ws_settings.time_limit = warm_start_remaining_time; - constexpr f_t pdlp_tolerance = 1e-6; + constexpr f_t pdlp_tolerance = 1e-5; ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; @@ -502,7 +504,7 @@ void strong_branching(const lp_problem_t& original_lp, ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; ws_settings.inside_mip = true; - if (settings.mip_batch_pdlp_strong_branching == 1) { + if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } @@ -543,7 +545,7 @@ void strong_branching(const lp_problem_t& original_lp, if (concurrent_halt.load() == 1) { return; } pdlp_solver_settings_t pdlp_settings; - if (settings.mip_batch_pdlp_strong_branching == 1) { + if (effective_batch_pdlp == 1) { pdlp_settings.concurrent_halt = &concurrent_halt; pdlp_settings.shared_sb_view = sb_view; } @@ -631,7 +633,7 @@ void strong_branching(const lp_problem_t& original_lp, std::vector ds_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); f_t dual_simplex_strong_branching_time = tic(); - if (settings.mip_batch_pdlp_strong_branching != 2) { + if (effective_batch_pdlp != 2) { #pragma omp parallel num_threads(settings.num_threads) { i_t n = std::min(4 * settings.num_threads, fractional.size()); @@ -714,7 +716,7 @@ void strong_branching(const lp_problem_t& original_lp, if (ds_unset) settings.log.printf(", %d unset/skipped", ds_unset); settings.log.printf("\n"); - if (settings.mip_batch_pdlp_strong_branching != 0) { + if (effective_batch_pdlp != 
0) { i_t pdlp_optimal_count = 0; for (i_t k = 0; k < fractional.size(); k++) { if (!std::isnan(pdlp_obj_down[k])) pdlp_optimal_count++; @@ -724,7 +726,7 @@ void strong_branching(const lp_problem_t& original_lp, settings.log.printf( "Batch PDLP found %d/%d optimal solutions\n", pdlp_optimal_count, - fractional.size() * 2); + static_cast(fractional.size() * 2)); } i_t merged_from_ds = 0; @@ -762,7 +764,10 @@ void strong_branching(const lp_problem_t& original_lp, } } - if (settings.mip_batch_pdlp_strong_branching != 0) { + + if (effective_batch_pdlp != 0) { + pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; + settings.log.printf("Batch PDLP only for strong branching. Pourcent solved by batch PDLP at root: %f\n", pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root); settings.log.printf( "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both (down/up)\n", merged_from_ds, @@ -946,15 +951,23 @@ i_t pseudo_costs_t::reliable_variable_selection( // This indicates that PDLP alone (not batched) couldn't even run at the root node // So it will most likely perform poorly compared to DS // It is also off if the number of candidate is very small + // If warm start could run but almost none of the BPDLP results were used, we also want to avoid using batch PDLP constexpr i_t min_num_candidates_for_pdlp = 5; - const bool use_pdlp = (rb_mode != 0) && (pdlp_warm_cache.populated) && unreliable_list.size() > min_num_candidates_for_pdlp; + constexpr f_t min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; + const bool use_pdlp = (rb_mode != 0) && pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; if (rb_mode != 0 && !pdlp_warm_cache.populated) { log.printf("PDLP warm start data not populated, using DS only\n"); } - if (rb_mode != 0 && 
unreliable_list.size() < min_num_candidates_for_pdlp) { + else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { log.printf("Not enough candidates to use batch PDLP, using DS only\n"); } + else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { + log.printf("Pourcent solved by batch PDLP at root is too low, using DS only\n"); + } + else if (use_pdlp) { + log.printf("Using batch PDLP because populated, unreliable list size is %d (> %d), and pourcent solved by batch PDLP at root is %f%% (> %f%%)\n", static_cast(unreliable_list.size()), min_num_candidates_for_pdlp, pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root, min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp); + } const int num_tasks = std::max(max_num_tasks, 1); const int task_priority = reliability_branching_settings.task_priority; diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index c48ed908d7..be8f9f71d4 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -416,6 +416,7 @@ struct batch_pdlp_warm_cache_t { f_t step_size{std::numeric_limits::signaling_NaN()}; f_t primal_weight{std::numeric_limits::signaling_NaN()}; i_t pdlp_iteration{-1}; + f_t pourcent_solved_by_batch_pdlp_at_root{f_t(0.0)}; bool populated{false}; }; diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index a27ecd965c..275c119d03 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -909,7 +909,7 @@ optimization_problem_solution_t run_batch_pdlp( constexpr bool use_initial_pdlp_iterations = true; bool use_optimal_batch_size = false; constexpr int batch_iteration_limit = 100000; - constexpr f_t pdlp_tolerance = 1e-6; + constexpr f_t pdlp_tolerance = 1e-5; rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); From b0061e4805a0b33f0b6aa5b7a834ebc314c5aaff Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Wed, 25 Mar 2026 15:21:20 +0100 Subject: 
[PATCH 34/43] empty just to run a new benchmark From f504a75561060f38963ac476e52dd32db13ab743 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 11:29:06 +0200 Subject: [PATCH 35/43] fix PR review comments --- .../cuopt/linear_programming/constants.h | 112 ++-- .../mip/solver_settings.hpp | 18 +- .../pdlp/solver_settings.hpp | 8 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 483 ++++++++++-------- .../shared_strong_branching_context.hpp | 5 +- .../dual_simplex/simplex_solver_settings.hpp | 6 +- cpp/src/math_optimization/solver_settings.cu | 4 +- cpp/src/pdlp/pdlp.cu | 34 +- cpp/src/pdlp/pdlp_constants.hpp | 2 - cpp/src/pdlp/solve.cu | 123 +++-- cpp/src/pdlp/solver_settings.cu | 21 + .../termination_strategy.cu | 5 +- cpp/src/pdlp/utilities/ping_pong_graph.cu | 1 + cpp/tests/linear_programming/pdlp_test.cu | 36 +- .../linear_programming/data_definition.py | 4 +- 15 files changed, 470 insertions(+), 392 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/constants.h b/cpp/include/cuopt/linear_programming/constants.h index c20a20a571..1b9d7e85a4 100644 --- a/cpp/include/cuopt/linear_programming/constants.h +++ b/cpp/include/cuopt/linear_programming/constants.h @@ -20,63 +20,63 @@ #define CUOPT_INSTANTIATE_INT64 0 /* @brief LP/MIP parameter string constants */ -#define CUOPT_ABSOLUTE_DUAL_TOLERANCE "absolute_dual_tolerance" -#define CUOPT_RELATIVE_DUAL_TOLERANCE "relative_dual_tolerance" -#define CUOPT_ABSOLUTE_PRIMAL_TOLERANCE "absolute_primal_tolerance" -#define CUOPT_RELATIVE_PRIMAL_TOLERANCE "relative_primal_tolerance" -#define CUOPT_ABSOLUTE_GAP_TOLERANCE "absolute_gap_tolerance" -#define CUOPT_RELATIVE_GAP_TOLERANCE "relative_gap_tolerance" -#define CUOPT_INFEASIBILITY_DETECTION "infeasibility_detection" -#define CUOPT_STRICT_INFEASIBILITY "strict_infeasibility" -#define CUOPT_PRIMAL_INFEASIBLE_TOLERANCE "primal_infeasible_tolerance" -#define CUOPT_DUAL_INFEASIBLE_TOLERANCE "dual_infeasible_tolerance" -#define CUOPT_ITERATION_LIMIT 
"iteration_limit" -#define CUOPT_TIME_LIMIT "time_limit" -#define CUOPT_WORK_LIMIT "work_limit" -#define CUOPT_PDLP_SOLVER_MODE "pdlp_solver_mode" -#define CUOPT_METHOD "method" -#define CUOPT_PER_CONSTRAINT_RESIDUAL "per_constraint_residual" -#define CUOPT_SAVE_BEST_PRIMAL_SO_FAR "save_best_primal_so_far" -#define CUOPT_FIRST_PRIMAL_FEASIBLE "first_primal_feasible" -#define CUOPT_LOG_FILE "log_file" -#define CUOPT_LOG_TO_CONSOLE "log_to_console" -#define CUOPT_CROSSOVER "crossover" -#define CUOPT_FOLDING "folding" -#define CUOPT_AUGMENTED "augmented" -#define CUOPT_DUALIZE "dualize" -#define CUOPT_ORDERING "ordering" -#define CUOPT_BARRIER_DUAL_INITIAL_POINT "barrier_dual_initial_point" -#define CUOPT_ELIMINATE_DENSE_COLUMNS "eliminate_dense_columns" -#define CUOPT_CUDSS_DETERMINISTIC "cudss_deterministic" -#define CUOPT_PRESOLVE "presolve" -#define CUOPT_DUAL_POSTSOLVE "dual_postsolve" -#define CUOPT_MIP_DETERMINISM_MODE "mip_determinism_mode" -#define CUOPT_MIP_ABSOLUTE_TOLERANCE "mip_absolute_tolerance" -#define CUOPT_MIP_RELATIVE_TOLERANCE "mip_relative_tolerance" -#define CUOPT_MIP_INTEGRALITY_TOLERANCE "mip_integrality_tolerance" -#define CUOPT_MIP_ABSOLUTE_GAP "mip_absolute_gap" -#define CUOPT_MIP_RELATIVE_GAP "mip_relative_gap" -#define CUOPT_MIP_HEURISTICS_ONLY "mip_heuristics_only" -#define CUOPT_MIP_SCALING "mip_scaling" -#define CUOPT_MIP_PRESOLVE "mip_presolve" -#define CUOPT_MIP_RELIABILITY_BRANCHING "mip_reliability_branching" -#define CUOPT_MIP_CUT_PASSES "mip_cut_passes" -#define CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS "mip_mixed_integer_rounding_cuts" -#define CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS "mip_mixed_integer_gomory_cuts" -#define CUOPT_MIP_KNAPSACK_CUTS "mip_knapsack_cuts" -#define CUOPT_MIP_CLIQUE_CUTS "mip_clique_cuts" -#define CUOPT_MIP_STRONG_CHVATAL_GOMORY_CUTS "mip_strong_chvatal_gomory_cuts" -#define CUOPT_MIP_REDUCED_COST_STRENGTHENING "mip_reduced_cost_strengthening" -#define CUOPT_MIP_CUT_CHANGE_THRESHOLD "mip_cut_change_threshold" 
-#define CUOPT_MIP_CUT_MIN_ORTHOGONALITY "mip_cut_min_orthogonality" -#define CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING "mip_batch_pdlp_strong_branching" +#define CUOPT_ABSOLUTE_DUAL_TOLERANCE "absolute_dual_tolerance" +#define CUOPT_RELATIVE_DUAL_TOLERANCE "relative_dual_tolerance" +#define CUOPT_ABSOLUTE_PRIMAL_TOLERANCE "absolute_primal_tolerance" +#define CUOPT_RELATIVE_PRIMAL_TOLERANCE "relative_primal_tolerance" +#define CUOPT_ABSOLUTE_GAP_TOLERANCE "absolute_gap_tolerance" +#define CUOPT_RELATIVE_GAP_TOLERANCE "relative_gap_tolerance" +#define CUOPT_INFEASIBILITY_DETECTION "infeasibility_detection" +#define CUOPT_STRICT_INFEASIBILITY "strict_infeasibility" +#define CUOPT_PRIMAL_INFEASIBLE_TOLERANCE "primal_infeasible_tolerance" +#define CUOPT_DUAL_INFEASIBLE_TOLERANCE "dual_infeasible_tolerance" +#define CUOPT_ITERATION_LIMIT "iteration_limit" +#define CUOPT_TIME_LIMIT "time_limit" +#define CUOPT_WORK_LIMIT "work_limit" +#define CUOPT_PDLP_SOLVER_MODE "pdlp_solver_mode" +#define CUOPT_METHOD "method" +#define CUOPT_PER_CONSTRAINT_RESIDUAL "per_constraint_residual" +#define CUOPT_SAVE_BEST_PRIMAL_SO_FAR "save_best_primal_so_far" +#define CUOPT_FIRST_PRIMAL_FEASIBLE "first_primal_feasible" +#define CUOPT_LOG_FILE "log_file" +#define CUOPT_LOG_TO_CONSOLE "log_to_console" +#define CUOPT_CROSSOVER "crossover" +#define CUOPT_FOLDING "folding" +#define CUOPT_AUGMENTED "augmented" +#define CUOPT_DUALIZE "dualize" +#define CUOPT_ORDERING "ordering" +#define CUOPT_BARRIER_DUAL_INITIAL_POINT "barrier_dual_initial_point" +#define CUOPT_ELIMINATE_DENSE_COLUMNS "eliminate_dense_columns" +#define CUOPT_CUDSS_DETERMINISTIC "cudss_deterministic" +#define CUOPT_PRESOLVE "presolve" +#define CUOPT_DUAL_POSTSOLVE "dual_postsolve" +#define CUOPT_MIP_DETERMINISM_MODE "mip_determinism_mode" +#define CUOPT_MIP_ABSOLUTE_TOLERANCE "mip_absolute_tolerance" +#define CUOPT_MIP_RELATIVE_TOLERANCE "mip_relative_tolerance" +#define CUOPT_MIP_INTEGRALITY_TOLERANCE "mip_integrality_tolerance" 
+#define CUOPT_MIP_ABSOLUTE_GAP "mip_absolute_gap" +#define CUOPT_MIP_RELATIVE_GAP "mip_relative_gap" +#define CUOPT_MIP_HEURISTICS_ONLY "mip_heuristics_only" +#define CUOPT_MIP_SCALING "mip_scaling" +#define CUOPT_MIP_PRESOLVE "mip_presolve" +#define CUOPT_MIP_RELIABILITY_BRANCHING "mip_reliability_branching" +#define CUOPT_MIP_CUT_PASSES "mip_cut_passes" +#define CUOPT_MIP_MIXED_INTEGER_ROUNDING_CUTS "mip_mixed_integer_rounding_cuts" +#define CUOPT_MIP_MIXED_INTEGER_GOMORY_CUTS "mip_mixed_integer_gomory_cuts" +#define CUOPT_MIP_KNAPSACK_CUTS "mip_knapsack_cuts" +#define CUOPT_MIP_CLIQUE_CUTS "mip_clique_cuts" +#define CUOPT_MIP_STRONG_CHVATAL_GOMORY_CUTS "mip_strong_chvatal_gomory_cuts" +#define CUOPT_MIP_REDUCED_COST_STRENGTHENING "mip_reduced_cost_strengthening" +#define CUOPT_MIP_CUT_CHANGE_THRESHOLD "mip_cut_change_threshold" +#define CUOPT_MIP_CUT_MIN_ORTHOGONALITY "mip_cut_min_orthogonality" +#define CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING "mip_batch_pdlp_strong_branching" #define CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING "mip_batch_pdlp_reliability_branching" -#define CUOPT_SOLUTION_FILE "solution_file" -#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" -#define CUOPT_NUM_GPUS "num_gpus" -#define CUOPT_USER_PROBLEM_FILE "user_problem_file" -#define CUOPT_RANDOM_SEED "random_seed" -#define CUOPT_PDLP_PRECISION "pdlp_precision" +#define CUOPT_SOLUTION_FILE "solution_file" +#define CUOPT_NUM_CPU_THREADS "num_cpu_threads" +#define CUOPT_NUM_GPUS "num_gpus" +#define CUOPT_USER_PROBLEM_FILE "user_problem_file" +#define CUOPT_RANDOM_SEED "random_seed" +#define CUOPT_PDLP_PRECISION "pdlp_precision" /* @brief MIP determinism mode constants */ #define CUOPT_MODE_OPPORTUNISTIC 0 diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 62e88d5eb0..4af5e727d8 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ 
b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -94,14 +94,16 @@ class mip_solver_settings_t { i_t mixed_integer_gomory_cuts = -1; i_t knapsack_cuts = -1; i_t clique_cuts = -1; - i_t strong_chvatal_gomory_cuts = -1; - i_t reduced_cost_strengthening = -1; - f_t cut_change_threshold = -1.0; - f_t cut_min_orthogonality = 0.5; - i_t mip_batch_pdlp_strong_branching{1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only - i_t mip_batch_pdlp_reliability_branching{1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only - i_t num_gpus = 1; - bool log_to_console = true; + i_t strong_chvatal_gomory_cuts = -1; + i_t reduced_cost_strengthening = -1; + f_t cut_change_threshold = -1.0; + f_t cut_min_orthogonality = 0.5; + i_t mip_batch_pdlp_strong_branching{ + 1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t mip_batch_pdlp_reliability_branching{ + 1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t num_gpus = 1; + bool log_to_console = true; std::string log_file; std::string sol_file; diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index 17fa7c548f..40b61d4ab0 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -149,12 +149,12 @@ class pdlp_solver_settings_t { * @param[in] initial_primal_weight Initial primal weight. */ void set_initial_primal_weight(f_t initial_primal_weight); - /** + /** * @brief Set an initial pdlp iteration. * * @param[in] initial_pdlp_iteration Initial pdlp iteration. */ - void set_initial_pdlp_iteration(i_t initial_pdlp_iteration); + void set_initial_pdlp_iteration(i_t initial_pdlp_iteration); /** * @brief Set the pdlp warm start data. This allows to restart PDLP with a @@ -284,8 +284,8 @@ class pdlp_solver_settings_t { // concurrently i.e. 
if new_bounds.size() == 2, then 2 versions of the problem with updated bounds // will be solved concurrently std::vector> new_bounds; - // By default to save memory and speed we don't store and copy each climber's primal and dual solutions - // We only retrieve termination statistics and the objective values + // By default to save memory and speed we don't store and copy each climber's primal and dual + // solutions We only retrieve termination statistics and the objective values bool generate_batch_primal_dual_solution{false}; // Used to force batch PDLP to solve a subbatch of the problems at a time // The 0 default value will make the solver use its heuristic to determine the subbatch size diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index a9b2177a29..0dbc4764f5 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -13,6 +13,8 @@ #include #include +#include + #include #include @@ -27,7 +29,8 @@ namespace { static bool ds_is_valid_done(dual::status_t status) { - return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; + return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || + status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; } template @@ -67,8 +70,12 @@ void strong_branch_helper(i_t start, // Batch PDLP has already solved this subproblem, skip it if (sb_view.is_valid() && sb_view.is_solved(shared_idx)) { settings.log.printf( - "[COOP SB] DS thread %d skipping variable %d branch %s (shared_idx %d): already solved by PDLP\n", - thread_id, j, branch == 0 ? "down" : "up", shared_idx); + "[COOP SB] DS thread %d skipping variable %d branch %s (shared_idx %d): already solved " + "by PDLP\n", + thread_id, + j, + branch == 0 ? 
"down" : "up", + shared_idx); continue; } @@ -105,7 +112,8 @@ void strong_branch_helper(i_t start, if (status == dual::status_t::DUAL_UNBOUNDED) { // LP was infeasible obj = std::numeric_limits::infinity(); - } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF) { + } else if (status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || + status == dual::status_t::CUTOFF) { obj = compute_objective(child_problem, solution.x); } else { settings.log.debug("Thread id %2d remaining %d variable %d branch %d status %d\n", @@ -118,7 +126,7 @@ void strong_branch_helper(i_t start, if (branch == 0) { pc.strong_branch_down[k] = std::max(obj - root_obj, 0.0); - ds_obj_down[k] = std::max(obj - root_obj, 0.0); + ds_obj_down[k] = std::max(obj - root_obj, 0.0); ds_status_down[k] = status; if (verbose) { settings.log.printf("Thread id %2d remaining %d variable %d branch %d obj %e time %.2f\n", @@ -131,7 +139,7 @@ void strong_branch_helper(i_t start, } } else { pc.strong_branch_up[k] = std::max(obj - root_obj, 0.0); - ds_obj_up[k] = std::max(obj - root_obj, 0.0); + ds_obj_up[k] = std::max(obj - root_obj, 0.0); ds_status_up[k] = status; if (verbose) { settings.log.printf( @@ -150,21 +158,21 @@ void strong_branch_helper(i_t start, // Mark the subproblem as solved so that batch PDLP removes it from the batch if (sb_view.is_valid()) { // We could not mark as solved nodes hitting iteartion limit in DS - if ((branch == 0 && ds_is_valid_done(ds_status_down[k])) || (branch == 1 && ds_is_valid_done(ds_status_up[k]))) - { + if ((branch == 0 && ds_is_valid_done(ds_status_down[k])) || + (branch == 1 && ds_is_valid_done(ds_status_up[k]))) { sb_view.mark_solved(shared_idx); settings.log.printf( - "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in shared context\n", - thread_id, j, branch == 0 ? 
"down" : "up", shared_idx); + "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in " + "shared context\n", + thread_id, + j, + branch == 0 ? "down" : "up", + shared_idx); } } - if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { - break; - } - } - if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { - break; + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; } } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; } const i_t completed = pc.num_strong_branches_completed++; @@ -179,30 +187,28 @@ void strong_branch_helper(i_t start, child_problem.lower[j] = original_lp.lower[j]; child_problem.upper[j] = original_lp.upper[j]; - if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { - break; - } + if (toc(start_time) > settings.time_limit || *concurrent_halt == 1) { break; } } } template std::pair trial_branching(const lp_problem_t& original_lp, - const simplex_solver_settings_t& settings, - const std::vector& var_types, - const std::vector& vstatus, - const std::vector& edge_norms, - const basis_update_mpf_t& basis_factors, - const std::vector& basic_list, - const std::vector& nonbasic_list, - i_t branch_var, - f_t branch_var_lower, - f_t branch_var_upper, - f_t upper_bound, - i_t bnb_lp_iter_per_node, - f_t start_time, - i_t upper_max_lp_iter, - i_t lower_max_lp_iter, - omp_atomic_t& total_lp_iter) + const simplex_solver_settings_t& settings, + const std::vector& var_types, + const std::vector& vstatus, + const std::vector& edge_norms, + const basis_update_mpf_t& basis_factors, + const std::vector& basic_list, + const std::vector& nonbasic_list, + i_t branch_var, + f_t branch_var_lower, + f_t branch_var_upper, + f_t upper_bound, + i_t bnb_lp_iter_per_node, + f_t start_time, + i_t upper_max_lp_iter, + i_t lower_max_lp_iter, + omp_atomic_t& total_lp_iter) { lp_problem_t child_problem = original_lp; child_problem.lower[branch_var] = 
branch_var_lower; @@ -271,7 +277,6 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data const std::vector& root_soln, std::vector& original_root_soln_x) { - // Branch and bound has a problem of the form: // minimize c^T x // subject to A*x + Es = b @@ -285,7 +290,6 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data // subject to lb <= A*x <= ub // l <= x <= u - cuopt::mps_parser::mps_data_model_t mps_model; int m = lp.num_rows; int n = lp.num_cols - new_slacks.size(); @@ -331,8 +335,8 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data std::vector slack_map(m, -1); for (i_t j : new_slacks) { const i_t col_start = lp.A.col_start[j]; - const i_t i = lp.A.i[col_start]; - slack_map[i] = j; + const i_t i = lp.A.i[col_start]; + slack_map[i] = j; } for (i_t i = 0; i < m; ++i) { @@ -354,8 +358,8 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data const i_t slack = slack_map[i]; assert(slack != -1); - const i_t col_start = lp.A.col_start[slack]; - const f_t sigma = lp.A.x[col_start]; + const i_t col_start = lp.A.col_start[slack]; + const f_t sigma = lp.A.x[col_start]; const f_t slack_lower = lp.lower[slack]; const f_t slack_upper = lp.upper[slack]; @@ -387,9 +391,9 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data // Return {value, source} where source is 0 if Dual Simplex, 1 if PDLP, 2 if both template static std::pair merge_sb_result(f_t ds_val, - dual::status_t ds_status, - f_t pdlp_dual_obj, - bool pdlp_optimal) + dual::status_t ds_status, + f_t pdlp_dual_obj, + bool pdlp_optimal) { // Dual simplex always maintains dual feasibility, so OPTIMAL and ITERATION_LIMIT both qualify @@ -397,20 +401,24 @@ static std::pair merge_sb_result(f_t ds_val, if (ds_status == dual::status_t::OPTIMAL && pdlp_optimal) { return {ds_val, 0}; } // Rule 2: Dual Simplex found infeasible -> declare infeasible - if (ds_status == dual::status_t::DUAL_UNBOUNDED) { return 
{std::numeric_limits::infinity(), 0}; } + if (ds_status == dual::status_t::DUAL_UNBOUNDED) { + return {std::numeric_limits::infinity(), 0}; + } // Rule 3: Only one converged -> keep that if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {ds_val, 0}; } if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, 1}; } // Rule 4: Dual Simplex hit iteration limit or work limit or cutoff -> keep DS - if (ds_status == dual::status_t::ITERATION_LIMIT || ds_status == dual::status_t::WORK_LIMIT || ds_status == dual::status_t::CUTOFF) { return {ds_val, 0}; } + if (ds_status == dual::status_t::ITERATION_LIMIT || ds_status == dual::status_t::WORK_LIMIT || + ds_status == dual::status_t::CUTOFF) { + return {ds_val, 0}; + } // Rule 5: None converged -> NaN return {std::numeric_limits::quiet_NaN(), 2}; } - template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, @@ -450,28 +458,27 @@ void strong_branching(const lp_problem_t& original_lp, std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); auto pdlp_thread = std::thread([&]() { - - if (effective_batch_pdlp == 0) - return; - + if (effective_batch_pdlp == 0) return; + settings.log.printf(effective_batch_pdlp == 2 - ? "Batch PDLP only for strong branching\n" - : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); + ? 
"Batch PDLP only for strong branching\n" + : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); f_t start_batch = tic(); std::vector original_root_soln_x; if (concurrent_halt.load() == 1) { return; } - const auto mps_model = simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); - + const auto mps_model = + simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); std::vector fraction_values; std::vector original_root_soln_y, original_root_soln_z; // TODO put back later once Chris has this part /*uncrush_dual_solution( - original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, original_root_soln_z);*/ + original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, + original_root_soln_z);*/ for (i_t k = 0; k < fractional.size(); k++) { const i_t j = fractional[k]; @@ -485,58 +492,69 @@ void strong_branching(const lp_problem_t& original_lp, std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (warm_start_remaining_time <= 0.0) { return; } + assert(!pc.pdlp_warm_cache.populated && + "PDLP warm cache should not be populated at this point"); + if (!pc.pdlp_warm_cache.populated) { pdlp_solver_settings_t ws_settings; ws_settings.method = method_t::PDLP; ws_settings.presolver = presolver_t::None; ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; ws_settings.detect_infeasibility = false; - // Since the warm start will be used over and over again we want to maximize the chance of convergeance - // Batch PDLP is very compute intensive so we want to minimize the number of iterations - constexpr int warm_start_iteration_limit = 500000; - ws_settings.iteration_limit = warm_start_iteration_limit; - ws_settings.time_limit = warm_start_remaining_time; - constexpr f_t pdlp_tolerance = 1e-5; - ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + // 
Since the warm start will be used over and over again we want to maximize the chance of + // convergeance Batch PDLP is very compute intensive so we want to minimize the number of + // iterations + constexpr int warm_start_iteration_limit = 500000; + ws_settings.iteration_limit = warm_start_iteration_limit; + ws_settings.time_limit = warm_start_remaining_time; + constexpr f_t pdlp_tolerance = 1e-5; + ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; ws_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; - ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; - ws_settings.inside_mip = true; - if (effective_batch_pdlp == 1) { - ws_settings.concurrent_halt = &concurrent_halt; - } + ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + ws_settings.inside_mip = true; + if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE auto start_time = std::chrono::high_resolution_clock::now(); - #endif +#endif auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::duration_cast(end_time - start_time).count(); - std::cout << "Original problem solved in " << duration << " milliseconds" << " and iterations: " << original_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; - #endif + auto duration = + std::chrono::duration_cast(end_time - start_time).count(); + std::cout << "Original problem solved in " << duration << " milliseconds" + << " and iterations: " + << 
ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; +#endif if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { - auto& cache = pc.pdlp_warm_cache; + auto& cache = pc.pdlp_warm_cache; const auto& ws_primal = ws_solution.get_primal_solution(); const auto& ws_dual = ws_solution.get_dual_solution(); - // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm start - cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); - cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); - cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; - cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; - cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; - cache.populated = true; - - settings.log.printf("Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", - cache.initial_primal.size(), cache.initial_dual.size(), - cache.step_size, cache.primal_weight, cache.pdlp_iteration); + // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm + // start + cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + cache.populated = true; + + settings.log.printf( + "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", + cache.initial_primal.size(), + cache.initial_dual.size(), + cache.step_size, + cache.primal_weight, + cache.pdlp_iteration); } else { - settings.log.printf("PDLP warm start solve did not reach optimality (%s), skipping cache and batch 
PDLP\n", + settings.log.printf( + "PDLP warm start solve did not reach optimality (%s), skipping cache and batch PDLP\n", ws_solution.get_termination_status_string().c_str()); return; } @@ -552,14 +570,15 @@ void strong_branching(const lp_problem_t& original_lp, batch_elapsed_time = toc(start_time); const f_t batch_remaining_time = - std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (batch_remaining_time <= 0.0) { return; } pdlp_settings.time_limit = batch_remaining_time; if (pc.pdlp_warm_cache.populated) { auto& cache = pc.pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution( - cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), + cache.initial_primal.size(), + cache.batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_dual_solution( cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_step_size(cache.step_size); @@ -569,8 +588,8 @@ void strong_branching(const lp_problem_t& original_lp, if (concurrent_halt.load() == 1) { return; } - const auto solutions = - batch_pdlp_solve(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); + const auto solutions = batch_pdlp_solve( + &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); f_t batch_pdlp_strong_branching_time = toc(start_batch); // Fail safe in case the batch PDLP failed and produced no solutions @@ -624,9 +643,8 @@ void strong_branching(const lp_problem_t& original_lp, pdlp_obj_down[k] = std::max(obj_down - root_obj, f_t(0.0)); pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); } - }); - + std::vector ds_status_down(fractional.size(), dual::status_t::UNSET); std::vector ds_status_up(fractional.size(), dual::status_t::UNSET); std::vector 
ds_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); @@ -676,15 +694,14 @@ void strong_branching(const lp_problem_t& original_lp, } } - // DS done: signal PDLP to stop (time-limit or all work done) and wait - concurrent_halt.store(1); + // DS done: signal PDLP to stop (time-limit or all work done) and wait + concurrent_halt.store(1); } pdlp_thread.join(); settings.log.printf("Strong branching took %.2fs\n", toc(dual_simplex_strong_branching_time)); - // Collect Dual Simplex statistics i_t ds_optimal = 0, ds_infeasible = 0, ds_iter_limit = 0; i_t ds_numerical = 0, ds_cutoff = 0, ds_time_limit = 0; @@ -693,27 +710,30 @@ void strong_branching(const lp_problem_t& original_lp, for (i_t k = 0; k < fractional.size(); k++) { for (auto st : {ds_status_down[k], ds_status_up[k]}) { switch (st) { - case dual::status_t::OPTIMAL: ds_optimal++; break; - case dual::status_t::DUAL_UNBOUNDED: ds_infeasible++; break; - case dual::status_t::ITERATION_LIMIT: ds_iter_limit++; break; - case dual::status_t::NUMERICAL: ds_numerical++; break; - case dual::status_t::CUTOFF: ds_cutoff++; break; - case dual::status_t::TIME_LIMIT: ds_time_limit++; break; - case dual::status_t::CONCURRENT_LIMIT: ds_concurrent++; break; - case dual::status_t::WORK_LIMIT: ds_work_limit++; break; - case dual::status_t::UNSET: ds_unset++; break; + case dual::status_t::OPTIMAL: ds_optimal++; break; + case dual::status_t::DUAL_UNBOUNDED: ds_infeasible++; break; + case dual::status_t::ITERATION_LIMIT: ds_iter_limit++; break; + case dual::status_t::NUMERICAL: ds_numerical++; break; + case dual::status_t::CUTOFF: ds_cutoff++; break; + case dual::status_t::TIME_LIMIT: ds_time_limit++; break; + case dual::status_t::CONCURRENT_LIMIT: ds_concurrent++; break; + case dual::status_t::WORK_LIMIT: ds_work_limit++; break; + case dual::status_t::UNSET: ds_unset++; break; } } } settings.log.printf("Dual Simplex: %d/%d optimal, %d infeasible, %d iter-limit", - ds_optimal, total_subproblems, ds_infeasible, ds_iter_limit); 
- if (ds_cutoff) settings.log.printf(", %d cutoff", ds_cutoff); + ds_optimal, + total_subproblems, + ds_infeasible, + ds_iter_limit); + if (ds_cutoff) settings.log.printf(", %d cutoff", ds_cutoff); if (ds_time_limit) settings.log.printf(", %d time-limit", ds_time_limit); - if (ds_numerical) settings.log.printf(", %d numerical", ds_numerical); + if (ds_numerical) settings.log.printf(", %d numerical", ds_numerical); if (ds_concurrent) settings.log.printf(", %d concurrent-halt", ds_concurrent); if (ds_work_limit) settings.log.printf(", %d work-limit", ds_work_limit); - if (ds_unset) settings.log.printf(", %d unset/skipped", ds_unset); + if (ds_unset) settings.log.printf(", %d unset/skipped", ds_unset); settings.log.printf("\n"); if (effective_batch_pdlp != 0) { @@ -723,53 +743,69 @@ void strong_branching(const lp_problem_t& original_lp, if (!std::isnan(pdlp_obj_up[k])) pdlp_optimal_count++; } - settings.log.printf( - "Batch PDLP found %d/%d optimal solutions\n", - pdlp_optimal_count, - static_cast(fractional.size() * 2)); + settings.log.printf("Batch PDLP found %d/%d optimal solutions\n", + pdlp_optimal_count, + static_cast(fractional.size() * 2)); } - i_t merged_from_ds = 0; - i_t merged_from_pdlp = 0; - i_t merged_nan = 0; + i_t merged_from_ds = 0; + i_t merged_from_pdlp = 0; + i_t merged_nan = 0; i_t solved_by_both_down = 0; - i_t solved_by_both_up = 0; + i_t solved_by_both_up = 0; for (i_t k = 0; k < fractional.size(); k++) { bool ds_has_down = ds_status_down[k] != dual::status_t::UNSET; bool pdlp_has_down = !std::isnan(pdlp_obj_down[k]); - const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], pdlp_obj_down[k], pdlp_has_down); + const auto [value_down, source_down] = + merge_sb_result(ds_obj_down[k], ds_status_down[k], pdlp_obj_down[k], pdlp_has_down); pc.strong_branch_down[k] = value_down; - if (source_down == 0) merged_from_ds++; - else if (source_down == 1) merged_from_pdlp++; - else merged_nan++; + if (source_down == 0) + 
merged_from_ds++; + else if (source_down == 1) + merged_from_pdlp++; + else + merged_nan++; if (ds_has_down && pdlp_has_down) { solved_by_both_down++; settings.log.printf( "[COOP SB] Merge: variable %d DOWN solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", - fractional[k], ds_obj_down[k], pdlp_obj_down[k], source_down == 0 ? "DS" : "PDLP"); + fractional[k], + ds_obj_down[k], + pdlp_obj_down[k], + source_down == 0 ? "DS" : "PDLP"); } bool ds_has_up = ds_status_up[k] != dual::status_t::UNSET; bool pdlp_has_up = !std::isnan(pdlp_obj_up[k]); - const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], pdlp_has_up); + const auto [value_up, source_up] = + merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], pdlp_has_up); pc.strong_branch_up[k] = value_up; - if (source_up == 0) merged_from_ds++; - else if (source_up == 1) merged_from_pdlp++; - else merged_nan++; + if (source_up == 0) + merged_from_ds++; + else if (source_up == 1) + merged_from_pdlp++; + else + merged_nan++; if (ds_has_up && pdlp_has_up) { solved_by_both_up++; settings.log.printf( "[COOP SB] Merge: variable %d UP solved by BOTH (DS=%e PDLP=%e) -> kept %s\n", - fractional[k], ds_obj_up[k], pdlp_obj_up[k], source_up == 0 ? "DS" : "PDLP"); + fractional[k], + ds_obj_up[k], + pdlp_obj_up[k], + source_up == 0 ? "DS" : "PDLP"); } } - if (effective_batch_pdlp != 0) { - pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; - settings.log.printf("Batch PDLP only for strong branching. Pourcent solved by batch PDLP at root: %f\n", pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root); + pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root = + (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; settings.log.printf( - "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both (down/up)\n", + "Batch PDLP only for strong branching. 
Pourcent solved by batch PDLP at root: %f\n", + pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root); + settings.log.printf( + "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both " + "(down/up)\n", merged_from_ds, merged_from_pdlp, merged_nan, @@ -951,28 +987,36 @@ i_t pseudo_costs_t::reliable_variable_selection( // This indicates that PDLP alone (not batched) couldn't even run at the root node // So it will most likely perform poorly compared to DS // It is also off if the number of candidate is very small - // If warm start could run but almost none of the BPDLP results were used, we also want to avoid using batch PDLP - constexpr i_t min_num_candidates_for_pdlp = 5; + // If warm start could run but almost none of the BPDLP results were used, we also want to avoid + // using batch PDLP + constexpr i_t min_num_candidates_for_pdlp = 5; constexpr f_t min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; - const bool use_pdlp = (rb_mode != 0) && pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; + const bool use_pdlp = (rb_mode != 0) && pdlp_warm_cache.populated && + unreliable_list.size() > min_num_candidates_for_pdlp && + pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > + min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; if (rb_mode != 0 && !pdlp_warm_cache.populated) { log.printf("PDLP warm start data not populated, using DS only\n"); - } - else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { + } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { log.printf("Not enough candidates to use batch PDLP, using DS only\n"); - } - else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { + } else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { 
log.printf("Pourcent solved by batch PDLP at root is too low, using DS only\n"); - } - else if (use_pdlp) { - log.printf("Using batch PDLP because populated, unreliable list size is %d (> %d), and pourcent solved by batch PDLP at root is %f%% (> %f%%)\n", static_cast(unreliable_list.size()), min_num_candidates_for_pdlp, pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root, min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp); + } else if (use_pdlp) { + log.printf( + "Using batch PDLP because populated, unreliable list size is %d (> %d), and pourcent solved " + "by batch PDLP at root is %f%% (> %f%%)\n", + static_cast(unreliable_list.size()), + min_num_candidates_for_pdlp, + pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root, + min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp); } - const int num_tasks = std::max(max_num_tasks, 1); - const int task_priority = reliability_branching_settings.task_priority; + const int num_tasks = std::max(max_num_tasks, 1); + const int task_priority = reliability_branching_settings.task_priority; // If both batch PDLP and DS are used we double the max number of candidates - const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates : reliability_branching_settings.max_num_candidates; + const i_t max_num_candidates = use_pdlp ? 2 * reliability_branching_settings.max_num_candidates + : reliability_branching_settings.max_num_candidates; const i_t num_candidates = std::min(unreliable_list.size(), max_num_candidates); assert(task_priority > 0); @@ -1008,10 +1052,9 @@ i_t pseudo_costs_t::reliable_variable_selection( if (use_pdlp) { pdlp_thread = std::thread([&]() { - log.printf(rb_mode == 2 - ? "RB batch PDLP only for %d candidates\n" - : "RB cooperative batch PDLP and DS for %d candidates\n", - num_candidates); + log.printf(rb_mode == 2 ? 
"RB batch PDLP only for %d candidates\n" + : "RB cooperative batch PDLP and DS for %d candidates\n", + num_candidates); f_t start_batch = tic(); @@ -1019,8 +1062,8 @@ i_t pseudo_costs_t::reliable_variable_selection( if (concurrent_halt.load() == 1) { return; } - auto mps_model = simplex_problem_to_mps_data_model( - original_lp, new_slacks, solution, original_soln_x); + auto mps_model = + simplex_problem_to_mps_data_model(original_lp, new_slacks, solution, original_soln_x); { const i_t n_orig = original_lp.num_cols - new_slacks.size(); for (i_t j = 0; j < n_orig; j++) { @@ -1049,13 +1092,14 @@ i_t pseudo_costs_t::reliable_variable_selection( } pdlp_settings.time_limit = batch_remaining_time; - if (pdlp_warm_cache.populated) { auto& cache = pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution( - cache.initial_primal.data(), cache.initial_primal.size(), cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), + cache.initial_primal.size(), + cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution(cache.initial_dual.data(), + cache.initial_dual.size(), + cache.batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_step_size(cache.step_size); pdlp_settings.set_initial_primal_weight(cache.primal_weight); pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); @@ -1063,8 +1107,11 @@ i_t pseudo_costs_t::reliable_variable_selection( if (concurrent_halt.load() == 1) { return; } - const auto solutions = batch_pdlp_solve( - &pdlp_warm_cache.batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); + const auto solutions = batch_pdlp_solve(&pdlp_warm_cache.batch_pdlp_handle, + mps_model, + candidate_vars, + fraction_values, + pdlp_settings); f_t batch_pdlp_time = toc(start_batch); @@ -1082,7 +1129,9 @@ i_t 
pseudo_costs_t::reliable_variable_selection( } log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", - batch_pdlp_time, amount_done, num_candidates * 2); + batch_pdlp_time, + amount_done, + num_candidates * 2); for (i_t k = 0; k < num_candidates; k++) { if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { @@ -1121,30 +1170,32 @@ i_t pseudo_costs_t::reliable_variable_selection( if (toc(start_time) > settings.time_limit) { continue; } if (rb_mode == 1 && sb_view.is_solved(i)) { - log.printf("DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); + log.printf( + "DS skipping variable %d branch down (shared_idx %d): already solved by PDLP\n", j, i); } else { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { // Do trial branching on the down branch - const auto [obj, status] = trial_branching(worker->leaf_problem, - settings, - var_types, - node_ptr->vstatus, - worker->leaf_edge_norms, - worker->basis_factors, - worker->basic_list, - worker->nonbasic_list, - j, - worker->leaf_problem.lower[j], - std::floor(solution[j]), - upper_bound, - branch_and_bound_lp_iter_per_node, - start_time, - reliability_branching_settings.upper_max_lp_iter, - reliability_branching_settings.lower_max_lp_iter, - strong_branching_lp_iter); - - ds_obj_down[i] = obj; + const auto [obj, status] = + trial_branching(worker->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker->leaf_edge_norms, + worker->basis_factors, + worker->basic_list, + worker->nonbasic_list, + j, + worker->leaf_problem.lower[j], + std::floor(solution[j]), + upper_bound, + branch_and_bound_lp_iter_per_node, + start_time, + reliability_branching_settings.upper_max_lp_iter, + reliability_branching_settings.lower_max_lp_iter, + strong_branching_lp_iter); + + ds_obj_down[i] = obj; ds_status_down[i] = status; if (!std::isnan(obj)) { f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); @@ -1162,29 
+1213,32 @@ i_t pseudo_costs_t::reliable_variable_selection( const i_t shared_idx = i + num_candidates; if (rb_mode == 1 && sb_view.is_solved(shared_idx)) { - log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", j, shared_idx); + log.printf("DS skipping variable %d branch up (shared_idx %d): already solved by PDLP\n", + j, + shared_idx); } else { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { - const auto [obj, status] = trial_branching(worker->leaf_problem, - settings, - var_types, - node_ptr->vstatus, - worker->leaf_edge_norms, - worker->basis_factors, - worker->basic_list, - worker->nonbasic_list, - j, - std::ceil(solution[j]), - worker->leaf_problem.upper[j], - upper_bound, - branch_and_bound_lp_iter_per_node, - start_time, - reliability_branching_settings.upper_max_lp_iter, - reliability_branching_settings.lower_max_lp_iter, - strong_branching_lp_iter); - - ds_obj_up[i] = obj; + const auto [obj, status] = + trial_branching(worker->leaf_problem, + settings, + var_types, + node_ptr->vstatus, + worker->leaf_edge_norms, + worker->basis_factors, + worker->basic_list, + worker->nonbasic_list, + j, + std::ceil(solution[j]), + worker->leaf_problem.upper[j], + upper_bound, + branch_and_bound_lp_iter_per_node, + start_time, + reliability_branching_settings.upper_max_lp_iter, + reliability_branching_settings.lower_max_lp_iter, + strong_branching_lp_iter); + + ds_obj_up[i] = obj; ds_status_up[i] = status; if (!std::isnan(obj)) { f_t change_in_obj = std::max(obj - node_ptr->lower_bound, eps); @@ -1216,23 +1270,19 @@ i_t pseudo_costs_t::reliable_variable_selection( f_t ds_elapsed = toc(ds_start_time); // TODO put back - //if (rb_mode != 2) { + // if (rb_mode != 2) { // if (rb_mode == 1) { // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped (PDLP) in %.2fs\n", - // num_candidates, - // ds_optimal.load(), num_candidates * 2, + // "RB Dual Simplex: 
%d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed, %d skipped + // (PDLP) in %.2fs\n", num_candidates, ds_optimal.load(), num_candidates * 2, // ds_infeasible.load(), num_candidates * 2, // ds_failed.load(), num_candidates * 2, // ds_skipped.load(), ds_elapsed); // } else { // log.printf( - // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in %.2fs\n", - // num_candidates, - // ds_optimal.load(), num_candidates * 2, - // ds_infeasible.load(), num_candidates * 2, - // ds_failed.load(), num_candidates * 2, - // ds_elapsed); + // "RB Dual Simplex: %d candidates, %d/%d optimal, %d/%d infeasible, %d/%d failed in + // %.2fs\n", num_candidates, ds_optimal.load(), num_candidates * 2, ds_infeasible.load(), + // num_candidates * 2, ds_failed.load(), num_candidates * 2, ds_elapsed); // } //} @@ -1249,7 +1299,8 @@ i_t pseudo_costs_t::reliable_variable_selection( pdlp_optimal++; const auto [merged_obj, source] = merge_sb_result(ds_obj_down[i], ds_status_down[i], pdlp_obj_down[i], true); - // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent calls may have made it reliable) + // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent + // calls may have made it reliable) if (source == 1) { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { @@ -1268,7 +1319,8 @@ i_t pseudo_costs_t::reliable_variable_selection( pdlp_optimal++; const auto [merged_obj, source] = merge_sb_result(ds_obj_up[i], ds_status_up[i], pdlp_obj_up[i], true); - // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent calls may have made it reliable) + // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent + // calls may have made it reliable) if (source == 1) { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { @@ -1290,7 +1342,10 @@ i_t 
pseudo_costs_t::reliable_variable_selection( } log.printf("RB batch PDLP: %d candidates, %d/%d optimal, %d applied to pseudo-costs\n", - num_candidates, pdlp_optimal, num_candidates * 2, pdlp_applied); + num_candidates, + pdlp_optimal, + num_candidates * 2, + pdlp_applied); } log.printf( diff --git a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp index 6cbea737f5..6840ccbb77 100644 --- a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp +++ b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp @@ -34,10 +34,7 @@ struct shared_strong_branching_context_view_t { bool is_valid() const { return !solved.empty(); } - bool is_solved(i_t local_idx) const - { - return solved[local_idx].load() != 0; - } + bool is_solved(i_t local_idx) const { return solved[local_idx].load() != 0; } void mark_solved(i_t local_idx) const { solved[local_idx].store(1); } diff --git a/cpp/src/dual_simplex/simplex_solver_settings.hpp b/cpp/src/dual_simplex/simplex_solver_settings.hpp index c097baf561..882f7a14f7 100644 --- a/cpp/src/dual_simplex/simplex_solver_settings.hpp +++ b/cpp/src/dual_simplex/simplex_solver_settings.hpp @@ -187,8 +187,10 @@ struct simplex_solver_settings_t { // strengthening f_t cut_change_threshold; // threshold for cut change f_t cut_min_orthogonality; // minimum orthogonality for cuts - i_t mip_batch_pdlp_strong_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only - i_t mip_batch_pdlp_reliability_branching{0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t + mip_batch_pdlp_strong_branching; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + i_t mip_batch_pdlp_reliability_branching; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch + // PDLP only diving_heuristics_settings_t diving_settings; // Settings for the diving heuristics diff --git a/cpp/src/math_optimization/solver_settings.cu 
b/cpp/src/math_optimization/solver_settings.cu index 749d89a35c..5440809754 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -99,8 +99,8 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, - {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, - {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, + {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 1}, + {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 1}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 37e9e1a31f..642c17758d 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -779,18 +779,19 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) // Sync external solved status into internal termination strategy before all_done() check if (settings_.shared_sb_view.is_valid()) { for (size_t i = 0; i < climber_strategies_.size(); ++i) { - // If PDLP has solved it to optimality we want to keep it and resolved both solvers having solved the problem later + // If PDLP has solved it to optimality we want to keep it and resolved both solvers having + // solved the problem later if 
(current_termination_strategy_.is_done( current_termination_strategy_.get_termination_status(i))) continue; const i_t local_idx = climber_strategies_[i].original_index; if (settings_.shared_sb_view.is_solved(local_idx)) { - current_termination_strategy_.set_termination_status(i, - pdlp_termination_status_t::ConcurrentLimit); + current_termination_strategy_.set_termination_status( + i, pdlp_termination_status_t::ConcurrentLimit); #ifdef BATCH_VERBOSE_MODE - std::cout << "[COOP SB] DS already solved climber " << i << " (original_index " - << local_idx << "), synced to ConcurrentLimit at step " - << internal_solver_iterations_ << std::endl; + std::cout << "[COOP SB] DS already solved climber " << i << " (original_index " << local_idx + << "), synced to ConcurrentLimit at step " << internal_solver_iterations_ + << std::endl; #endif } } @@ -1863,7 +1864,7 @@ void pdlp_solver_t::resize_and_swap_all_context_loop( &new_buf_size, stream_view_)); current_op_problem_evaluation_cusparse_view_.buffer_transpose_batch.resize(new_buf_size, - stream_view_); + stream_view_); // Convergence info: A * batch_primal_solutions -> batch_tmp_duals RAFT_CUSPARSE_TRY(raft::sparse::detail::cusparsespmm_bufferSize( @@ -1879,7 +1880,7 @@ void pdlp_solver_t::resize_and_swap_all_context_loop( &new_buf_size, stream_view_)); current_op_problem_evaluation_cusparse_view_.buffer_non_transpose_batch.resize(new_buf_size, - stream_view_); + stream_view_); } // Rerun preprocess @@ -2315,13 +2316,16 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co } if (settings_.get_initial_pdlp_iteration().has_value()) { total_pdlp_iterations_ = settings_.get_initial_pdlp_iteration().value(); - // This is meaningless in batch mode since pdhg step is never used, set it just to avoid assertions - pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, stream_view_); + // This is meaningless in batch mode since pdhg step is never used, set it just to avoid + // assertions + 
pdhg_solver_.get_d_total_pdhg_iterations().set_value_async(total_pdlp_iterations_, + stream_view_); pdhg_solver_.total_pdhg_iterations_ = total_pdlp_iterations_; - // Reset the fixed point error since at this pdlp iteration it is expected to already be initialized to some value + // Reset the fixed point error since at this pdlp iteration it is expected to already be + // initialized to some value std::fill(restart_strategy_.initial_fixed_point_error_.begin(), - restart_strategy_.initial_fixed_point_error_.end(), - f_t(0.0)); + restart_strategy_.initial_fixed_point_error_.end(), + f_t(0.0)); std::fill(restart_strategy_.fixed_point_error_.begin(), restart_strategy_.fixed_point_error_.end(), f_t(0.0)); @@ -2472,8 +2476,8 @@ optimization_problem_solution_t pdlp_solver_t::run_solver(co if (is_major_iteration || artificial_restart_check_main_loop || error_occured || is_conditional_major) { if (verbose) { - std::cout << "-------------------------------" << std::endl; - std::cout << internal_solver_iterations_ << std::endl; + std::cout << "-------------------------------" << std::endl; + std::cout << internal_solver_iterations_ << std::endl; raft::print_device_vector("step_size", step_size_.data(), step_size_.size(), std::cout); raft::print_device_vector( "primal_weight", primal_weight_.data(), primal_weight_.size(), std::cout); diff --git a/cpp/src/pdlp/pdlp_constants.hpp b/cpp/src/pdlp/pdlp_constants.hpp index cf17cc985b..568d7d00b0 100644 --- a/cpp/src/pdlp/pdlp_constants.hpp +++ b/cpp/src/pdlp/pdlp_constants.hpp @@ -7,8 +7,6 @@ #pragma once -#include - #include namespace cuopt::linear_programming::detail { diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 275c119d03..4763391d0e 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -904,19 +904,19 @@ optimization_problem_solution_t run_batch_pdlp( optimization_problem_t& problem, pdlp_solver_settings_t const& settings) { // Hyper parameter than can be changed, I have put what I believe to be 
the best - constexpr bool pdlp_primal_dual_init = true; - constexpr bool primal_weight_init = true; + constexpr bool pdlp_primal_dual_init = true; + constexpr bool primal_weight_init = true; constexpr bool use_initial_pdlp_iterations = true; - bool use_optimal_batch_size = false; - constexpr int batch_iteration_limit = 100000; - constexpr f_t pdlp_tolerance = 1e-5; + bool use_optimal_batch_size = false; + constexpr int batch_iteration_limit = 100000; + constexpr f_t pdlp_tolerance = 1e-5; rmm::cuda_stream_view stream = problem.get_handle_ptr()->get_stream(); rmm::device_uvector initial_primal(0, stream); rmm::device_uvector initial_dual(0, stream); - f_t initial_step_size = std::numeric_limits::signaling_NaN(); - f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); + f_t initial_step_size = std::numeric_limits::signaling_NaN(); + f_t initial_primal_weight = std::numeric_limits::signaling_NaN(); i_t initial_pdlp_iteration = -1; cuopt_assert(settings.new_bounds.size() > 0, "Batch size should be greater than 0"); @@ -927,63 +927,61 @@ optimization_problem_solution_t run_batch_pdlp( const double memory_estimate = batch_pdlp_memory_estimator(problem, max_batch_size); size_t st_free_mem, st_total_mem; RAFT_CUDA_TRY(cudaMemGetInfo(&st_free_mem, &st_total_mem)); - const double free_mem = static_cast(st_free_mem); + const double free_mem = static_cast(st_free_mem); const double total_mem = static_cast(st_total_mem); - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE std::cout << "Memory estimate: " << memory_estimate << std::endl; std::cout << "Free memory: " << free_mem << std::endl; std::cout << "Total memory: " << total_mem << std::endl; - #endif +#endif if (memory_estimate > free_mem) { use_optimal_batch_size = true; // Decrement batch size iteratively until we find a batch size that fits while (memory_max_batch_size > 1) { - const double memory_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size); + const double memory_estimate = 
batch_pdlp_memory_estimator(problem, memory_max_batch_size); if (memory_estimate <= free_mem) { break; } - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE std::cout << "Memory estimate: " << memory_estimate << std::endl; std::cout << "Memory max batch size: " << memory_max_batch_size << std::endl; std::cout << "Free memory: " << free_mem << std::endl; std::cout << "Total memory: " << total_mem << std::endl; std::cout << "--------------------------------" << std::endl; - #endif +#endif memory_max_batch_size--; } - const double min_estimate = - batch_pdlp_memory_estimator(problem, memory_max_batch_size); + const double min_estimate = batch_pdlp_memory_estimator(problem, memory_max_batch_size); if (min_estimate > free_mem) { - return optimization_problem_solution_t( - pdlp_termination_status_t::NumericalError, stream); + return optimization_problem_solution_t(pdlp_termination_status_t::NumericalError, + stream); } } size_t optimal_batch_size = use_optimal_batch_size - ? detail::optimal_batch_size_handler(problem, memory_max_batch_size) - : max_batch_size; - if (settings.sub_batch_size > 0) { - optimal_batch_size = settings.sub_batch_size; - } + ? 
detail::optimal_batch_size_handler(problem, memory_max_batch_size) + : max_batch_size; + if (settings.sub_batch_size > 0) { optimal_batch_size = settings.sub_batch_size; } cuopt_assert(optimal_batch_size != 0 && optimal_batch_size <= max_batch_size, "Optimal batch size should be between 1 and max batch size"); - const bool warm_start_from_settings = - settings.has_initial_primal_solution() || settings.has_initial_dual_solution() || - settings.get_initial_step_size().has_value() || - settings.get_initial_primal_weight().has_value() || - settings.get_initial_pdlp_iteration().has_value(); + const bool warm_start_from_settings = settings.has_initial_primal_solution() || + settings.has_initial_dual_solution() || + settings.get_initial_step_size().has_value() || + settings.get_initial_primal_weight().has_value() || + settings.get_initial_pdlp_iteration().has_value(); if (warm_start_from_settings) { - #ifdef BATCH_VERBOSE_MODE +#ifdef BATCH_VERBOSE_MODE std::cout << "Using warm start from settings" << std::endl; - #endif +#endif if (settings.has_initial_primal_solution() && pdlp_primal_dual_init) { - initial_primal = rmm::device_uvector(settings.get_initial_primal_solution(), settings.get_initial_primal_solution().stream()); + initial_primal = rmm::device_uvector(settings.get_initial_primal_solution(), + settings.get_initial_primal_solution().stream()); } if (settings.has_initial_dual_solution() && pdlp_primal_dual_init) { - initial_dual = rmm::device_uvector(settings.get_initial_dual_solution(), settings.get_initial_dual_solution().stream()); + initial_dual = rmm::device_uvector(settings.get_initial_dual_solution(), + settings.get_initial_dual_solution().stream()); } if (settings.get_initial_step_size().has_value() && pdlp_primal_dual_init) { initial_step_size = *settings.get_initial_step_size(); @@ -998,30 +996,33 @@ optimization_problem_solution_t run_batch_pdlp( // Only used in tests const bool collect_solutions = settings.generate_batch_primal_dual_solution; - - 
rmm::device_uvector full_primal_solution((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); - rmm::device_uvector full_dual_solution((collect_solutions) ? problem.get_n_constraints() * max_batch_size : 0, stream); - rmm::device_uvector full_reduced_cost((collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); + + rmm::device_uvector full_primal_solution( + (collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); + rmm::device_uvector full_dual_solution( + (collect_solutions) ? problem.get_n_constraints() * max_batch_size : 0, stream); + rmm::device_uvector full_reduced_cost( + (collect_solutions) ? problem.get_n_variables() * max_batch_size : 0, stream); std::vector< typename optimization_problem_solution_t::additional_termination_information_t> full_info; std::vector full_status; - pdlp_solver_settings_t batch_settings = settings; - const auto original_new_bounds = batch_settings.new_bounds; - batch_settings.method = cuopt::linear_programming::method_t::PDLP; - batch_settings.presolver = presolver_t::None; - batch_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - batch_settings.detect_infeasibility = false; - batch_settings.iteration_limit = batch_iteration_limit; - batch_settings.inside_mip = true; - batch_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; - batch_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + pdlp_solver_settings_t batch_settings = settings; + const auto original_new_bounds = batch_settings.new_bounds; + batch_settings.method = cuopt::linear_programming::method_t::PDLP; + batch_settings.presolver = presolver_t::None; + batch_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + batch_settings.detect_infeasibility = false; + batch_settings.iteration_limit = batch_iteration_limit; + batch_settings.inside_mip = true; + batch_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_dual_tolerance 
= pdlp_tolerance; batch_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; batch_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; - batch_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; - batch_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; + batch_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + batch_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; if (initial_primal.size() > 0) { batch_settings.set_initial_primal_solution( initial_primal.data(), initial_primal.size(), initial_primal.stream()); @@ -1030,9 +1031,7 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.set_initial_dual_solution( initial_dual.data(), initial_dual.size(), initial_dual.stream()); } - if (!std::isnan(initial_step_size)) { - batch_settings.set_initial_step_size(initial_step_size); - } + if (!std::isnan(initial_step_size)) { batch_settings.set_initial_step_size(initial_step_size); } if (initial_pdlp_iteration != -1) { batch_settings.set_initial_pdlp_iteration(initial_pdlp_iteration); } @@ -1047,26 +1046,24 @@ optimization_problem_solution_t run_batch_pdlp( original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); if (settings.shared_sb_view.is_valid()) { - batch_settings.shared_sb_view = - settings.shared_sb_view.subview(i, current_batch_size); + batch_settings.shared_sb_view = settings.shared_sb_view.subview(i, current_batch_size); } auto sol = solve_lp(problem, batch_settings); - if (collect_solutions) { raft::copy(full_primal_solution.data() + i * problem.get_n_variables(), - sol.get_primal_solution().data(), - sol.get_primal_solution().size(), - stream); + sol.get_primal_solution().data(), + sol.get_primal_solution().size(), + stream); raft::copy(full_dual_solution.data() + i * problem.get_n_constraints(), - sol.get_dual_solution().data(), - sol.get_dual_solution().size(), - stream); + sol.get_dual_solution().data(), + sol.get_dual_solution().size(), + stream); 
raft::copy(full_reduced_cost.data() + i * problem.get_n_variables(), - sol.get_reduced_cost().data(), - sol.get_reduced_cost().size(), - stream); + sol.get_reduced_cost().data(), + sol.get_reduced_cost().size(), + stream); } auto info = sol.get_additional_termination_informations(); full_info.insert(full_info.end(), info.begin(), info.end()); diff --git a/cpp/src/pdlp/solver_settings.cu b/cpp/src/pdlp/solver_settings.cu index 30d5ccaea5..ac2564bb16 100644 --- a/cpp/src/pdlp/solver_settings.cu +++ b/cpp/src/pdlp/solver_settings.cu @@ -61,12 +61,30 @@ void pdlp_solver_settings_t::set_initial_dual_solution(const f_t* init template void pdlp_solver_settings_t::set_initial_step_size(f_t initial_step_size) { + cuopt_expects(initial_step_size > f_t(0), + error_type_t::ValidationError, + "Initial step size must be greater than 0"); + cuopt_expects(!std::isinf(initial_step_size), + error_type_t::ValidationError, + "Initial step size must be finite"); + cuopt_expects(!std::isnan(initial_step_size), + error_type_t::ValidationError, + "Initial step size must be a number"); initial_step_size_ = std::make_optional(initial_step_size); } template void pdlp_solver_settings_t::set_initial_primal_weight(f_t initial_primal_weight) { + cuopt_expects(initial_primal_weight > f_t(0), + error_type_t::ValidationError, + "Initial primal weight must be greater than 0"); + cuopt_expects(!std::isinf(initial_primal_weight), + error_type_t::ValidationError, + "Initial primal weight must be finite"); + cuopt_expects(!std::isnan(initial_primal_weight), + error_type_t::ValidationError, + "Initial primal weight must be a number"); initial_primal_weight_ = std::make_optional(initial_primal_weight); } @@ -351,6 +369,9 @@ std::optional pdlp_solver_settings_t::get_initial_primal_weight() template void pdlp_solver_settings_t::set_initial_pdlp_iteration(i_t initial_pdlp_iteration) { + cuopt_expects(initial_pdlp_iteration >= 0, + error_type_t::ValidationError, + "Initial pdlp iteration must be greater than 
or equal to 0"); initial_pdlp_iteration_ = std::make_optional(initial_pdlp_iteration); } diff --git a/cpp/src/pdlp/termination_strategy/termination_strategy.cu b/cpp/src/pdlp/termination_strategy/termination_strategy.cu index 563850dc0c..167cf33e73 100644 --- a/cpp/src/pdlp/termination_strategy/termination_strategy.cu +++ b/cpp/src/pdlp/termination_strategy/termination_strategy.cu @@ -125,9 +125,10 @@ pdlp_termination_status_t pdlp_termination_strategy_t::get_termination } template -void pdlp_termination_strategy_t::set_termination_status( - i_t id, pdlp_termination_status_t status) +void pdlp_termination_strategy_t::set_termination_status(i_t id, + pdlp_termination_status_t status) { + cuopt_assert(id < termination_status_.size(), "id too big for batch size"); termination_status_[id] = (i_t)status; } diff --git a/cpp/src/pdlp/utilities/ping_pong_graph.cu b/cpp/src/pdlp/utilities/ping_pong_graph.cu index 4ec5bff8c1..0df3861b5a 100644 --- a/cpp/src/pdlp/utilities/ping_pong_graph.cu +++ b/cpp/src/pdlp/utilities/ping_pong_graph.cu @@ -8,6 +8,7 @@ #include #include +#include #include diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index be91e96015..ef43b1a591 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -1680,11 +1680,11 @@ TEST(pdlp_class, strong_branching_test) const std::vector fractional = {1, 2, 4}; const std::vector root_soln_x = {0.891, 0.109, 0.636429}; - auto solver_settings = pdlp_solver_settings_t{}; - solver_settings.method = cuopt::linear_programming::method_t::PDLP; - solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - solver_settings.presolver = cuopt::linear_programming::presolver_t::None; - solver_settings.generate_batch_primal_dual_solution = true; + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = 
pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.generate_batch_primal_dual_solution = true; const int n_fractional = fractional.size(); const int batch_size = n_fractional * 2; @@ -2170,11 +2170,11 @@ TEST(pdlp_class, shared_sb_view_subbatch) const int n_fractional = fractional.size(); const int batch_size = n_fractional * 2; - auto solver_settings = pdlp_solver_settings_t{}; - solver_settings.method = cuopt::linear_programming::method_t::PDLP; - solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - solver_settings.presolver = cuopt::linear_programming::presolver_t::None; - solver_settings.sub_batch_size = 2; + auto solver_settings = pdlp_solver_settings_t{}; + solver_settings.method = cuopt::linear_programming::method_t::PDLP; + solver_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + solver_settings.presolver = cuopt::linear_programming::presolver_t::None; + solver_settings.sub_batch_size = 2; shared_strong_branching_context_t ctx(batch_size); @@ -2227,9 +2227,7 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) solver_settings.iteration_limit = 1000000; for (int i = 0; i < n_fractional; ++i) - solver_settings.new_bounds.push_back({fractional[0], - -5, - -5}); + solver_settings.new_bounds.push_back({fractional[0], -5, -5}); for (int i = 0; i < n_fractional; ++i) solver_settings.new_bounds.push_back({fractional[i], @@ -2266,7 +2264,9 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) // Each entry should be either Optimal (PDLP solved it first) or ConcurrentLimit (DS marked it) EXPECT_TRUE(status == pdlp_termination_status_t::Optimal || status == pdlp_termination_status_t::ConcurrentLimit) - << "Entry " << i << " has unexpected status " << cuopt::linear_programming::optimization_problem_solution_t::get_termination_status_string(status); + << "Entry " << i << " has unexpected status " + << cuopt::linear_programming::optimization_problem_solution_t:: + 
get_termination_status_string(status); } // All entries should end up marked solved @@ -2298,9 +2298,7 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) solver_settings.iteration_limit = 1000000; for (int i = 0; i < n_fractional; ++i) - solver_settings.new_bounds.push_back({fractional[0], - -5, - -5}); + solver_settings.new_bounds.push_back({fractional[0], -5, -5}); shared_strong_branching_context_t ctx(batch_size); @@ -2331,7 +2329,9 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) auto status = solution.get_termination_status(i); // Each entry should be either Optimal (PDLP solved it first) or ConcurrentLimit (DS marked it) EXPECT_TRUE(status == pdlp_termination_status_t::ConcurrentLimit) - << "Entry " << i << " has unexpected status " << cuopt::linear_programming::optimization_problem_solution_t::get_termination_status_string(status); + << "Entry " << i << " has unexpected status " + << cuopt::linear_programming::optimization_problem_solution_t:: + get_termination_status_string(status); } // All entries should end up marked solved diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index 32cf860f28..ddc38539f5 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -451,13 +451,13 @@ class SolverConfig(BaseModel): "heuristics and branch and bound for MILP", ) mip_batch_pdlp_strong_branching: Optional[int] = Field( - default=0, + default=1, description="Strong branching mode: 0 = Dual Simplex only, " "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", ) mip_batch_pdlp_reliability_branching: Optional[int] = Field( - default=0, + default=1, description="Reliability branching mode: 0 = Dual Simplex only, " "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", From 
962c2eabff9704b1de9c85cf144ad3e8d39a0547 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 13:34:10 +0200 Subject: [PATCH 36/43] fix: disable batch pdlp if deterministic mode --- cpp/src/branch_and_bound/pseudo_costs.cpp | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 0dbc4764f5..6287150e86 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -442,7 +442,14 @@ void strong_branching(const lp_problem_t& original_lp, const f_t elapsed_time = toc(start_time); if (elapsed_time > settings.time_limit) { return; } - const i_t effective_batch_pdlp = settings.sub_mip ? 0 : settings.mip_batch_pdlp_strong_branching; + const i_t effective_batch_pdlp = + (settings.sub_mip || settings.deterministic) ? 0 : settings.mip_batch_pdlp_strong_branching; + + if (settings.mip_batch_pdlp_strong_branching != 0 && + (settings.sub_mip || settings.deterministic)) { + settings.log.printf( + "Batch PDLP strong branching is disabled because sub-MIP or deterministic mode is enabled\n"); + } settings.log.printf("Strong branching using %d threads and %ld fractional variables\n", settings.num_threads, @@ -991,13 +998,19 @@ i_t pseudo_costs_t::reliable_variable_selection( // using batch PDLP constexpr i_t min_num_candidates_for_pdlp = 5; constexpr f_t min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; - const bool use_pdlp = (rb_mode != 0) && pdlp_warm_cache.populated && + const bool use_pdlp = (rb_mode != 0) && !settings.sub_mip && !settings.deterministic && + pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; if (rb_mode != 0 && !pdlp_warm_cache.populated) { log.printf("PDLP warm start data not populated, using DS only\n"); + } else if (rb_mode != 0 && 
settings.sub_mip) { + log.printf("Batch PDLP reliability branching is disabled because sub-MIP is enabled\n"); + } else if (rb_mode != 0 && settings.deterministic) { + log.printf( + "Batch PDLP reliability branching is disabled because deterministic mode is enabled\n"); } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { log.printf("Not enough candidates to use batch PDLP, using DS only\n"); } else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { From 496c4fd91bc355101588aded0bf56e6a96ace4bb Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 13:40:20 +0200 Subject: [PATCH 37/43] fix: add size assertion to shared strong branching context --- .../shared_strong_branching_context.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp index 6840ccbb77..60982d9344 100644 --- a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp +++ b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp @@ -34,12 +34,21 @@ struct shared_strong_branching_context_view_t { bool is_valid() const { return !solved.empty(); } - bool is_solved(i_t local_idx) const { return solved[local_idx].load() != 0; } + bool is_solved(i_t local_idx) const + { + assert(local_idx < solved.size() && "local_idx out of bounds"); + return solved[local_idx].load() != 0; + } - void mark_solved(i_t local_idx) const { solved[local_idx].store(1); } + void mark_solved(i_t local_idx) const + { + assert(local_idx < solved.size() && "local_idx out of bounds"); + solved[local_idx].store(1); + } shared_strong_branching_context_view_t subview(i_t offset, i_t count) const { + assert(offset + count <= solved.size() && "subview out of bounds"); return {solved.subspan(offset, count)}; } }; From 9ec3f40576e942bd58e20dd49ebff84c189c400e Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: 
Mon, 30 Mar 2026 14:34:44 +0200 Subject: [PATCH 38/43] cleanup names --- cpp/src/branch_and_bound/pseudo_costs.cpp | 79 ++++++++++++----------- cpp/src/branch_and_bound/pseudo_costs.hpp | 2 +- 2 files changed, 44 insertions(+), 37 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 6287150e86..a04dd6a1f5 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -27,7 +27,7 @@ namespace cuopt::linear_programming::dual_simplex { namespace { -static bool ds_is_valid_done(dual::status_t status) +static bool is_dual_simplex_done(dual::status_t status) { return status == dual::status_t::DUAL_UNBOUNDED || status == dual::status_t::OPTIMAL || status == dual::status_t::ITERATION_LIMIT || status == dual::status_t::CUTOFF; @@ -158,8 +158,8 @@ void strong_branch_helper(i_t start, // Mark the subproblem as solved so that batch PDLP removes it from the batch if (sb_view.is_valid()) { // We could not mark as solved nodes hitting iteartion limit in DS - if ((branch == 0 && ds_is_valid_done(ds_status_down[k])) || - (branch == 1 && ds_is_valid_done(ds_status_up[k]))) { + if ((branch == 0 && is_dual_simplex_done(ds_status_down[k])) || + (branch == 1 && is_dual_simplex_done(ds_status_up[k]))) { sb_view.mark_solved(shared_idx); settings.log.printf( "[COOP SB] DS thread %d solved variable %d branch %s (shared_idx %d), marking in " @@ -381,6 +381,8 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data return mps_model; } +enum class sb_source_t { DUAL_SIMPLEX, PDLP, NONE }; + // Merge a single strong branching result from Dual Simplex and PDLP. // Rules: // 1. If both found optimal -> keep DS (higher quality vertex solution) @@ -388,35 +390,40 @@ static cuopt::mps_parser::mps_data_model_t simplex_problem_to_mps_data // 3. Else if one is optimal -> keep the optimal one // 4. Else if Dual Simplex hit iteration limit -> keep DS // 5. 
Else if none converged -> NaN (original objective) -// Return {value, source} where source is 0 if Dual Simplex, 1 if PDLP, 2 if both template -static std::pair merge_sb_result(f_t ds_val, - dual::status_t ds_status, - f_t pdlp_dual_obj, - bool pdlp_optimal) +static std::pair merge_sb_result(f_t ds_val, + dual::status_t ds_status, + f_t pdlp_dual_obj, + bool pdlp_optimal) { // Dual simplex always maintains dual feasibility, so OPTIMAL and ITERATION_LIMIT both qualify // Rule 1: Both optimal -> keep DS - if (ds_status == dual::status_t::OPTIMAL && pdlp_optimal) { return {ds_val, 0}; } + if (ds_status == dual::status_t::OPTIMAL && pdlp_optimal) { + return {ds_val, sb_source_t::DUAL_SIMPLEX}; + } // Rule 2: Dual Simplex found infeasible -> declare infeasible if (ds_status == dual::status_t::DUAL_UNBOUNDED) { - return {std::numeric_limits::infinity(), 0}; + return {std::numeric_limits::infinity(), sb_source_t::DUAL_SIMPLEX}; } // Rule 3: Only one converged -> keep that - if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { return {ds_val, 0}; } - if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { return {pdlp_dual_obj, 1}; } + if (ds_status == dual::status_t::OPTIMAL && !pdlp_optimal) { + return {ds_val, sb_source_t::DUAL_SIMPLEX}; + } + if (pdlp_optimal && ds_status != dual::status_t::OPTIMAL) { + return {pdlp_dual_obj, sb_source_t::PDLP}; + } // Rule 4: Dual Simplex hit iteration limit or work limit or cutoff -> keep DS if (ds_status == dual::status_t::ITERATION_LIMIT || ds_status == dual::status_t::WORK_LIMIT || ds_status == dual::status_t::CUTOFF) { - return {ds_val, 0}; + return {ds_val, sb_source_t::DUAL_SIMPLEX}; } // Rule 5: None converged -> NaN - return {std::numeric_limits::quiet_NaN(), 2}; + return {std::numeric_limits::quiet_NaN(), sb_source_t::NONE}; } template @@ -766,9 +773,9 @@ void strong_branching(const lp_problem_t& original_lp, const auto [value_down, source_down] = merge_sb_result(ds_obj_down[k], ds_status_down[k], 
pdlp_obj_down[k], pdlp_has_down); pc.strong_branch_down[k] = value_down; - if (source_down == 0) + if (source_down == sb_source_t::DUAL_SIMPLEX) merged_from_ds++; - else if (source_down == 1) + else if (source_down == sb_source_t::PDLP) merged_from_pdlp++; else merged_nan++; @@ -779,7 +786,7 @@ void strong_branching(const lp_problem_t& original_lp, fractional[k], ds_obj_down[k], pdlp_obj_down[k], - source_down == 0 ? "DS" : "PDLP"); + source_down == sb_source_t::DUAL_SIMPLEX ? "DS" : "PDLP"); } bool ds_has_up = ds_status_up[k] != dual::status_t::UNSET; @@ -787,9 +794,9 @@ void strong_branching(const lp_problem_t& original_lp, const auto [value_up, source_up] = merge_sb_result(ds_obj_up[k], ds_status_up[k], pdlp_obj_up[k], pdlp_has_up); pc.strong_branch_up[k] = value_up; - if (source_up == 0) + if (source_up == sb_source_t::DUAL_SIMPLEX) merged_from_ds++; - else if (source_up == 1) + else if (source_up == sb_source_t::PDLP) merged_from_pdlp++; else merged_nan++; @@ -800,16 +807,16 @@ void strong_branching(const lp_problem_t& original_lp, fractional[k], ds_obj_up[k], pdlp_obj_up[k], - source_up == 0 ? "DS" : "PDLP"); + source_up == sb_source_t::DUAL_SIMPLEX ? "DS" : "PDLP"); } } if (effective_batch_pdlp != 0) { - pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root = + pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root = (f_t(merged_from_pdlp) / f_t(fractional.size() * 2)) * 100.0; settings.log.printf( - "Batch PDLP only for strong branching. Pourcent solved by batch PDLP at root: %f\n", - pc.pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root); + "Batch PDLP only for strong branching. 
percent solved by batch PDLP at root: %f\n", + pc.pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root); settings.log.printf( "Merged results: %d from DS, %d from PDLP, %d unresolved (NaN), %d/%d solved by both " "(down/up)\n", @@ -996,13 +1003,13 @@ i_t pseudo_costs_t::reliable_variable_selection( // It is also off if the number of candidate is very small // If warm start could run but almost none of the BPDLP results were used, we also want to avoid // using batch PDLP - constexpr i_t min_num_candidates_for_pdlp = 5; - constexpr f_t min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; + constexpr i_t min_num_candidates_for_pdlp = 5; + constexpr f_t min_percent_solved_by_batch_pdlp_at_root_for_pdlp = 5.0; const bool use_pdlp = (rb_mode != 0) && !settings.sub_mip && !settings.deterministic && pdlp_warm_cache.populated && unreliable_list.size() > min_num_candidates_for_pdlp && - pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root > - min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp; + pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root > + min_percent_solved_by_batch_pdlp_at_root_for_pdlp; if (rb_mode != 0 && !pdlp_warm_cache.populated) { log.printf("PDLP warm start data not populated, using DS only\n"); @@ -1013,16 +1020,16 @@ i_t pseudo_costs_t::reliable_variable_selection( "Batch PDLP reliability branching is disabled because deterministic mode is enabled\n"); } else if (rb_mode != 0 && unreliable_list.size() < min_num_candidates_for_pdlp) { log.printf("Not enough candidates to use batch PDLP, using DS only\n"); - } else if (rb_mode != 0 && pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root < 5.0) { - log.printf("Pourcent solved by batch PDLP at root is too low, using DS only\n"); + } else if (rb_mode != 0 && pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root < 5.0) { + log.printf("Percent solved by batch PDLP at root is too low, using DS only\n"); } else if (use_pdlp) { log.printf( - "Using batch PDLP because populated, unreliable list size is %d 
(> %d), and pourcent solved " + "Using batch PDLP because populated, unreliable list size is %d (> %d), and percent solved " "by batch PDLP at root is %f%% (> %f%%)\n", static_cast(unreliable_list.size()), min_num_candidates_for_pdlp, - pdlp_warm_cache.pourcent_solved_by_batch_pdlp_at_root, - min_pourcent_solved_by_batch_pdlp_at_root_for_pdlp); + pdlp_warm_cache.percent_solved_by_batch_pdlp_at_root, + min_percent_solved_by_batch_pdlp_at_root_for_pdlp); } const int num_tasks = std::max(max_num_tasks, 1); @@ -1216,7 +1223,7 @@ i_t pseudo_costs_t::reliable_variable_selection( pseudo_cost_sum_down[j] += change_in_obj / change_in_x; pseudo_cost_num_down[j]++; // Should be valid if were are already here - if (rb_mode == 1 && ds_is_valid_done(status)) { sb_view.mark_solved(i); } + if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(i); } } } pseudo_cost_mutex_down[j].unlock(); @@ -1259,7 +1266,7 @@ i_t pseudo_costs_t::reliable_variable_selection( pseudo_cost_sum_up[j] += change_in_obj / change_in_x; pseudo_cost_num_up[j]++; // Should be valid if were are already here - if (rb_mode == 1 && ds_is_valid_done(status)) { sb_view.mark_solved(shared_idx); } + if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(shared_idx); } } } pseudo_cost_mutex_up[j].unlock(); @@ -1314,7 +1321,7 @@ i_t pseudo_costs_t::reliable_variable_selection( merge_sb_result(ds_obj_down[i], ds_status_down[i], pdlp_obj_down[i], true); // PDLP won the merge, update the pseudo-cost only if node is still unreliable (concurrent // calls may have made it reliable) - if (source == 1) { + if (source == sb_source_t::PDLP) { pseudo_cost_mutex_down[j].lock(); if (pseudo_cost_num_down[j] < reliable_threshold) { f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); @@ -1334,7 +1341,7 @@ i_t pseudo_costs_t::reliable_variable_selection( merge_sb_result(ds_obj_up[i], ds_status_up[i], pdlp_obj_up[i], true); // PDLP won the merge, update the pseudo-cost only if node 
is still unreliable (concurrent // calls may have made it reliable) - if (source == 1) { + if (source == sb_source_t::PDLP) { pseudo_cost_mutex_up[j].lock(); if (pseudo_cost_num_up[j] < reliable_threshold) { f_t change_in_obj = std::max(merged_obj - node_ptr->lower_bound, eps); diff --git a/cpp/src/branch_and_bound/pseudo_costs.hpp b/cpp/src/branch_and_bound/pseudo_costs.hpp index be8f9f71d4..322daa8907 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.hpp +++ b/cpp/src/branch_and_bound/pseudo_costs.hpp @@ -416,7 +416,7 @@ struct batch_pdlp_warm_cache_t { f_t step_size{std::numeric_limits::signaling_NaN()}; f_t primal_weight{std::numeric_limits::signaling_NaN()}; i_t pdlp_iteration{-1}; - f_t pourcent_solved_by_batch_pdlp_at_root{f_t(0.0)}; + f_t percent_solved_by_batch_pdlp_at_root{f_t(0.0)}; bool populated{false}; }; From 16e4e5fbec08770973c8cb24122e6b81be30053f Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 18:00:00 +0200 Subject: [PATCH 39/43] multiple fixes: use span only in solver, use tasks to launch bpdlp rather than a thread, put both bpdlp calls in functions --- .../pdlp/solver_settings.hpp | 7 +- cpp/src/branch_and_bound/pseudo_costs.cpp | 623 ++++++++++-------- .../shared_strong_branching_context.hpp | 10 +- cpp/src/pdlp/pdlp.cu | 16 +- cpp/src/pdlp/pdlp.cuh | 3 + cpp/src/pdlp/solve.cu | 4 +- cpp/tests/linear_programming/pdlp_test.cu | 49 +- 7 files changed, 381 insertions(+), 331 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp index ded180fdf3..6abefb2d5d 100644 --- a/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/pdlp/solver_settings.hpp @@ -17,8 +17,7 @@ #include #include - -#include +#include namespace cuopt::linear_programming { @@ -275,8 +274,8 @@ class pdlp_solver_settings_t { bool inside_mip{false}; // For concurrent termination std::atomic* concurrent_halt{nullptr};
- // Shared strong branching context view for cooperative DS + PDLP - dual_simplex::shared_strong_branching_context_view_t shared_sb_view; + // Shared strong branching solved flags for cooperative DS + PDLP + std::span> shared_sb_solved; static constexpr f_t minimal_absolute_tolerance = 1.0e-12; pdlp_hyper_params::pdlp_hyper_params_t hyper_params; // Holds the information of new variable lower and upper bounds for each climber in the format: diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index a04dd6a1f5..5bcd819ba5 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -157,7 +157,7 @@ void strong_branch_helper(i_t start, } // Mark the subproblem as solved so that batch PDLP removes it from the batch if (sb_view.is_valid()) { - // We could not mark as solved nodes hitting iteartion limit in DS + // We could not mark as solved nodes hitting iteration limit in DS if ((branch == 0 && is_dual_simplex_done(ds_status_down[k])) || (branch == 1 && is_dual_simplex_done(ds_status_up[k]))) { sb_view.mark_solved(shared_idx); @@ -426,6 +426,299 @@ static std::pair merge_sb_result(f_t ds_val, return {std::numeric_limits::quiet_NaN(), sb_source_t::NONE}; } +template +static void batch_pdlp_strong_branching_task( + const simplex_solver_settings_t& settings, + i_t effective_batch_pdlp, + f_t start_time, + std::atomic& concurrent_halt, + const lp_problem_t& original_lp, + const std::vector& new_slacks, + const std::vector& root_soln, + const std::vector& fractional, + f_t root_obj, + pseudo_costs_t& pc, + shared_strong_branching_context_view_t& sb_view, + std::vector& pdlp_obj_down, + std::vector& pdlp_obj_up) +{ + settings.log.printf(effective_batch_pdlp == 2 + ? 
"Batch PDLP only for strong branching\n" + : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); + + f_t start_batch = tic(); + std::vector original_root_soln_x; + + if (concurrent_halt.load() == 1) { return; } + + const auto mps_model = + simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); + + std::vector fraction_values; + + std::vector original_root_soln_y, original_root_soln_z; + // TODO put back later once Chris has this part + /*uncrush_dual_solution( + original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, + original_root_soln_z);*/ + + for (i_t k = 0; k < fractional.size(); k++) { + const i_t j = fractional[k]; + fraction_values.push_back(original_root_soln_x[j]); + } + + if (concurrent_halt.load() == 1) { return; } + + f_t batch_elapsed_time = toc(start_time); + const f_t warm_start_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (warm_start_remaining_time <= 0.0) { return; } + + assert(!pc.pdlp_warm_cache.populated && "PDLP warm cache should not be populated at this point"); + + if (!pc.pdlp_warm_cache.populated) { + pdlp_solver_settings_t ws_settings; + ws_settings.method = method_t::PDLP; + ws_settings.presolver = presolver_t::None; + ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; + ws_settings.detect_infeasibility = false; + // Since the warm start will be used over and over again we want to maximize the chance of + // convergeance Batch PDLP is very compute intensive so we want to minimize the number of + // iterations + constexpr int warm_start_iteration_limit = 500000; + ws_settings.iteration_limit = warm_start_iteration_limit; + ws_settings.time_limit = warm_start_remaining_time; + constexpr f_t pdlp_tolerance = 1e-5; + ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; + ws_settings.tolerances.relative_primal_tolerance = 
pdlp_tolerance; + ws_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; + ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; + ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; + ws_settings.inside_mip = true; + if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } + +#ifdef BATCH_VERBOSE_MODE + auto start_time = std::chrono::high_resolution_clock::now(); +#endif + + auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); + +#ifdef BATCH_VERBOSE_MODE + auto end_time = std::chrono::high_resolution_clock::now(); + auto duration = + std::chrono::duration_cast(end_time - start_time).count(); + std::cout << "Original problem solved in " << duration << " milliseconds" + << " and iterations: " + << ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; +#endif + + if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { + auto& cache = pc.pdlp_warm_cache; + const auto& ws_primal = ws_solution.get_primal_solution(); + const auto& ws_dual = ws_solution.get_dual_solution(); + // Need to use the pc steam since the batch pdlp handle will get destroyed after the warm + // start + cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); + cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); + cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; + cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; + cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; + cache.populated = true; + + settings.log.printf( + "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", + cache.initial_primal.size(), + cache.initial_dual.size(), + cache.step_size, + cache.primal_weight, + cache.pdlp_iteration); + } else { + settings.log.printf( + "PDLP warm start solve did not reach optimality (%s), 
skipping cache and batch PDLP\n", + ws_solution.get_termination_status_string().c_str()); + return; + } + } + + if (concurrent_halt.load() == 1) { return; } + + pdlp_solver_settings_t pdlp_settings; + if (effective_batch_pdlp == 1) { + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_solved = sb_view.solved; + } + + batch_elapsed_time = toc(start_time); + const f_t batch_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (batch_remaining_time <= 0.0) { return; } + pdlp_settings.time_limit = batch_remaining_time; + + if (pc.pdlp_warm_cache.populated) { + auto& cache = pc.pdlp_warm_cache; + pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), + cache.initial_primal.size(), + cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + } + + if (concurrent_halt.load() == 1) { return; } + + const auto solutions = batch_pdlp_solve( + &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); + f_t batch_pdlp_strong_branching_time = toc(start_batch); + + // Fail safe in case the batch PDLP failed and produced no solutions + if (solutions.get_additional_termination_informations().size() != fractional.size() * 2) { + settings.log.printf("Batch PDLP failed and produced no solutions\n"); + return; + } + + // Find max iteration on how many are done accross the batch + i_t max_iterations = 0; + i_t amount_done = 0; + for (i_t k = 0; k < solutions.get_additional_termination_informations().size(); k++) { + max_iterations = std::max( + max_iterations, solutions.get_additional_termination_information(k).number_of_steps_taken); + // TODO batch mode infeasible: 
should also count as done if infeasible + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + amount_done++; + } + } + + settings.log.printf( + "Batch PDLP strong branching completed in %.2fs. Solved %d/%d with max %d iterations\n", + batch_pdlp_strong_branching_time, + amount_done, + fractional.size() * 2, + max_iterations); + + for (i_t k = 0; k < fractional.size(); k++) { + f_t obj_down = (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) + ? solutions.get_dual_objective_value(k) + : std::numeric_limits::quiet_NaN(); + + f_t obj_up = (solutions.get_termination_status(k + fractional.size()) == + pdlp_termination_status_t::Optimal) + ? solutions.get_dual_objective_value(k + fractional.size()) + : std::numeric_limits::quiet_NaN(); + + pdlp_obj_down[k] = std::max(obj_down - root_obj, f_t(0.0)); + pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); + } +} + +template +static void batch_pdlp_reliability_branching_task( + logger_t& log, + i_t rb_mode, + i_t num_candidates, + f_t start_time, + std::atomic& concurrent_halt, + const lp_problem_t& original_lp, + const std::vector& new_slacks, + const std::vector& solution, + branch_and_bound_worker_t* worker, + const std::vector& candidate_vars, + const simplex_solver_settings_t& settings, + shared_strong_branching_context_view_t& sb_view, + batch_pdlp_warm_cache_t& pdlp_warm_cache, + std::vector& pdlp_obj_down, + std::vector& pdlp_obj_up) +{ + log.printf(rb_mode == 2 ? 
"RB batch PDLP only for %d candidates\n" + : "RB cooperative batch PDLP and DS for %d candidates\n", + num_candidates); + + f_t start_batch = tic(); + + std::vector original_soln_x; + + if (concurrent_halt.load() == 1) { return; } + + auto mps_model = + simplex_problem_to_mps_data_model(original_lp, new_slacks, solution, original_soln_x); + { + const i_t n_orig = original_lp.num_cols - new_slacks.size(); + for (i_t j = 0; j < n_orig; j++) { + mps_model.variable_lower_bounds_[j] = worker->leaf_problem.lower[j]; + mps_model.variable_upper_bounds_[j] = worker->leaf_problem.upper[j]; + } + } + + std::vector fraction_values; + fraction_values.reserve(num_candidates); + for (i_t j : candidate_vars) { + fraction_values.push_back(original_soln_x[j]); + } + + if (concurrent_halt.load() == 1) { return; } + + const f_t batch_elapsed_time = toc(start_time); + const f_t batch_remaining_time = + std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); + if (batch_remaining_time <= 0.0) { return; } + + pdlp_solver_settings_t pdlp_settings; + if (rb_mode == 1) { + pdlp_settings.concurrent_halt = &concurrent_halt; + pdlp_settings.shared_sb_solved = sb_view.solved; + } + pdlp_settings.time_limit = batch_remaining_time; + + if (pdlp_warm_cache.populated) { + auto& cache = pdlp_warm_cache; + pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), + cache.initial_primal.size(), + cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_dual_solution( + cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_step_size(cache.step_size); + pdlp_settings.set_initial_primal_weight(cache.primal_weight); + pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); + } + + if (concurrent_halt.load() == 1) { return; } + + const auto solutions = batch_pdlp_solve( + &pdlp_warm_cache.batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); + + f_t batch_pdlp_time = 
toc(start_batch); + + if (solutions.get_additional_termination_informations().size() != + static_cast(num_candidates) * 2) { + log.printf("RB batch PDLP failed and produced no solutions\n"); + return; + } + + i_t amount_done = 0; + for (i_t k = 0; k < num_candidates * 2; k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + amount_done++; + } + } + + log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", + batch_pdlp_time, + amount_done, + num_candidates * 2); + + for (i_t k = 0; k < num_candidates; k++) { + if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { + pdlp_obj_down[k] = solutions.get_dual_objective_value(k); + } + if (solutions.get_termination_status(k + num_candidates) == + pdlp_termination_status_t::Optimal) { + pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_candidates); + } + } +} + template void strong_branching(const lp_problem_t& original_lp, const simplex_solver_settings_t& settings, @@ -450,7 +743,9 @@ void strong_branching(const lp_problem_t& original_lp, if (elapsed_time > settings.time_limit) { return; } const i_t effective_batch_pdlp = - (settings.sub_mip || settings.deterministic) ? 0 : settings.mip_batch_pdlp_strong_branching; + (settings.sub_mip || (settings.deterministic && settings.mip_batch_pdlp_strong_branching == 1)) + ? 
0 + : settings.mip_batch_pdlp_strong_branching; if (settings.mip_batch_pdlp_strong_branching != 0 && (settings.sub_mip || settings.deterministic)) { @@ -464,200 +759,29 @@ void strong_branching(const lp_problem_t& original_lp, // Cooperative DS + PDLP: shared context tracks which subproblems are solved shared_strong_branching_context_t shared_ctx(2 * fractional.size()); - shared_strong_branching_context_view_t sb_view(std::span(shared_ctx.solved)); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); std::atomic concurrent_halt{0}; std::vector pdlp_obj_down(fractional.size(), std::numeric_limits::quiet_NaN()); std::vector pdlp_obj_up(fractional.size(), std::numeric_limits::quiet_NaN()); - auto pdlp_thread = std::thread([&]() { - if (effective_batch_pdlp == 0) return; - - settings.log.printf(effective_batch_pdlp == 2 - ? "Batch PDLP only for strong branching\n" - : "Cooperative batch PDLP and Dual Simplex for strong branching\n"); - - f_t start_batch = tic(); - std::vector original_root_soln_x; - - if (concurrent_halt.load() == 1) { return; } - - const auto mps_model = - simplex_problem_to_mps_data_model(original_lp, new_slacks, root_soln, original_root_soln_x); - - std::vector fraction_values; - - std::vector original_root_soln_y, original_root_soln_z; - // TODO put back later once Chris has this part - /*uncrush_dual_solution( - original_problem, original_lp, root_soln_y, root_soln_z, original_root_soln_y, - original_root_soln_z);*/ - - for (i_t k = 0; k < fractional.size(); k++) { - const i_t j = fractional[k]; - fraction_values.push_back(original_root_soln_x[j]); - } - - if (concurrent_halt.load() == 1) { return; } - - f_t batch_elapsed_time = toc(start_time); - const f_t warm_start_remaining_time = - std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); - if (warm_start_remaining_time <= 0.0) { return; } - - assert(!pc.pdlp_warm_cache.populated && - "PDLP warm cache should not be populated at this point"); - - if 
(!pc.pdlp_warm_cache.populated) { - pdlp_solver_settings_t ws_settings; - ws_settings.method = method_t::PDLP; - ws_settings.presolver = presolver_t::None; - ws_settings.pdlp_solver_mode = pdlp_solver_mode_t::Stable3; - ws_settings.detect_infeasibility = false; - // Since the warm start will be used over and over again we want to maximize the chance of - // convergeance Batch PDLP is very compute intensive so we want to minimize the number of - // iterations - constexpr int warm_start_iteration_limit = 500000; - ws_settings.iteration_limit = warm_start_iteration_limit; - ws_settings.time_limit = warm_start_remaining_time; - constexpr f_t pdlp_tolerance = 1e-5; - ws_settings.tolerances.relative_dual_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_dual_tolerance = pdlp_tolerance; - ws_settings.tolerances.relative_primal_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_primal_tolerance = pdlp_tolerance; - ws_settings.tolerances.relative_gap_tolerance = pdlp_tolerance; - ws_settings.tolerances.absolute_gap_tolerance = pdlp_tolerance; - ws_settings.inside_mip = true; - if (effective_batch_pdlp == 1) { ws_settings.concurrent_halt = &concurrent_halt; } - -#ifdef BATCH_VERBOSE_MODE - auto start_time = std::chrono::high_resolution_clock::now(); -#endif - - auto ws_solution = solve_lp(&pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, ws_settings); - -#ifdef BATCH_VERBOSE_MODE - auto end_time = std::chrono::high_resolution_clock::now(); - auto duration = - std::chrono::duration_cast(end_time - start_time).count(); - std::cout << "Original problem solved in " << duration << " milliseconds" - << " and iterations: " - << ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_ << std::endl; -#endif - - if (ws_solution.get_termination_status() == pdlp_termination_status_t::Optimal) { - auto& cache = pc.pdlp_warm_cache; - const auto& ws_primal = ws_solution.get_primal_solution(); - const auto& ws_dual = ws_solution.get_dual_solution(); - // Need to 
use the pc steam since the batch pdlp handle will get destroyed after the warm - // start - cache.initial_primal = rmm::device_uvector(ws_primal, ws_primal.stream()); - cache.initial_dual = rmm::device_uvector(ws_dual, ws_dual.stream()); - cache.step_size = ws_solution.get_pdlp_warm_start_data().initial_step_size_; - cache.primal_weight = ws_solution.get_pdlp_warm_start_data().initial_primal_weight_; - cache.pdlp_iteration = ws_solution.get_pdlp_warm_start_data().total_pdlp_iterations_; - cache.populated = true; - - settings.log.printf( - "Cached PDLP warm start: primal=%zu dual=%zu step_size=%e primal_weight=%e iters=%d\n", - cache.initial_primal.size(), - cache.initial_dual.size(), - cache.step_size, - cache.primal_weight, - cache.pdlp_iteration); - } else { - settings.log.printf( - "PDLP warm start solve did not reach optimality (%s), skipping cache and batch PDLP\n", - ws_solution.get_termination_status_string().c_str()); - return; - } - } - - if (concurrent_halt.load() == 1) { return; } - - pdlp_solver_settings_t pdlp_settings; - if (effective_batch_pdlp == 1) { - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.shared_sb_view = sb_view; - } - - batch_elapsed_time = toc(start_time); - const f_t batch_remaining_time = - std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); - if (batch_remaining_time <= 0.0) { return; } - pdlp_settings.time_limit = batch_remaining_time; - - if (pc.pdlp_warm_cache.populated) { - auto& cache = pc.pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); - } - - if 
(concurrent_halt.load() == 1) { return; } - - const auto solutions = batch_pdlp_solve( - &pc.pdlp_warm_cache.batch_pdlp_handle, mps_model, fractional, fraction_values, pdlp_settings); - f_t batch_pdlp_strong_branching_time = toc(start_batch); - - // Fail safe in case the batch PDLP failed and produced no solutions - if (solutions.get_additional_termination_informations().size() != fractional.size() * 2) { - settings.log.printf("Batch PDLP failed and produced no solutions\n"); - return; - } - - // Find max iteration on how many are done accross the batch - i_t max_iterations = 0; - i_t amount_done = 0; - for (i_t k = 0; k < solutions.get_additional_termination_informations().size(); k++) { - max_iterations = std::max( - max_iterations, solutions.get_additional_termination_information(k).number_of_steps_taken); - // TODO batch mode infeasible: should also count as done if infeasible - if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { - amount_done++; - } - } - - settings.log.printf( - "Batch PDLP strong branching completed in %.2fs. Solved %d/%d with max %d iterations\n", - batch_pdlp_strong_branching_time, - amount_done, - fractional.size() * 2, - max_iterations); - - for (i_t k = 0; k < fractional.size(); k++) { - // Call BatchLP solver. Solve 2*fractional.size() subproblems. - // Let j = fractional[k]. We want to solve the two trial branching problems - // Branch down: - // minimize c^T x - // subject to lb <= A*x <= ub - // x_j <= floor(root_soln[j]) - // l <= x < u - // Let the optimal objective value of thie problem be obj_down - f_t obj_down = (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) - ? 
solutions.get_dual_objective_value(k) - : std::numeric_limits::quiet_NaN(); - - // Branch up: - // minimize c^T x - // subject to lb <= A*x <= ub - // x_j >= ceil(root_soln[j]) - // Let the optimal objective value of thie problem be obj_up - f_t obj_up = (solutions.get_termination_status(k + fractional.size()) == - pdlp_termination_status_t::Optimal) - ? solutions.get_dual_objective_value(k + fractional.size()) - : std::numeric_limits::quiet_NaN(); - - pdlp_obj_down[k] = std::max(obj_down - root_obj, f_t(0.0)); - pdlp_obj_up[k] = std::max(obj_up - root_obj, f_t(0.0)); - } - }); + if (effective_batch_pdlp != 0) { +#pragma omp task default(shared) + batch_pdlp_strong_branching_task(settings, + effective_batch_pdlp, + start_time, + concurrent_halt, + original_lp, + new_slacks, + root_soln, + fractional, + root_obj, + pc, + sb_view, + pdlp_obj_down, + pdlp_obj_up); + } std::vector ds_status_down(fractional.size(), dual::status_t::UNSET); std::vector ds_status_up(fractional.size(), dual::status_t::UNSET); @@ -712,7 +836,9 @@ void strong_branching(const lp_problem_t& original_lp, concurrent_halt.store(1); } - pdlp_thread.join(); + if (effective_batch_pdlp != 0) { +#pragma omp taskwait + } settings.log.printf("Strong branching took %.2fs\n", toc(dual_simplex_strong_branching_time)); @@ -1062,114 +1188,37 @@ i_t pseudo_costs_t::reliable_variable_selection( // Shared context for cooperative work-stealing (mode 1) // [0..num_candidates) = down, [num_candidates..2*num_candidates) = up shared_strong_branching_context_t shared_ctx(2 * num_candidates); - shared_strong_branching_context_view_t sb_view(std::span(shared_ctx.solved)); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); std::vector pdlp_obj_down(num_candidates, std::numeric_limits::quiet_NaN()); std::vector pdlp_obj_up(num_candidates, std::numeric_limits::quiet_NaN()); std::atomic concurrent_halt{0}; - std::thread pdlp_thread; if (use_pdlp) { - pdlp_thread = std::thread([&]() { - log.printf(rb_mode 
== 2 ? "RB batch PDLP only for %d candidates\n" - : "RB cooperative batch PDLP and DS for %d candidates\n", - num_candidates); - - f_t start_batch = tic(); - - std::vector original_soln_x; - - if (concurrent_halt.load() == 1) { return; } - - auto mps_model = - simplex_problem_to_mps_data_model(original_lp, new_slacks, solution, original_soln_x); - { - const i_t n_orig = original_lp.num_cols - new_slacks.size(); - for (i_t j = 0; j < n_orig; j++) { - mps_model.variable_lower_bounds_[j] = worker->leaf_problem.lower[j]; - mps_model.variable_upper_bounds_[j] = worker->leaf_problem.upper[j]; - } - } - - std::vector fraction_values; - fraction_values.reserve(num_candidates); - for (i_t j : candidate_vars) { - fraction_values.push_back(original_soln_x[j]); - } - - if (concurrent_halt.load() == 1) { return; } - - const f_t batch_elapsed_time = toc(start_time); - const f_t batch_remaining_time = - std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); - if (batch_remaining_time <= 0.0) { return; } - - pdlp_solver_settings_t pdlp_settings; - if (rb_mode == 1) { - pdlp_settings.concurrent_halt = &concurrent_halt; - pdlp_settings.shared_sb_view = sb_view; - } - pdlp_settings.time_limit = batch_remaining_time; - - if (pdlp_warm_cache.populated) { - auto& cache = pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_dual_solution(cache.initial_dual.data(), - cache.initial_dual.size(), - cache.batch_pdlp_handle.get_stream()); - pdlp_settings.set_initial_step_size(cache.step_size); - pdlp_settings.set_initial_primal_weight(cache.primal_weight); - pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); - } - - if (concurrent_halt.load() == 1) { return; } - - const auto solutions = batch_pdlp_solve(&pdlp_warm_cache.batch_pdlp_handle, - mps_model, - candidate_vars, - fraction_values, - pdlp_settings); - - f_t batch_pdlp_time = 
toc(start_batch); - - if (solutions.get_additional_termination_informations().size() != - static_cast(num_candidates) * 2) { - log.printf("RB batch PDLP failed and produced no solutions\n"); - return; - } - - i_t amount_done = 0; - for (i_t k = 0; k < num_candidates * 2; k++) { - if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { - amount_done++; - } - } - - log.printf("RB batch PDLP completed in %.2fs. Solved %d/%d\n", - batch_pdlp_time, - amount_done, - num_candidates * 2); - - for (i_t k = 0; k < num_candidates; k++) { - if (solutions.get_termination_status(k) == pdlp_termination_status_t::Optimal) { - pdlp_obj_down[k] = solutions.get_dual_objective_value(k); - } - if (solutions.get_termination_status(k + num_candidates) == - pdlp_termination_status_t::Optimal) { - pdlp_obj_up[k] = solutions.get_dual_objective_value(k + num_candidates); - } - } - }); +#pragma omp task default(shared) + batch_pdlp_reliability_branching_task(log, + rb_mode, + num_candidates, + start_time, + concurrent_halt, + original_lp, + new_slacks, + solution, + worker, + candidate_vars, + settings, + sb_view, + pdlp_warm_cache, + pdlp_obj_down, + pdlp_obj_up); } if (toc(start_time) > settings.time_limit) { log.printf("Time limit reached\n"); if (use_pdlp) { concurrent_halt.store(1); - pdlp_thread.join(); +#pragma omp taskwait } return branch_var; } @@ -1307,7 +1356,7 @@ i_t pseudo_costs_t::reliable_variable_selection( //} if (use_pdlp) { - pdlp_thread.join(); +#pragma omp taskwait i_t pdlp_applied = 0; i_t pdlp_optimal = 0; diff --git a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp index 60982d9344..a9e697ae58 100644 --- a/cpp/src/branch_and_bound/shared_strong_branching_context.hpp +++ b/cpp/src/branch_and_bound/shared_strong_branching_context.hpp @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include @@ -36,19 +37,22 @@ struct shared_strong_branching_context_view_t { bool 
is_solved(i_t local_idx) const { - assert(local_idx < solved.size() && "local_idx out of bounds"); + assert(local_idx >= 0 && static_cast(local_idx) < solved.size() && + "local_idx out of bounds"); return solved[local_idx].load() != 0; } void mark_solved(i_t local_idx) const { - assert(local_idx < solved.size() && "local_idx out of bounds"); + assert(local_idx >= 0 && static_cast(local_idx) < solved.size() && + "local_idx out of bounds"); solved[local_idx].store(1); } shared_strong_branching_context_view_t subview(i_t offset, i_t count) const { - assert(offset + count <= solved.size() && "subview out of bounds"); + assert(offset >= 0 && count >= 0 && static_cast(offset + count) <= solved.size() && + "subview out of bounds"); return {solved.subspan(offset, count)}; } }; diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 642c17758d..85cba335ba 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -777,7 +777,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) #endif // Sync external solved status into internal termination strategy before all_done() check - if (settings_.shared_sb_view.is_valid()) { + if (sb_view_.is_valid()) { for (size_t i = 0; i < climber_strategies_.size(); ++i) { // If PDLP has solved it to optimality we want to keep it and resolved both solvers having // solved the problem later @@ -785,7 +785,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) current_termination_strategy_.get_termination_status(i))) continue; const i_t local_idx = climber_strategies_[i].original_index; - if (settings_.shared_sb_view.is_solved(local_idx)) { + if (sb_view_.is_solved(local_idx)) { current_termination_strategy_.set_termination_status( i, pdlp_termination_status_t::ConcurrentLimit); #ifdef BATCH_VERBOSE_MODE @@ -844,9 +844,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) .get_additional_termination_informations()[climber_strategies_[i].original_index] .solved_by_pdlp = 
(current_termination_strategy_.get_termination_status(i) != pdlp_termination_status_t::ConcurrentLimit); - if (settings_.shared_sb_view.is_valid()) { - settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); - } + if (sb_view_.is_valid()) { sb_view_.mark_solved(climber_strategies_[i].original_index); } } current_termination_strategy_.fill_gpu_terms_stats(total_pdlp_iterations_); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -863,9 +861,9 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) std::move(batch_solution_to_return_.get_additional_termination_informations()), std::move(batch_solution_to_return_.get_terminations_status())}; } - if (settings_.shared_sb_view.is_valid()) { + if (sb_view_.is_valid()) { for (size_t i = 0; i < climber_strategies_.size(); ++i) { - settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); + sb_view_.mark_solved(climber_strategies_[i].original_index); } } RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view_)); @@ -923,9 +921,7 @@ pdlp_solver_t::check_batch_termination(const timer_t& timer) .get_additional_termination_informations()[climber_strategies_[i].original_index] .solved_by_pdlp = (current_termination_strategy_.get_termination_status(i) != pdlp_termination_status_t::ConcurrentLimit); - if (settings_.shared_sb_view.is_valid()) { - settings_.shared_sb_view.mark_solved(climber_strategies_[i].original_index); - } + if (sb_view_.is_valid()) { sb_view_.mark_solved(climber_strategies_[i].original_index); } } } if (to_remove.size() > 0) { diff --git a/cpp/src/pdlp/pdlp.cuh b/cpp/src/pdlp/pdlp.cuh index de0cf69c91..d03430f150 100644 --- a/cpp/src/pdlp/pdlp.cuh +++ b/cpp/src/pdlp/pdlp.cuh @@ -7,6 +7,7 @@ #pragma once +#include #include #include @@ -138,6 +139,8 @@ class pdlp_solver_t { rmm::cuda_stream_view stream_view_; // Intentionnaly take a copy to avoid an unintentional modification in the calling context const pdlp_solver_settings_t settings_; + 
dual_simplex::shared_strong_branching_context_view_t sb_view_{ + settings_.shared_sb_solved}; problem_t* problem_ptr; // Combined bounds in op_problem_scaled_ will only be scaled if diff --git a/cpp/src/pdlp/solve.cu b/cpp/src/pdlp/solve.cu index 267e149029..341edb2c1f 100644 --- a/cpp/src/pdlp/solve.cu +++ b/cpp/src/pdlp/solve.cu @@ -1045,8 +1045,8 @@ optimization_problem_solution_t run_batch_pdlp( batch_settings.new_bounds = std::vector>( original_new_bounds.begin() + i, original_new_bounds.begin() + i + current_batch_size); - if (settings.shared_sb_view.is_valid()) { - batch_settings.shared_sb_view = settings.shared_sb_view.subview(i, current_batch_size); + if (!settings.shared_sb_solved.empty()) { + batch_settings.shared_sb_solved = settings.shared_sb_solved.subspan(i, current_batch_size); } auto sol = solve_lp(problem, batch_settings); diff --git a/cpp/tests/linear_programming/pdlp_test.cu b/cpp/tests/linear_programming/pdlp_test.cu index ef43b1a591..5c6edad27b 100644 --- a/cpp/tests/linear_programming/pdlp_test.cu +++ b/cpp/tests/linear_programming/pdlp_test.cu @@ -5,6 +5,7 @@ */ /* clang-format on */ +#include #include #include #include @@ -46,8 +47,6 @@ #include #include -#include - namespace cuopt::linear_programming::test { constexpr double afiro_primal_objective = -464.0; @@ -2057,7 +2056,7 @@ TEST(pdlp_class, shared_sb_context_unit) constexpr int N = 10; shared_strong_branching_context_t ctx(N); - shared_strong_branching_context_view_t view(std::span(ctx.solved)); + shared_strong_branching_context_view_t view(ctx.solved); EXPECT_TRUE(view.is_valid()); @@ -2127,14 +2126,14 @@ TEST(pdlp_class, shared_sb_view_batch_pre_solved) std::ceil(root_soln_x[i]), op_problem.get_variable_upper_bounds()[fractional[i]]}); - shared_strong_branching_context_t ctx(batch_size); + shared_strong_branching_context_t shared_ctx(batch_size); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); // Pre-mark entries 1 and 4 as solved (simulating DS) - 
ctx.solved[1].store(1); - ctx.solved[4].store(1); + sb_view.mark_solved(1); + sb_view.mark_solved(4); - solver_settings.shared_sb_view = - shared_strong_branching_context_view_t(std::span(ctx.solved)); + solver_settings.shared_sb_solved = sb_view.solved; auto solution = solve_lp(&handle_, op_problem, solver_settings); @@ -2152,7 +2151,7 @@ TEST(pdlp_class, shared_sb_view_batch_pre_solved) // All entries should now be marked solved in the shared context for (int i = 0; i < batch_size; ++i) { - EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + EXPECT_TRUE(sb_view.is_solved(i)) << "Entry " << i << " should be solved"; } } @@ -2176,14 +2175,14 @@ TEST(pdlp_class, shared_sb_view_subbatch) solver_settings.presolver = cuopt::linear_programming::presolver_t::None; solver_settings.sub_batch_size = 2; - shared_strong_branching_context_t ctx(batch_size); + shared_strong_branching_context_t shared_ctx(batch_size); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); // Pre-mark one entry in each sub-batch of size 2: indices 1, 4 - ctx.solved[1].store(1); - ctx.solved[4].store(1); + sb_view.mark_solved(1); + sb_view.mark_solved(4); - solver_settings.shared_sb_view = - shared_strong_branching_context_view_t(std::span(ctx.solved)); + solver_settings.shared_sb_solved = sb_view.solved; auto solution = batch_pdlp_solve(&handle_, op_problem, fractional, root_soln_x, solver_settings); @@ -2202,7 +2201,7 @@ TEST(pdlp_class, shared_sb_view_subbatch) // All should be marked solved for (int i = 0; i < batch_size; ++i) { - EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + EXPECT_TRUE(sb_view.is_solved(i)) << "Entry " << i << " should be solved"; } } @@ -2234,10 +2233,10 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) std::ceil(root_soln_x[i]), op_problem.get_variable_upper_bounds()[fractional[i]]}); - shared_strong_branching_context_t ctx(batch_size); + shared_strong_branching_context_t 
shared_ctx(batch_size); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); - solver_settings.shared_sb_view = - shared_strong_branching_context_view_t(std::span(ctx.solved)); + solver_settings.shared_sb_solved = sb_view.solved; optimization_problem_solution_t* result_ptr = nullptr; @@ -2250,7 +2249,7 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) // Wait a bit then mark entries 0, 2, 4 as solved (simulating DS) std::this_thread::sleep_for(std::chrono::milliseconds(200)); for (int i = 0; i < n_fractional; ++i) - ctx.solved[i].store(1); + sb_view.mark_solved(i); pdlp_thread.join(); @@ -2271,7 +2270,7 @@ TEST(pdlp_class, shared_sb_view_concurrent_mark) // All entries should end up marked solved for (int i = 0; i < batch_size; ++i) { - EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + EXPECT_TRUE(sb_view.is_solved(i)) << "Entry " << i << " should be solved"; } delete result_ptr; @@ -2300,10 +2299,10 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) for (int i = 0; i < n_fractional; ++i) solver_settings.new_bounds.push_back({fractional[0], -5, -5}); - shared_strong_branching_context_t ctx(batch_size); + shared_strong_branching_context_t shared_ctx(batch_size); + shared_strong_branching_context_view_t sb_view(shared_ctx.solved); - solver_settings.shared_sb_view = - shared_strong_branching_context_view_t(std::span(ctx.solved)); + solver_settings.shared_sb_solved = sb_view.solved; optimization_problem_solution_t* result_ptr = nullptr; @@ -2316,7 +2315,7 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) // Wait a bit then mark entries 0, 2, 4 as solved (simulating DS) std::this_thread::sleep_for(std::chrono::milliseconds(200)); for (int i = 0; i < n_fractional; ++i) - ctx.solved[i].store(1); + sb_view.mark_solved(i); pdlp_thread.join(); @@ -2336,7 +2335,7 @@ TEST(pdlp_class, shared_sb_view_all_infeasible) // All entries should end up marked solved for (int i = 0; i < batch_size; ++i) { - 
EXPECT_TRUE(ctx.solved[i].load() != 0) << "Entry " << i << " should be solved"; + EXPECT_TRUE(sb_view.is_solved(i)) << "Entry " << i << " should be solved"; } delete result_ptr; From 843c53236b3b0a6f7ef2083a6eb961cb9fe133ac Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 18:25:20 +0200 Subject: [PATCH 40/43] two improvements: mark variables as solved in DS if node became reliable, use one stream per BPDLP in RB --- cpp/src/branch_and_bound/pseudo_costs.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 5bcd819ba5..9e5ff12bbd 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -664,6 +664,9 @@ static void batch_pdlp_reliability_branching_task( std::max(static_cast(0.0), settings.time_limit - batch_elapsed_time); if (batch_remaining_time <= 0.0) { return; } + // One handle per batch PDLP since there can be concurrent calls + const raft::handle_t batch_pdlp_handle; + pdlp_solver_settings_t pdlp_settings; if (rb_mode == 1) { pdlp_settings.concurrent_halt = &concurrent_halt; @@ -675,9 +678,9 @@ static void batch_pdlp_reliability_branching_task( auto& cache = pdlp_warm_cache; pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), cache.initial_primal.size(), - cache.batch_pdlp_handle.get_stream()); + batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_dual_solution( - cache.initial_dual.data(), cache.initial_dual.size(), cache.batch_pdlp_handle.get_stream()); + cache.initial_dual.data(), cache.initial_dual.size(), batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_step_size(cache.step_size); pdlp_settings.set_initial_primal_weight(cache.primal_weight); pdlp_settings.set_initial_pdlp_iteration(cache.pdlp_iteration); @@ -686,7 +689,7 @@ static void batch_pdlp_reliability_branching_task( if (concurrent_halt.load() == 1) { return; } const auto 
solutions = batch_pdlp_solve( - &pdlp_warm_cache.batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); + &batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); f_t batch_pdlp_time = toc(start_batch); @@ -1274,6 +1277,9 @@ i_t pseudo_costs_t::reliable_variable_selection( // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(i); } } + } else { + // Variable became reliable, make it as solved so that batch PDLP does not solve it again + if (rb_mode == 1) sb_view.mark_solved(i); } pseudo_cost_mutex_down[j].unlock(); } @@ -1317,6 +1323,9 @@ i_t pseudo_costs_t::reliable_variable_selection( // Should be valid if were are already here if (rb_mode == 1 && is_dual_simplex_done(status)) { sb_view.mark_solved(shared_idx); } } + } else { + // Variable became reliable, make it as solved so that batch PDLP does not solve it again + if (rb_mode == 1) sb_view.mark_solved(shared_idx); } pseudo_cost_mutex_up[j].unlock(); } From a9fd42095cfebc23dd48dd62b1ce69367d7b7c02 Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 18:52:52 +0200 Subject: [PATCH 41/43] fix: avoid early exit if solved at step 0 even when initial pdlp iteartion is given --- cpp/src/pdlp/pdlp.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/pdlp/pdlp.cu b/cpp/src/pdlp/pdlp.cu index 85cba335ba..33c080ee3c 100644 --- a/cpp/src/pdlp/pdlp.cu +++ b/cpp/src/pdlp/pdlp.cu @@ -995,7 +995,7 @@ std::optional> pdlp_solver_t // To avoid that we allow at least two iterations at first before checking (in practice 0 wasn't // enough) We still need to check iteration and time limit prior without breaking the logic below // of first checking termination before the limit - if (total_pdlp_iterations_ <= 1) { + if (internal_solver_iterations_ <= 1) { print_termination_criteria(timer); return check_limits(timer); } From edae2997fca04bc3690a4f3adcc93f9e5f795396 Mon Sep 17 00:00:00 2001 
From: Nicolas Blin Date: Mon, 30 Mar 2026 19:02:59 +0200 Subject: [PATCH 42/43] disable both by default --- cpp/include/cuopt/linear_programming/mip/solver_settings.hpp | 4 ++-- cpp/src/math_optimization/solver_settings.cu | 4 ++-- .../cuopt_server/utils/linear_programming/data_definition.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp index 77b18dc17e..3da9ea8f1f 100644 --- a/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp +++ b/cpp/include/cuopt/linear_programming/mip/solver_settings.hpp @@ -99,9 +99,9 @@ class mip_solver_settings_t { f_t cut_change_threshold = -1.0; f_t cut_min_orthogonality = 0.5; i_t mip_batch_pdlp_strong_branching{ - 1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + 0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only i_t mip_batch_pdlp_reliability_branching{ - 1}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only + 0}; // 0 = DS only, 1 = cooperative DS + PDLP, 2 = batch PDLP only i_t num_gpus = 1; bool log_to_console = true; diff --git a/cpp/src/math_optimization/solver_settings.cu b/cpp/src/math_optimization/solver_settings.cu index 52fc95a6bd..9d933f3c98 100644 --- a/cpp/src/math_optimization/solver_settings.cu +++ b/cpp/src/math_optimization/solver_settings.cu @@ -99,8 +99,8 @@ solver_settings_t::solver_settings_t() : pdlp_settings(), mip_settings {CUOPT_MIP_REDUCED_COST_STRENGTHENING, &mip_settings.reduced_cost_strengthening, -1, std::numeric_limits::max(), -1}, {CUOPT_NUM_GPUS, &pdlp_settings.num_gpus, 1, 2, 1}, {CUOPT_NUM_GPUS, &mip_settings.num_gpus, 1, 2, 1}, - {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, &mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 1}, - {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 1}, + {CUOPT_MIP_BATCH_PDLP_STRONG_BRANCHING, 
&mip_settings.mip_batch_pdlp_strong_branching, 0, 2, 0}, + {CUOPT_MIP_BATCH_PDLP_RELIABILITY_BRANCHING, &mip_settings.mip_batch_pdlp_reliability_branching, 0, 2, 0}, {CUOPT_PRESOLVE, reinterpret_cast(&pdlp_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_PRESOLVE, reinterpret_cast(&mip_settings.presolver), CUOPT_PRESOLVE_DEFAULT, CUOPT_PRESOLVE_PSLP, CUOPT_PRESOLVE_DEFAULT}, {CUOPT_MIP_DETERMINISM_MODE, &mip_settings.determinism_mode, CUOPT_MODE_OPPORTUNISTIC, CUOPT_MODE_DETERMINISTIC, CUOPT_MODE_OPPORTUNISTIC}, diff --git a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py index ddc38539f5..32cf860f28 100644 --- a/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py +++ b/python/cuopt_server/cuopt_server/utils/linear_programming/data_definition.py @@ -451,13 +451,13 @@ class SolverConfig(BaseModel): "heuristics and branch and bound for MILP", ) mip_batch_pdlp_strong_branching: Optional[int] = Field( - default=1, + default=0, description="Strong branching mode: 0 = Dual Simplex only, " "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", ) mip_batch_pdlp_reliability_branching: Optional[int] = Field( - default=1, + default=0, description="Reliability branching mode: 0 = Dual Simplex only, " "1 = cooperative work-stealing (DS + batch PDLP), " "2 = batch PDLP only.", From 6e5baa5acea722cdce3231ef46ff4b56f7ef1e0a Mon Sep 17 00:00:00 2001 From: Nicolas Blin Date: Mon, 30 Mar 2026 19:03:14 +0200 Subject: [PATCH 43/43] styl --- cpp/src/branch_and_bound/pseudo_costs.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cpp/src/branch_and_bound/pseudo_costs.cpp b/cpp/src/branch_and_bound/pseudo_costs.cpp index 9e5ff12bbd..4625b4343b 100644 --- a/cpp/src/branch_and_bound/pseudo_costs.cpp +++ b/cpp/src/branch_and_bound/pseudo_costs.cpp @@ -676,9 +676,8 
@@ static void batch_pdlp_reliability_branching_task( if (pdlp_warm_cache.populated) { auto& cache = pdlp_warm_cache; - pdlp_settings.set_initial_primal_solution(cache.initial_primal.data(), - cache.initial_primal.size(), - batch_pdlp_handle.get_stream()); + pdlp_settings.set_initial_primal_solution( + cache.initial_primal.data(), cache.initial_primal.size(), batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_dual_solution( cache.initial_dual.data(), cache.initial_dual.size(), batch_pdlp_handle.get_stream()); pdlp_settings.set_initial_step_size(cache.step_size); @@ -688,8 +687,8 @@ static void batch_pdlp_reliability_branching_task( if (concurrent_halt.load() == 1) { return; } - const auto solutions = batch_pdlp_solve( - &batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); + const auto solutions = + batch_pdlp_solve(&batch_pdlp_handle, mps_model, candidate_vars, fraction_values, pdlp_settings); f_t batch_pdlp_time = toc(start_batch);